In [1]:
import requests
import pandas as pd

In [2]:
def fetch_json(url):
    """Fetch JSON data from NOAA and convert it to a DataFrame.

    Two payload layouts are accepted:

    * a list of rows whose first row holds the column names, and
    * a list of records (one dict per observation), where every element is
      data and the dict keys become the columns.
      NOTE(review): the `/json/...` endpoint used for `sun_url` looks like it
      returns this layout -- confirm against the live service.

    Parameters
    ----------
    url : str
        Address of the JSON document to download.

    Returns
    -------
    pandas.DataFrame
        One row per observation; values are kept exactly as delivered.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the payload is empty.
    """
    response = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of parsing an error body as data.
    response.raise_for_status()
    data = response.json()

    if not data:
        raise ValueError(f'No data returned from {url}')

    if isinstance(data[0], dict):
        # Record layout: there is no header row, so keep every element.
        return pd.DataFrame(data)

    # Header-row layout: first row names the columns, the rest are values.
    return pd.DataFrame(data[1:], columns=data[0])

# URLs to fetch data from
plasma_url = 'https://services.swpc.noaa.gov/products/solar-wind/plasma-7-day.json'
mag_url = 'https://services.swpc.noaa.gov/products/solar-wind/mag-7-day.json'
sun_url = "https://services.swpc.noaa.gov/json/solar-cycle/swpc_observed_ssn.json"
kp_ind_url = 'https://services.swpc.noaa.gov/products/noaa-planetary-k-index.json'
dst_url = 'https://services.swpc.noaa.gov/products/kyoto-dst.json'

# Single location for the raw CSV cache. Writing and the offline fallback must
# use the same paths, otherwise the fallback can never find what was saved.
raw_dir = '../data/raw'

# Fetch data
try:
    plasma_df = fetch_json(plasma_url)
    mag_df = fetch_json(mag_url)
    sun_df = fetch_json(sun_url)
    kp_df = fetch_json(kp_ind_url)
    dst_df = fetch_json(dst_url)

except Exception as e:
    print(f'An error occured when retrieving the data: {e}')
    print('Using local cached files')

    try:
        plasma_df = pd.read_csv(f'{raw_dir}/plasma.csv')
        mag_df = pd.read_csv(f'{raw_dir}/mag.csv')
        sun_df = pd.read_csv(f'{raw_dir}/sun.csv')
        kp_df = pd.read_csv(f'{raw_dir}/kp_ind.csv')
        dst_df = pd.read_csv(f'{raw_dir}/dst_ind.csv')

    except FileNotFoundError:
        print('Local files not found')
        # Neither the download nor the cache produced data. Stop here so that
        # later cells do not run on undefined (or stale, on a reused kernel)
        # DataFrames.
        raise

    except Exception as e:
        print(f'An unexpected error occured when retrieving local files: {e}')
        raise

else:
    # Refresh the cache only after all five downloads succeeded. A failed
    # write is reported but must not discard the freshly downloaded data.
    try:
        plasma_df.to_csv(f'{raw_dir}/plasma.csv', index=False)
        mag_df.to_csv(f'{raw_dir}/mag.csv', index=False)
        sun_df.to_csv(f'{raw_dir}/sun.csv', index=False)
        kp_df.to_csv(f'{raw_dir}/kp_ind.csv', index=False)
        dst_df.to_csv(f'{raw_dir}/dst_ind.csv', index=False)
    except OSError as e:
        print(f'Could not update the local cache: {e}')
    

In [3]:
# Row counts of the magnetometer and plasma frames, one per line.
print(len(mag_df), len(plasma_df), sep='\n')

9516
9376


# Drop unimportant columns

In [4]:
# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
mag_df = mag_df.drop(columns=['lon_gsm', 'lat_gsm'], errors='ignore')

# Join mag and plasma dataframes

In [5]:
# Parse the timestamp column of both frames into datetimes.
mag_df['time_tag'] = mag_df['time_tag'].pipe(pd.to_datetime)
plasma_df['time_tag'] = plasma_df['time_tag'].pipe(pd.to_datetime)

In [6]:
# Index both frames by timestamp. Rebinding (rather than inplace=True) keeps
# the previously referenced objects untouched and allows method chaining.
mag_df = mag_df.set_index('time_tag')
plasma_df = plasma_df.set_index('time_tag')

In [7]:
# Span from the earliest to the latest timestamp found in either frame,
# expressed as a regular one-minute grid.
start_time = min(frame.index.min() for frame in (mag_df, plasma_df))
end_time = max(frame.index.max() for frame in (mag_df, plasma_df))

full_range = pd.date_range(start_time, end_time, freq='min')

In [8]:
# Put both frames on the shared one-minute grid; minutes without an
# observation become rows of NaN.
mag_df, plasma_df = (frame.reindex(full_range) for frame in (mag_df, plasma_df))

In [9]:
# Combine plasma and magnetometer columns side by side on the shared index.
solar_df = plasma_df.join(other=mag_df, how='inner')

# Fill missing values

In [10]:
# Missing values per column, shown as a fraction of all rows.
nans = solar_df.isna().sum()
nans.div(len(solar_df))

density        0.069841
speed          0.073611
temperature    0.073710
bx_gsm         0.055952
by_gsm         0.055952
bz_gsm         0.055952
bt             0.055952
dtype: float64

In [11]:
# Cast every column to 64-bit floats so the numeric steps below can run.
solar_df = solar_df.astype(float)

In [12]:
# Interpolate linearly along the time axis, then forward- and backward-fill
# whatever is still missing.
solar_df = (
    solar_df
    .interpolate(method='linear', axis=0)
    .ffill()
    .bfill()
)

# Feature Engineering

In [13]:
proton_mass = 1.6726e-27

# pressure = proton_mass * density * speed^2, with the scale factors applied in
# the same order as before so the floating-point results are unchanged.
solar_df['pressure'] = (
    solar_df["density"]
    .mul(proton_mass)
    .mul(1e6)
    .mul(solar_df["speed"].pow(2))
    .mul(1e6)
    .mul(1e9)  # to convert to nano pascals
)

In [14]:
# Display the combined frame, now including the derived `pressure` column.
solar_df

Unnamed: 0,density,speed,temperature,bx_gsm,by_gsm,bz_gsm,bt,pressure
2025-11-21 15:50:00,5.05,422.1,68334.0,-5.06,-8.81,0.33,10.17,1.504923
2025-11-21 15:51:00,5.05,422.1,68334.0,-4.57,-9.02,-0.06,10.11,1.504923
2025-11-21 15:52:00,5.01,417.9,73528.0,-3.54,-9.30,1.55,10.07,1.463439
2025-11-21 15:53:00,4.64,416.7,75158.0,-3.91,-9.31,0.54,10.11,1.347588
2025-11-21 15:54:00,4.66,417.2,71166.0,-4.10,-9.32,0.23,10.19,1.356646
...,...,...,...,...,...,...,...,...
2025-11-28 15:45:00,3.32,657.6,209614.0,-2.90,3.66,1.18,4.81,2.401341
2025-11-28 15:46:00,3.70,665.6,186488.0,-4.74,2.57,-0.28,5.40,2.741703
2025-11-28 15:47:00,4.01,656.1,270875.0,-4.34,-2.37,-0.62,4.99,2.887198
2025-11-28 15:48:00,2.80,597.0,101547.0,-4.34,-2.37,-0.62,4.99,1.669163


In [15]:
# Hourly mean and standard deviation of every column.
solar_df_agg = solar_df.resample('h').agg(['mean', 'std'])

# Flatten the (column, statistic) pairs into names such as "speed_mean".
solar_df_agg.columns = [
    "_".join(levels) for levels in solar_df_agg.columns.to_flat_index()
]
solar_df_agg

Unnamed: 0,density_mean,density_std,speed_mean,speed_std,temperature_mean,temperature_std,bx_gsm_mean,bx_gsm_std,by_gsm_mean,by_gsm_std,bz_gsm_mean,bz_gsm_std,bt_mean,bt_std,pressure_mean,pressure_std
2025-11-21 15:00:00,4.919200,0.167715,418.895000,2.835338,69029.800000,3195.036576,-4.171000,0.678027,-9.006500,0.463286,-1.082500,1.852346,10.175000,0.095481,1.443911,0.055240
2025-11-21 16:00:00,5.177633,3.816786,430.175000,14.005253,62690.350000,12878.201176,-6.032500,0.607977,-7.833417,0.445399,-3.039333,0.576047,10.385083,0.193943,1.559200,0.815492
2025-11-21 17:00:00,4.765750,0.713614,418.753333,12.825481,76768.450000,15794.925060,-3.850000,1.962532,-8.670667,0.702487,-1.356167,1.981498,9.996333,0.235523,1.401170,0.237597
2025-11-21 18:00:00,4.392250,0.893244,435.688333,11.829639,81083.883333,31168.995200,-5.354667,0.599133,-7.177000,1.648516,-3.635500,1.882083,9.990167,0.333251,1.399945,0.310714
2025-11-21 19:00:00,2.621000,1.321339,449.850000,15.816335,210436.766667,74066.413582,-2.528667,1.815926,-6.049000,0.854640,2.108500,3.405715,7.892000,0.715354,0.893863,0.464391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-11-28 11:00:00,1.970000,1.106302,637.623333,17.813957,240125.983333,63293.439585,-3.645500,1.573849,0.597333,2.501482,-2.053667,2.357401,5.613000,0.650919,1.355158,0.810446
2025-11-28 12:00:00,1.475333,0.182222,645.229167,15.346187,256282.975000,36503.340745,-3.119167,1.513665,-4.363833,1.373628,-0.609167,1.919540,6.068167,0.247739,1.029778,0.148742
2025-11-28 13:00:00,2.298500,1.157601,637.876667,12.350436,216915.450000,48656.662301,-3.387833,1.419622,0.621167,3.046175,-2.035833,2.584273,5.791667,0.367263,1.561054,0.782200
2025-11-28 14:00:00,1.797833,0.644179,636.615833,19.689061,227620.950000,45028.026396,-3.224833,2.028967,0.602000,2.724526,-3.147000,1.828918,5.927667,0.419206,1.225389,0.456562
