### Import Required Libraries and Set Up Environment Variables

In [76]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

### CME Data

In [97]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"


# Build URL for CME
url = f"{base_url}{CME}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"


In [98]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(url)
cme_response

<Response [200]>

In [99]:
# Convert the response variable to json and store it as a variable named cme_json
if cme_response.status_code == 200:
    cme_json = cme_response.json()
else: 
    print("Error: ", cme_response.status_code)

In [100]:
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(cme_json[0], indent =4))

{
    "activityID": "2013-05-01T03:12:00-CME-001",
    "catalog": "M2M_CATALOG",
    "startTime": "2013-05-01T03:12Z",
    "instruments": [
        {
            "displayName": "SOHO: LASCO/C2"
        },
        {
            "displayName": "SOHO: LASCO/C3"
        },
        {
            "displayName": "STEREO A: SECCHI/COR2"
        },
        {
            "displayName": "STEREO B: SECCHI/COR2"
        }
    ],
    "sourceLocation": "",
    "activeRegionNum": null,
    "note": "",
    "submissionTime": "2013-08-07T16:54Z",
    "versionId": 1,
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/2349/-1",
    "cmeAnalyses": [
        {
            "isMostAccurate": true,
            "time21_5": "2013-05-01T07:07Z",
            "latitude": 12.0,
            "longitude": -120.0,
            "halfAngle": 36.0,
            "speed": 860.0,
            "type": "C",
            "featureCode": "null",
            "imageType": null,
            "measurementTechnique": "null",
   

In [101]:
# Convert cme_json to a Pandas DataFrame 
cme_df = pd.DataFrame(cme_json)
# Keep only the columns: activityID, startTime, linkedEvents
cme_df =cme_df[['activityID',  'startTime', 'linkedEvents']]
cme_df


Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
1,2013-05-02T05:24:00-CME-001,2013-05-02T05:24Z,
2,2013-05-02T14:36:00-CME-001,2013-05-02T14:36Z,
3,2013-05-03T18:00:00-CME-001,2013-05-03T18:00Z,
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}]
...,...,...,...
5519,2024-05-01T06:36:00-CME-001,2024-05-01T06:36Z,
5520,2024-05-01T11:36:00-CME-001,2024-05-01T11:36Z,
5521,2024-05-01T12:36:00-CME-001,2024-05-01T12:36Z,
5522,2024-05-01T17:36:00-CME-001,2024-05-01T17:36Z,


In [102]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme_df = cme_df.dropna(subset=['linkedEvents'])

In [103]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 
# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over each index in the DataFrame
for i in cme_df.index:
    activityID = cme_df.loc[i, 'activityID']
    startTime = cme_df.loc[i, 'startTime']
    linkedEvents = cme_df.loc[i, 'linkedEvents']

# Iterate over each dictionary in the list
    for item in linkedEvents: 
# Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
        expanded_rows.append({'activityID': activityID, 'startTime': startTime,\
                               'linkedEvents': linkedEvents})
# Create a new DataFrame from the expanded rows
expanded_df =pd.DataFrame(expanded_rows)

In [104]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
def extract_activityID_from_dict(input_dict):
    try:
        return input_dict['activityID'] 
    except (ValueError, TypeError) as e:
        # Log the error or print it for debugging
        return None



In [105]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
cme_df['GST_ActivityID'] = cme_df['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cme_df['GST_ActivityID'] = cme_df['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))


In [106]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
cme_df = cme_df.dropna(subset=['GST_ActivityID'])

In [107]:
# print out the datatype of each column in this DataFrame:
cme_df.dtypes

activityID        object
startTime         object
linkedEvents      object
GST_ActivityID    object
dtype: object

In [108]:
# Convert the 'GST_ActivityID' column to string format 
cme_df['GST_ActivityID'] = cme_df['GST_ActivityID'].astype(str)
# Convert startTime to datetime format  
cme_df['startTime'] = pd.to_datetime(cme_df['startTime'])
# Rename startTime to startTime_CME and activityID to cmeID
cme_df = cme_df.rename(columns={'startTime': 'startTime_CME', \
                                'activityID': 'cmeID'})
# Drop linkedEvents
cme_df = cme_df.drop('linkedEvents', axis=1)
# Verify that all steps were executed correctly


In [109]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  
cme_df = cme_df[cme_df['GST_ActivityID'].str.contains('GST')]


### GST Data

In [110]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate = "2024-05-01"

# Build URL for GST
gst_url = f"{base_url}{GST}?start_date={startDate}&end_date={endDate}&api_key={NASA_API_KEY}"


In [111]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(gst_url)
gst_response


<Response [200]>

In [112]:
# Convert the response variable to json and store it as a variable named gst_json
gst_json = gst_response.json()
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(gst_json, indent=4))


[
    {
        "gstID": "2024-08-28T00:00:00-GST-001",
        "startTime": "2024-08-28T00:00Z",
        "allKpIndex": [
            {
                "observedTime": "2024-08-28T03:00Z",
                "kpIndex": 5.67,
                "source": "NOAA"
            }
        ],
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/33032/-1",
        "linkedEvents": [
            {
                "activityID": "2024-08-23T02:00:00-CME-001"
            },
            {
                "activityID": "2024-08-27T07:33:00-IPS-001"
            }
        ],
        "submissionTime": "2024-08-28T03:35Z",
        "versionId": 1
    },
    {
        "gstID": "2024-09-12T09:00:00-GST-001",
        "startTime": "2024-09-12T09:00Z",
        "allKpIndex": [
            {
                "observedTime": "2024-09-12T12:00Z",
                "kpIndex": 5.67,
                "source": "NOAA"
            },
            {
                "observedTime": "2024-09-12T15:00Z",
               

In [113]:
# Convert gst_json to a Pandas DataFrame  
gst_df = pd.DataFrame(gst_json)
# Keep only the columns: gstID, startTime, linkedEvents
gst_df = gst_df[['gstID', 'startTime', 'linkedEvents']]



In [114]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME
gst_df = gst_df.dropna(subset=["linkedEvents"])

In [115]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.
gst_df = gst_df.explode('linkedEvents')


In [116]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:
gst_df["CME_ActivityID"] = gst_df['linkedEvents'].apply(lambda x:\
        extract_activityID_from_dict(x))
# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:
gst_df = gst_df.dropna(subset=['CME_ActivityID'])

In [117]:
# Convert the 'CME_ActivityID' column to string format 
gst_df['CME_ActivityID'] = gst_df['CME_ActivityID'].astype(str)
# Convert the 'gstID' column to string format 
gst_df['gstID'] = gst_df['gstID'].astype(str)
# Convert startTime to datetime format  
gst_df['startTime'] = pd.to_datetime(gst_df['startTime'])
# Rename startTime to startTime_GST 
gst_df = gst_df.rename(columns={'startTime': 'startTime_GST'})
# Drop linkedEvents
gst_df = gst_df.drop(columns=['linkedEvents'])
# Verify that all steps were executed correctly
gst_df.dtypes

gstID                          object
startTime_GST     datetime64[ns, UTC]
CME_ActivityID                 object
dtype: object

In [118]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  
gst_df = gst_df[gst_df['CME_ActivityID'].str.contains('CME')]

### Merge both datatsets

In [120]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.
merged_gst_cme = pd.merge(gst_df, cme_df, left_on=['gstID', 'CME_ActivityID'], right_on=['cmeID', 'GST_ActivityID'], how='inner')

In [121]:
# Verify that the new DataFrame has the same number of rows as cme and gst
print('Shape of cme_df:', cme_df.shape)
print('Shape of gst_df:', gst_df.shape)
print("Shape of merged_gst_cme:", merged_gst_cme.shape)

Shape of cme_df: (0, 3)
Shape of gst_df: (3, 3)
Shape of merged_gst_cme: (0, 6)


### Computing the time it takes for a CME to cause a GST

In [124]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.
merged_gst_cme['startTime_CME'] = merged_gst_cme['startTime_CME']\
    .dt.tz_localize(None)
merged_gst_cme['startTime_GST'] = merged_gst_cme['startTime_GST']\
    .dt.tz_localize(None)
merged_gst_cme['timeDiff'] = (merged_gst_cme['startTime_CME']\
     - merged_gst_cme['startTime_GST']).apply(lambda x: str(x))

In [125]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 
print(merged_gst_cme['timeDiff'].describe())

count      0
mean     NaT
std      NaT
min      NaT
25%      NaT
50%      NaT
75%      NaT
max      NaT
Name: timeDiff, dtype: object


### Exporting data in csv format

In [126]:
# Export data to CSV without the index
merged_gst_cme.to_csv('output.csv', index=False)