### Import Required Libraries and Set Up Environment Variables

In [73]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')


### CME Data

In [74]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
# query_url = (
#     f"{base_url}api-key={NASA_API_KEY}&begin_date={startDate}&end_date={endDate}"
#     + f'&fq={filter_query}&sort={sort}&fl={field_list}'
# )
# https://api.nasa.gov/DONKI/CME?startDate=yyyy-MM-dd&endDate=yyyy-MM-dd&api_key=DEMO_KEY
cme_url = (
    f"{base_url}{CME}?startDate={startDate}&end_date={endDate}&api_key={NASA_API_KEY}"
)

In [75]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
response = requests.get(cme_url)
response

<Response [200]>

In [89]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = response.json()


In [90]:
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(cme_json[0], indent=4))

{
    "activityID": "2013-05-01T03:12:00-CME-001",
    "catalog": "M2M_CATALOG",
    "startTime": "2013-05-01T03:12Z",
    "instruments": [
        {
            "displayName": "SOHO: LASCO/C2"
        },
        {
            "displayName": "SOHO: LASCO/C3"
        },
        {
            "displayName": "STEREO A: SECCHI/COR2"
        },
        {
            "displayName": "STEREO B: SECCHI/COR2"
        }
    ],
    "sourceLocation": "",
    "activeRegionNum": null,
    "note": "",
    "submissionTime": "2013-08-07T16:54Z",
    "versionId": 1,
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/2349/-1",
    "cmeAnalyses": [
        {
            "isMostAccurate": true,
            "time21_5": "2013-05-01T07:07Z",
            "latitude": 12.0,
            "longitude": -120.0,
            "halfAngle": 36.0,
            "speed": 860.0,
            "type": "C",
            "featureCode": "null",
            "imageType": null,
            "measurementTechnique": "null",
   

In [91]:
# Convert cme_json to a Pandas DataFrame 
cme_df = pd.DataFrame(cme_json)
# Keep only the columns: activityID, startTime, linkedEvents
#cmd_df = cme_df[["activityID", "startTime", "linkedEvents"]]
cme_df = cme_df.loc[:,["activityID", "startTime", "linkedEvents"]]
cme_df.head()

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
1,2013-05-02T05:24:00-CME-001,2013-05-02T05:24Z,
2,2013-05-02T14:36:00-CME-001,2013-05-02T14:36Z,
3,2013-05-03T18:00:00-CME-001,2013-05-03T18:00Z,
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}]


In [92]:
cme_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6431 entries, 0 to 6430
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   activityID    6431 non-null   object
 1   startTime     6431 non-null   object
 2   linkedEvents  1231 non-null   object
dtypes: object(3)
memory usage: 150.9+ KB


In [93]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme = cme_df.dropna()
cme.head()

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}]
7,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,[{'activityID': '2013-05-12T23:30:00-IPS-001'}]
10,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,[{'activityID': '2013-05-13T01:53:00-FLR-001'}...
13,2013-05-13T16:18:00-CME-001,2013-05-13T16:18Z,[{'activityID': '2013-05-13T15:40:00-FLR-001'}...


In [94]:
cme.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1231 entries, 0 to 6414
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   activityID    1231 non-null   object
 1   startTime     1231 non-null   object
 2   linkedEvents  1231 non-null   object
dtypes: object(3)
memory usage: 38.5+ KB


In [95]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 

# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over each index in the DataFrame
for i in cme.index:
    # Get the corresponding value from row i in 'activityID'
    activityID = cme['activityID'][i]
    # Get the corresponding value from row i in 'startTime'     
    startTime = cme['startTime'][i]
    # Get the list of dictionaries from row i in 'linkedEvents'
    linkedEvents =cme['linkedEvents'][i]
    
    # Iterate over each dictionary in the list
    for item in linkedEvents:
    # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
        expanded_rows.append({'activityID': activityID, 'startTime': startTime, 'linkedEvents': linkedEvents})



# Create a new DataFrame from the expanded rows
cme_expanded_df = pd.DataFrame(expanded_rows)
cme_expanded_df.head()

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
1,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}]
2,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,[{'activityID': '2013-05-12T23:30:00-IPS-001'}]
3,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,[{'activityID': '2013-05-13T01:53:00-FLR-001'}...
4,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,[{'activityID': '2013-05-13T01:53:00-FLR-001'}...


In [96]:
cme_expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   activityID    2139 non-null   object
 1   startTime     2139 non-null   object
 2   linkedEvents  2139 non-null   object
dtypes: object(3)
memory usage: 50.3+ KB


In [97]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
        # Log the error or print it for debugging
def extract_activityID_from_dict(input_dict):
    try:
        # print(input_dict)
        # print(input_dict[0].values())
        #input_dict[0].values()
        woot = input_dict[0].values()
        return woot
    except (ValueError, TypeError) as e:
        print(f"Error with dictionary: {input_dict}")
        return 

extract_activityID_from_dict(cme.loc[0,'linkedEvents'])


dict_values(['2013-05-04T04:52:00-IPS-001'])

In [98]:
cme_expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   activityID    2139 non-null   object
 1   startTime     2139 non-null   object
 2   linkedEvents  2139 non-null   object
dtypes: object(3)
memory usage: 50.3+ KB


In [99]:
# cme["linkedEvents"][0]
# cme.loc[0,'linkedEvents']
# extract_activityID_from_dict(cme.loc[0,'linkedEvents'])
# extract_activityID_from_dict(cme['linkedEvents'][0])

In [100]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
# cme_expanded_df["GST_ActivityID"] =  cme_expanded_df.apply(lambda x: extract_activityID_from_dict(x['linkedEvents']), axis =1)
cme_expanded_df.loc[:, 'GST_ActivityID'] = cme_expanded_df["linkedEvents"].apply(lambda x:extract_activityID_from_dict(x))

In [107]:
cme_expanded_df.head()

Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}],(2013-05-04T04:52:00-IPS-001)
1,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}],(2013-05-07T04:37:00-IPS-001)
2,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,[{'activityID': '2013-05-12T23:30:00-IPS-001'}],(2013-05-12T23:30:00-IPS-001)
3,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,[{'activityID': '2013-05-13T01:53:00-FLR-001'}...,(2013-05-13T01:53:00-FLR-001)
4,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,[{'activityID': '2013-05-13T01:53:00-FLR-001'}...,(2013-05-13T01:53:00-FLR-001)


In [108]:
# print out the datatype of each column in this DataFrame:
cme_expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   activityID      2139 non-null   object
 1   startTime       2139 non-null   object
 2   linkedEvents    2139 non-null   object
 3   GST_ActivityID  2139 non-null   object
dtypes: object(4)
memory usage: 67.0+ KB


In [109]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
cme_expanded_df = cme_expanded_df.dropna(subset=["GST_ActivityID"])

In [110]:
# print out the datatype of each column in this DataFrame:
cme_expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   activityID      2139 non-null   object
 1   startTime       2139 non-null   object
 2   linkedEvents    2139 non-null   object
 3   GST_ActivityID  2139 non-null   object
dtypes: object(4)
memory usage: 67.0+ KB


In [116]:
cme_expanded_df["startTime"]

0       2013-05-01T03:12Z
1       2013-05-03T22:36Z
2       2013-05-09T19:29Z
3       2013-05-13T02:54Z
4       2013-05-13T02:54Z
              ...        
2134    2024-11-25T21:24Z
2135    2024-11-25T21:24Z
2136    2024-11-25T21:24Z
2137    2024-11-27T02:48Z
2138    2024-11-27T19:24Z
Name: startTime, Length: 2139, dtype: object

In [114]:
cme_expanded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   activityID      2139 non-null   object
 1   startTime       2139 non-null   object
 2   linkedEvents    2139 non-null   object
 3   GST_ActivityID  2139 non-null   string
dtypes: object(3), string(1)
memory usage: 67.0+ KB


In [117]:
# Convert the 'GST_ActivityID' column to string format 
cme_expanded_df["GST_ActivityID"] = cme_expanded_df["GST_ActivityID"].astype("string")
# Convert startTime to datetime format  
cme_expanded_df['startTime'] = pd.to_datetime(cme_expanded_df['startTime'])
# Rename startTime to startTime_CME and activityID to cmeID
cme_expanded_df = cme_expanded_df.rename(columns={'startTime':'startTime_cme','activityID':'cmeID'})
# Drop linkedEvents
cme_expanded_df = cme_expanded_df.drop(columns=['linkedEvents'])
# Verify that all steps were executed correctly
display(cme_expanded_df.head())
display(cme_expanded_df.info())

Unnamed: 0,cmeID,startTime_cme,GST_ActivityID
0,2013-05-01T03:12:00-CME-001,2013-05-01 03:12:00+00:00,dict_values(['2013-05-04T04:52:00-IPS-001'])
1,2013-05-03T22:36:00-CME-001,2013-05-03 22:36:00+00:00,dict_values(['2013-05-07T04:37:00-IPS-001'])
2,2013-05-09T19:29:00-CME-001,2013-05-09 19:29:00+00:00,dict_values(['2013-05-12T23:30:00-IPS-001'])
3,2013-05-13T02:54:00-CME-001,2013-05-13 02:54:00+00:00,dict_values(['2013-05-13T01:53:00-FLR-001'])
4,2013-05-13T02:54:00-CME-001,2013-05-13 02:54:00+00:00,dict_values(['2013-05-13T01:53:00-FLR-001'])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   cmeID           2139 non-null   object             
 1   startTime_cme   2139 non-null   datetime64[ns, UTC]
 2   GST_ActivityID  2139 non-null   string             
dtypes: datetime64[ns, UTC](1), object(1), string(1)
memory usage: 50.3+ KB


None

In [118]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  
cme_filtered_df=cme_expanded_df[cme_expanded_df['GST_ActivityID'].str.contains('GST', na=False)]

In [119]:
cme_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, 21 to 1850
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   cmeID           14 non-null     object             
 1   startTime_cme   14 non-null     datetime64[ns, UTC]
 2   GST_ActivityID  14 non-null     string             
dtypes: datetime64[ns, UTC](1), object(1), string(1)
memory usage: 448.0+ bytes


### GST Data

In [23]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
gst_url = (
    f"{base_url}GST?startDate={startDate}&end_date={endDate}&api_key={NASA_API_KEY}"
)

In [24]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(gst_url)
gst_response

<Response [200]>

In [25]:
# Convert the response variable to json and store it as a variable named gst_json

# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data


In [26]:
# Convert gst_json to a Pandas DataFrame  

# Keep only the columns: activityID, startTime, linkedEvents


In [27]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME


In [28]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.


In [29]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:


In [30]:
# Convert the 'CME_ActivityID' column to string format 

# Convert the 'gstID' column to string format 

# Convert startTime to datetime format  

# Rename startTime to startTime_GST 

# Drop linkedEvents

# Verify that all steps were executed correctly


In [31]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  


### Merge both datatsets

In [32]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.


In [33]:
# Verify that the new DataFrame has the same number of rows as cme and gst


### Computing the time it takes for a CME to cause a GST

In [34]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.


In [35]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


### Exporting data in csv format

In [36]:
# Export data to CSV without the index
