In [14]:
import json
import requests 
import urllib
from pathlib import Path
from datetime import datetime
import pandas as pd


Here we import all the necessary tools so that the necessary dependancies are globally declared

In [15]:
def collect_data(start_time,end_time,min_magnitude=3):
    #Here we write down the URL of the chosen API and get the format ready for data collection
    api_url = "https://earthquake.usgs.gov/fdsnws/event/1/query"

    #Below are the chosen parameters which will be used for the visualisation of data, cleaning will be done later
    params = {
    "format": "geojson",
    "starttime": start_time,
    "endtime": end_time,
    "minmagnitude": min_magnitude,

    }
    #Attempt to pull from the API using the chosen paramteres
    api_data = requests.get(api_url, params=params)
    #Create an empty dataframe
    earthquake_data=pd.DataFrame()   
    # Check if the request was successful and if so, begin collection
    if api_data.status_code == 200:
        data = api_data.json()
        # Extract earthquake features
        features = data['features']

        
        
        # Parse the data and store in a DataFrame
        earthquake_list = []
        for feature in features:
            properties = feature['properties']
            geometry = feature['geometry']
            #Rather than making a list and continuously apppending to it then converting to a dataframe, we concat the current
            #dataframe we have with a newly created one (one created by iterating through all features we want)
            earthquake_data=pd.concat([earthquake_data, pd.DataFrame([{
                "eventid": feature['id'],
                "time": properties['time'],
                "latitude": geometry['coordinates'][1],
                "longitude": geometry['coordinates'][0],
                "location": properties['place'],
                "depth": geometry['coordinates'][2],
                "magnitude": properties['mag'],
                
            }])]) 

        return earthquake_data

    else:
        print("Error, there is an issue with data retrieval, error code, ", api_data.status_code)
        return earthquake_data


In the above function, we define the paramters which will be use to call the AP. In this case we use the minimum magnitude, start and end time (These paramters determine the range in which we pull the data from), and the format which is used. Afterwards, we attempt to pull data from the API and if successful, we extract the required features from it and then save it to a Database, later to be converted to a CSV file. here we constantly append to the database rather than make a list and then append to it later.

In [18]:
full_earthquake_data=pd.DataFrame()

#Here we loop through the last 24 years, one by one in two 6-month blocks, to prevent any errors when retrieving from too large of data
for i in range(2000,2024):
    start=str(i)+("-01-01")
    end=str(i)+("-6-30")
    partial_earthquake_data=collect_data(start,end)
    full_earthquake_data = pd.concat([full_earthquake_data,partial_earthquake_data])
    start=str(i)+("-07-01")
    end=str(i)+("-12-30")
    partial_earthquake_data=collect_data(start,end)
    full_earthquake_data = pd.concat([full_earthquake_data,partial_earthquake_data])

#As the API collects data till now, we want to add any data collected till now
partial_earthquake_data=collect_data("2024-01-01",datetime.today().strftime('%Y-%m-%d'))
full_earthquake_data = pd.concat([full_earthquake_data,partial_earthquake_data])

# Save the dataset to a CSV file
csv_file = "raw_earthquake_data.csv"
full_earthquake_data.to_csv(csv_file, index=False)

# Output message
print(f"Data collection complete. Dataset saved as ", csv_file, "to path ", Path.cwd())

full_earthquake_data.head(10)


Data collection complete. Dataset saved as  raw_earthquake_data.csv to path  d:\University_Work\Data Science


Unnamed: 0,eventid,time,latitude,longitude,location,depth,magnitude
0,usp0009vf6,962320972290,29.796,-42.88,northern Mid-Atlantic Ridge,10.0,4.5
0,usp0009vf5,962320920420,6.382,-72.804,"4 km NNE of Onzaga, Colombia",33.0,4.3
0,usp0009vf4,962320856800,-30.232,-71.53,"35 km SSW of Coquimbo, Chile",51.0,4.3
0,usp0009vf3,962312599430,-55.758,-28.47,South Sandwich Islands region,33.0,4.9
0,usp0009vf0,962310160220,7.617,-37.095,central Mid-Atlantic Ridge,10.0,4.6
0,uw10524793,962306846760,48.466833,-123.104167,"9 km SW of Friday Harbor, Washington",27.204,3.5
0,usp0009vex,962304843940,33.997,139.35,"84 km SSE of Shimoda, Japan",10.0,4.4
0,usp0009vew,962300683320,36.346,138.669,"20 km W of Annaka, Japan",165.8,4.4
0,usp0009veu,962300277980,13.523,144.04,"68 km WNW of Agat Village, Guam",33.0,4.5
0,usp0009vet,962299947700,33.92,139.46,"96 km SSE of Shimoda, Japan",10.0,4.1


Over here, we create the data frame which will be used to store the data we pulled from the API

To prevent any errors when collecting data fom the API, we collect the data in two half-year blocks per year in a single loop and filter using a pre-set minimum magnitude of 3. Since an error will occur if we try to pull too much data at once, this is done to circumvent this and get a lot of data to have more accurate graphs for the second half of the assignment. Hoewver as data is still being added to it til lthis day, we also pull data from the API till this day to make sure we don't miss any data