# **Data Retrival**

In [54]:
#| label: cell-dataretrival
# Necessary imports
import json
from datetime import datetime, timedelta, timezone
import requests
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xmltodict
import schedule
import time

## **Getting Data from USGS Earthquake API**

In [2]:
#| label: cell-dataretrival1
#get data of Earthquakes from API
def fetch_data():
    response = requests.get("https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson")
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
    # Convert JSON response to a dictionary
        data = response.json()
    else:
        print(f"Failed to fetch data from API. Status code: {response.status_code}")
    return data


In [3]:
#| label: cell-dataretrival2
data = fetch_data()

In [4]:
#| label: cell-dataretrival3
data["features"][0]

{'type': 'Feature',
 'properties': {'mag': 1.1,
  'place': '13 km N of Fishhook, Alaska',
  'time': 1718302410679,
  'updated': 1718302585284,
  'tz': None,
  'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/ak0247l6gx1g',
  'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=ak0247l6gx1g&format=geojson',
  'felt': None,
  'cdi': None,
  'mmi': None,
  'alert': None,
  'status': 'automatic',
  'tsunami': 0,
  'sig': 19,
  'net': 'ak',
  'code': '0247l6gx1g',
  'ids': ',ak0247l6gx1g,',
  'sources': ',ak,',
  'types': ',origin,phase-data,',
  'nst': None,
  'dmin': None,
  'rms': 0.85,
  'gap': None,
  'magType': 'ml',
  'type': 'earthquake',
  'title': 'M 1.1 - 13 km N of Fishhook, Alaska'},
 'geometry': {'type': 'Point', 'coordinates': [-149.2821, 61.8655, 180.4]},
 'id': 'ak0247l6gx1g'}

In [5]:
#| label: cell-dataretrival4
# Extract desired information
generated = data['metadata']['generated']
url = data['metadata']['url']
title = data['metadata']['title']
count = data['metadata']['count']

# Print extracted information
print("Generated:", generated)
print("URL:", url)
print("Title:", title)
print("Count:", count)

Generated: 1718303157000
URL: https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson
Title: USGS Earthquakes
Count: 10179


## **Automate the data**

In [6]:
#| label: cell-dataretrival5
# Schedule the job to run at 11:58 PM UTC every day
schedule.every().day.at("23:58").do(fetch_data)

schedule.run_pending()

# **Raw data to Pandas DataFrame**

In [7]:
#| label: cell-datatopandas
df = pd.json_normalize(data['features'])
df

Unnamed: 0,type,id,properties.mag,properties.place,properties.time,properties.updated,properties.tz,properties.url,properties.detail,properties.felt,...,properties.types,properties.nst,properties.dmin,properties.rms,properties.gap,properties.magType,properties.type,properties.title,geometry.type,geometry.coordinates
0,Feature,ak0247l6gx1g,1.10,"13 km N of Fishhook, Alaska",1718302410679,1718302585284,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",,,0.85,,ml,earthquake,"M 1.1 - 13 km N of Fishhook, Alaska",Point,"[-149.2821, 61.8655, 180.4]"
1,Feature,hv74275801,2.36,"2 km WSW of Pāhala, Hawaii",1718302194290,1718302323380,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",38.0,0.119600,0.11,188.0,md,earthquake,"M 2.4 - 2 km WSW of Pāhala, Hawaii",Point,"[-155.49983215332, 19.1961669921875, 32.590000..."
2,Feature,ci40797520,2.68,"12 km WSW of Delta, B.C., MX",1718301998380,1718302126916,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",nearby-cities,origin,phase-data,scitech-link,",22.0,0.501200,0.23,220.0,ml,earthquake,"M 2.7 - 12 km WSW of Delta, B.C., MX",Point,"[-115.303833, 32.2966652, 7.46]"
3,Feature,ak0247l63lv6,2.10,"73 km SE of Cantwell, Alaska",1718301080508,1718301218013,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",,,0.93,,ml,earthquake,"M 2.1 - 73 km SE of Cantwell, Alaska",Point,"[-147.8545, 62.9508, 11.3]"
4,Feature,ak0247l63fcl,3.20,"26 km W of Chenega, Alaska",1718301037287,1718302752417,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",,,0.78,,ml,earthquake,"M 3.2 - 26 km W of Chenega, Alaska",Point,"[-148.4755, 60.108, 8.6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10174,Feature,av93126556,1.56,"73 km WSW of Nikolski, Alaska",1715712228980,1715817075320,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",5.0,0.060290,0.07,268.0,ml,earthquake,"M 1.6 - 73 km WSW of Nikolski, Alaska",Point,"[-169.895166666667, 52.7368333333333, -1.07]"
10175,Feature,ci40582351,0.70,"8 km N of Anza, CA",1715711994330,1715721874210,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",focal-mechanism,nearby-cities,origin,phase-da...",27.0,0.041800,0.09,86.0,ml,earthquake,"M 0.7 - 8 km N of Anza, CA",Point,"[-116.6728333, 33.627, 14.25]"
10176,Feature,hv74230877,1.60,"5 km NNE of Pāhala, Hawaii",1715711873890,1715736684390,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",11.0,0.031000,0.11,93.0,md,earthquake,"M 1.6 - 5 km NNE of Pāhala, Hawaii",Point,"[-155.466166666667, 19.2481666666667, 33.67]"
10177,Feature,ok2024jmgh,1.11,"3 km SSE of Mill Creek, Oklahoma",1715711839429,1715791212738,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",15.0,0.230355,0.39,145.0,ml,quarry blast,"M 1.1 Quarry Blast - 3 km SSE of Mill Creek, O...",Point,"[-96.81083333, 34.37583333, 0]"


# **Data Enrichment**

In [8]:
#| label: cell-dataenrichment
df_new = df.copy()
df_new.head()

Unnamed: 0,type,id,properties.mag,properties.place,properties.time,properties.updated,properties.tz,properties.url,properties.detail,properties.felt,...,properties.types,properties.nst,properties.dmin,properties.rms,properties.gap,properties.magType,properties.type,properties.title,geometry.type,geometry.coordinates
0,Feature,ak0247l6gx1g,1.1,"13 km N of Fishhook, Alaska",1718302410679,1718302585284,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",,,0.85,,ml,earthquake,"M 1.1 - 13 km N of Fishhook, Alaska",Point,"[-149.2821, 61.8655, 180.4]"
1,Feature,hv74275801,2.36,"2 km WSW of Pāhala, Hawaii",1718302194290,1718302323380,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",38.0,0.1196,0.11,188.0,md,earthquake,"M 2.4 - 2 km WSW of Pāhala, Hawaii",Point,"[-155.49983215332, 19.1961669921875, 32.590000..."
2,Feature,ci40797520,2.68,"12 km WSW of Delta, B.C., MX",1718301998380,1718302126916,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",nearby-cities,origin,phase-data,scitech-link,",22.0,0.5012,0.23,220.0,ml,earthquake,"M 2.7 - 12 km WSW of Delta, B.C., MX",Point,"[-115.303833, 32.2966652, 7.46]"
3,Feature,ak0247l63lv6,2.1,"73 km SE of Cantwell, Alaska",1718301080508,1718301218013,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",,,0.93,,ml,earthquake,"M 2.1 - 73 km SE of Cantwell, Alaska",Point,"[-147.8545, 62.9508, 11.3]"
4,Feature,ak0247l63fcl,3.2,"26 km W of Chenega, Alaska",1718301037287,1718302752417,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",,,0.78,,ml,earthquake,"M 3.2 - 26 km W of Chenega, Alaska",Point,"[-148.4755, 60.108, 8.6]"


In [9]:
#| label: cell-dataenrichment1
df_new['properties.time'] = pd.to_datetime(df_new['properties.time'], unit='ms')

In [10]:
#| label: cell-dataenrichment2
df_new['properties.updated'] = pd.to_datetime(df_new['properties.updated'], unit='ms')

In [11]:
#| label: cell-dataenrichment3
df_new.head()

Unnamed: 0,type,id,properties.mag,properties.place,properties.time,properties.updated,properties.tz,properties.url,properties.detail,properties.felt,...,properties.types,properties.nst,properties.dmin,properties.rms,properties.gap,properties.magType,properties.type,properties.title,geometry.type,geometry.coordinates
0,Feature,ak0247l6gx1g,1.1,"13 km N of Fishhook, Alaska",2024-06-13 18:13:30.679,2024-06-13 18:16:25.284,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",,,0.85,,ml,earthquake,"M 1.1 - 13 km N of Fishhook, Alaska",Point,"[-149.2821, 61.8655, 180.4]"
1,Feature,hv74275801,2.36,"2 km WSW of Pāhala, Hawaii",2024-06-13 18:09:54.290,2024-06-13 18:12:03.380,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",38.0,0.1196,0.11,188.0,md,earthquake,"M 2.4 - 2 km WSW of Pāhala, Hawaii",Point,"[-155.49983215332, 19.1961669921875, 32.590000..."
2,Feature,ci40797520,2.68,"12 km WSW of Delta, B.C., MX",2024-06-13 18:06:38.380,2024-06-13 18:08:46.916,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",nearby-cities,origin,phase-data,scitech-link,",22.0,0.5012,0.23,220.0,ml,earthquake,"M 2.7 - 12 km WSW of Delta, B.C., MX",Point,"[-115.303833, 32.2966652, 7.46]"
3,Feature,ak0247l63lv6,2.1,"73 km SE of Cantwell, Alaska",2024-06-13 17:51:20.508,2024-06-13 17:53:38.013,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",,,0.93,,ml,earthquake,"M 2.1 - 73 km SE of Cantwell, Alaska",Point,"[-147.8545, 62.9508, 11.3]"
4,Feature,ak0247l63fcl,3.2,"26 km W of Chenega, Alaska",2024-06-13 17:50:37.287,2024-06-13 18:19:12.417,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,",origin,phase-data,",,,0.78,,ml,earthquake,"M 3.2 - 26 km W of Chenega, Alaska",Point,"[-148.4755, 60.108, 8.6]"


### Extract Longitude, latitude and date data from pd

In [12]:
#| label: cell-dataenrichment4
# Split 'geometry.coordinates' into separate columns for longitude and latitude
df_new[['longitude', 'latitude', 'altitude']] = pd.DataFrame(df_new['geometry.coordinates'].tolist()).copy()

# Extract date from 'properties.time'
df_new['date'] = pd.to_datetime(df_new['properties.time']).dt.date.copy()
df_new['time'] = pd.to_datetime(df_new['properties.time']).dt.time.copy()


In [13]:
#| label: cell-dataenrichment5
# Extract 'longitude', 'latitude', and 'date' from df_new_filtered
data_list = df_new[['longitude', 'latitude', 'date']].values.tolist()

In [14]:
#| label: cell-dataenrichment6
data_list[10]

[-176.6682, -22.8177, datetime.date(2024, 6, 13)]

### **Getting Data from sunrisesunset.io api**

In [15]:
#| label: cell-dataenrichment7
import aiohttp
import asyncio

# Function to fetch sunrise and sunset data from the API asynchronously
async def fetch_sunrise_sunset_async(session, longitude, latitude, date):
    url = f"https://api.sunrisesunset.io/json?lat={latitude}&lng={longitude}&date={date}"
    try:
        async with session.get(url) as response:
            data = await response.json()
            if data:
                data['longitude'] = longitude
                data['latitude'] = latitude
            return data
    except Exception as e:
        print("Error fetching sunrise/sunset data:", e)
        return None

async def fetch_sunrise_sunset_batch_async(entries):
    api_data_batch = []
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_sunrise_sunset_async(session, *entry) for entry in entries]
        api_data_batch = await asyncio.gather(*tasks)
    return api_data_batch

batch_size = 500

api_data_list = []
for i in range(0, len(data_list), batch_size):
    batch = data_list[i:i+batch_size]
    api_data_batch = await fetch_sunrise_sunset_batch_async(batch)
    api_data_list.extend(api_data_batch)

In [16]:
#| label: cell-dataenrichment8
print(len(api_data_list))

10179


In [17]:
#| label: cell-dataenrichment9
api_data_df = pd.DataFrame(api_data_list)
results_df = pd.json_normalize(api_data_df['results'])
api_data_df = pd.concat([api_data_df.drop(columns=['results']), results_df], axis=1)

In [18]:
#| label: cell-dataenrichment10
api_data_df.head()

Unnamed: 0,status,longitude,latitude,date,sunrise,sunset,first_light,last_light,dawn,dusk,solar_noon,golden_hour,day_length,timezone,utc_offset
0,OK,-149.2821,61.8655,2024-06-13,4:12:02 AM,11:44:57 PM,,,,,1:58:29 PM,10:14:47 PM,19:32:55,America/Anchorage,-480
1,OK,-155.499832,19.196167,2024-06-13,5:45:09 AM,7:01:35 PM,4:21:59 AM,8:24:45 PM,5:20:48 AM,7:25:56 PM,12:23:22 PM,6:29:58 PM,13:16:25,Pacific/Honolulu,-600
2,OK,-115.303833,32.296665,2024-06-13,5:35:14 AM,7:49:53 PM,3:54:45 AM,9:30:22 PM,5:06:54 AM,8:18:13 PM,12:42:33 PM,7:13:53 PM,14:14:38,America/Tijuana,-420
3,OK,-147.8545,62.9508,2024-06-13,3:49:02 AM,11:56:31 PM,,,,,1:52:47 PM,10:17:11 PM,20:07:29,America/Anchorage,-480
4,OK,-148.4755,60.108,2024-06-13,4:31:13 AM,11:19:18 PM,,,2:47:56 AM,1:02:35 AM,1:55:16 PM,9:59:52 PM,18:48:04,America/Anchorage,-480


### Merging both dataframes

In [19]:
#| label: cell-dataenrichment11
# Convert 'date' column to datetime format in both DataFrames
df_new['date'] = pd.to_datetime(df_new['date'])
api_data_df['date'] = pd.to_datetime(api_data_df['date'])

In [20]:
#| label: cell-dataenrichment12
# Merge df_new_filtered with api_data_df on 'longitude', 'latitude', and 'date'
merged_data = pd.merge(df_new, api_data_df, 
                       on=['longitude', 'latitude', 'date'], 
                       how='inner')

In [21]:
#| label: cell-dataenrichment13
merged_data

Unnamed: 0,type,id,properties.mag,properties.place,properties.time,properties.updated,properties.tz,properties.url,properties.detail,properties.felt,...,sunset,first_light,last_light,dawn,dusk,solar_noon,golden_hour,day_length,timezone,utc_offset
0,Feature,ak0247l6gx1g,1.10,"13 km N of Fishhook, Alaska",2024-06-13 18:13:30.679,2024-06-13 18:16:25.284,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,11:44:57 PM,,,,,1:58:29 PM,10:14:47 PM,19:32:55,America/Anchorage,-480
1,Feature,hv74275801,2.36,"2 km WSW of Pāhala, Hawaii",2024-06-13 18:09:54.290,2024-06-13 18:12:03.380,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,7:01:35 PM,4:21:59 AM,8:24:45 PM,5:20:48 AM,7:25:56 PM,12:23:22 PM,6:29:58 PM,13:16:25,Pacific/Honolulu,-600
2,Feature,ci40797520,2.68,"12 km WSW of Delta, B.C., MX",2024-06-13 18:06:38.380,2024-06-13 18:08:46.916,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,7:49:53 PM,3:54:45 AM,9:30:22 PM,5:06:54 AM,8:18:13 PM,12:42:33 PM,7:13:53 PM,14:14:38,America/Tijuana,-420
3,Feature,ak0247l63lv6,2.10,"73 km SE of Cantwell, Alaska",2024-06-13 17:51:20.508,2024-06-13 17:53:38.013,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,11:56:31 PM,,,,,1:52:47 PM,10:17:11 PM,20:07:29,America/Anchorage,-480
4,Feature,ak0247l63fcl,3.20,"26 km W of Chenega, Alaska",2024-06-13 17:50:37.287,2024-06-13 18:19:12.417,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,11:19:18 PM,,,2:47:56 AM,1:02:35 AM,1:55:16 PM,9:59:52 PM,18:48:04,America/Anchorage,-480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10190,Feature,av93126556,1.56,"73 km WSW of Nikolski, Alaska",2024-05-14 18:43:48.980,2024-05-15 23:51:15.320,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,10:08:45 PM,3:02:35 AM,1:31:59 AM,5:42:47 AM,10:51:47 PM,2:17:17 PM,9:17:33 PM,15:42:55,America/Adak,-540
10191,Feature,ci40582351,0.70,"8 km N of Anza, CA",2024-05-14 18:39:54.330,2024-05-14 21:24:34.210,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,7:40:20 PM,4:12:41 AM,9:16:07 PM,5:21:04 AM,8:07:43 PM,12:44:24 PM,7:05:14 PM,13:51:51,America/Los_Angeles,-420
10192,Feature,hv74230877,1.60,"5 km NNE of Pāhala, Hawaii",2024-05-14 18:37:53.890,2024-05-15 01:31:24.390,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,6:50:17 PM,4:29:20 AM,8:09:49 PM,5:25:24 AM,7:13:44 PM,12:19:34 PM,6:19:42 PM,13:01:25,Pacific/Honolulu,-600
10193,Feature,ok2024jmgh,1.11,"3 km SSE of Mill Creek, Oklahoma",2024-05-14 18:37:19.429,2024-05-15 16:40:12.738,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,...,8:22:23 PM,4:50:25 AM,9:59:28 PM,5:59:49 AM,8:50:05 PM,1:24:57 PM,7:46:57 PM,13:54:52,America/Chicago,-300


# **Data Cleaning**

In the code below, we're cleaning the data by dropping unnecessary columns that are not required for visualizations. This is achieved using the drop function in pandas. Additionally, we're addressing duplicate rows and null values to ensure data integrity. Duplicate rows are removed using the drop_duplicates function, while null values are handled using either the dropna function to remove rows with null values or by imputing missing values with meaningful replacements. This cleaning strategy ensures that the dataset is properly prepared for further analysis and visualization.

In [22]:
#| label: cell-datacleaning
merged_data.dtypes

type                            object
id                              object
properties.mag                 float64
properties.place                object
properties.time         datetime64[ns]
properties.updated      datetime64[ns]
properties.tz                   object
properties.url                  object
properties.detail               object
properties.felt                float64
properties.cdi                 float64
properties.mmi                 float64
properties.alert                object
properties.status               object
properties.tsunami               int64
properties.sig                   int64
properties.net                  object
properties.code                 object
properties.ids                  object
properties.sources              object
properties.types                object
properties.nst                 float64
properties.dmin                float64
properties.rms                 float64
properties.gap                 float64
properties.magType       

In [23]:
#| label: cell-datacleaning1
columns_to_drop = ['type', 'id', 'properties.place', 'properties.time','properties.updated','properties.felt', 'properties.alert','properties.tz','properties.cdi','properties.mmi',
                   'properties.url', 'properties.detail', 'properties.status','properties.types','properties.code', 'properties.ids','geometry.type', 
                   'geometry.coordinates','status', 'timezone', 'first_light','last_light','solar_noon','utc_offset']

merged_data = merged_data.drop(columns=columns_to_drop)


In [24]:
#| label: cell-datacleaning2
def handle_duplicates(df):
    num_duplicates_before = df.duplicated().sum()
    df.drop_duplicates(inplace=True)
    num_duplicates_after = df.duplicated().sum()
    return df, num_duplicates_before, num_duplicates_after


merged_data, num_duplicates_before, num_duplicates_after = handle_duplicates(merged_data)
print("Number of duplicate rows before removing duplicates:", num_duplicates_before)
print("Number of duplicate rows after removing duplicates:", num_duplicates_after)


Number of duplicate rows before removing duplicates: 16
Number of duplicate rows after removing duplicates: 0


In [25]:
#| label: cell-datacleaning3
merged_data.isnull().sum()

properties.mag           0
properties.tsunami       0
properties.sig           0
properties.net           0
properties.sources       0
properties.nst        1407
properties.dmin       1410
properties.rms           0
properties.gap        1407
properties.magType       0
properties.type          0
properties.title         0
longitude                0
latitude                 0
altitude                 0
date                     0
time                     0
sunrise                 25
sunset                  25
dawn                   636
dusk                   636
golden_hour             11
day_length               0
dtype: int64

In [26]:
#| label: cell-datacleaning4
merged_data.dropna(inplace=True)

In [27]:
#| label: cell-datacleaning5
merged_data.isnull().sum()

properties.mag        0
properties.tsunami    0
properties.sig        0
properties.net        0
properties.sources    0
properties.nst        0
properties.dmin       0
properties.rms        0
properties.gap        0
properties.magType    0
properties.type       0
properties.title      0
longitude             0
latitude              0
altitude              0
date                  0
time                  0
sunrise               0
sunset                0
dawn                  0
dusk                  0
golden_hour           0
day_length            0
dtype: int64

In [28]:
#| label: cell-datacleaning6
merged_data.rename(columns={
    'properties.mag': 'Magnitude',
    'properties.tsunami': 'Tsunami_Alert',
    'properties.sig': 'Significance',
    'properties.net': 'Network',
    'properties.sources': 'Sources',
    'properties.nst': 'Number_of_Stations',
    'properties.dmin': 'Depth_Minimum',
    'properties.rms': 'Root_Mean_Square',
    'properties.gap': 'Gap',
    'properties.magType': 'Magnitude_Type',
    'properties.type': 'Event_Type',
    'properties.title': 'Event_Title'
}, inplace=True)

In [29]:
#| label: cell-datacleaning7
merged_data.head()

Unnamed: 0,Magnitude,Tsunami_Alert,Significance,Network,Sources,Number_of_Stations,Depth_Minimum,Root_Mean_Square,Gap,Magnitude_Type,...,latitude,altitude,date,time,sunrise,sunset,dawn,dusk,golden_hour,day_length
1,2.36,0,86,hv,",hv,",38.0,0.1196,0.11,188.0,md,...,19.196167,32.59,2024-06-13,18:09:54.290000,5:45:09 AM,7:01:35 PM,5:20:48 AM,7:25:56 PM,6:29:58 PM,13:16:25
2,2.68,0,110,ci,",ci,",22.0,0.5012,0.23,220.0,ml,...,32.296665,7.46,2024-06-13,18:06:38.380000,5:35:14 AM,7:49:53 PM,5:06:54 AM,8:18:13 PM,7:13:53 PM,14:14:38
5,2.5,0,96,us,",us,",12.0,0.129,0.09,257.0,ml,...,20.1376,35.246,2024-06-13,17:40:08.403000,5:43:46 AM,7:03:56 PM,5:19:13 AM,7:28:29 PM,6:32:06 PM,13:20:09
7,1.98,0,60,hv,",hv,",30.0,0.05885,0.17,60.0,ml,...,19.470667,1.47,2024-06-13,17:21:37.670000,5:44:26 AM,7:01:57 PM,5:20:01 AM,7:26:21 PM,6:30:16 PM,13:17:30
9,3.04,0,144,hv,",hv,us,",58.0,0.04547,0.19,93.0,ml,...,19.341499,4.98,2024-06-13,17:18:11.710000,5:43:26 AM,7:00:26 PM,5:19:02 AM,7:24:49 PM,6:28:47 PM,13:16:59


# **Test data quality**

To ensure the quality and integrity of our data, we implemented a series of tests and sanity checks. Firstly, we verified the absence of missing values in the tidied DataFrame using assert statements, confirming that no data was missing. Additionally, we conducted tests to ensure a reasonable number of rows in the DataFrame and to catch any potential data processing issues or errors. Furthermore, we checked for duplicate rows to maintain data integrity and tidiness, confirming that there were no duplicate entries. These tests were crucial in ensuring the reliability of our data for subsequent analysis and visualization.

In [30]:
#| label: cell-dataquality
# Verify no missing values
assert merged_data.isnull().sum().sum() == 0, "There are missing values in the tidied DataFrame."

# Verify reasonable number of rows
assert len(merged_data) > 0, "The tidied DataFrame is empty."
assert len(merged_data) < 10000, "The number of rows in the tidied DataFrame seems too large. Check if there's a data processing issue."

# Check for duplicates
assert len(merged_data) == len(merged_data.drop_duplicates()), "Duplicate rows exist in the tidied DataFrame."


# **Store your data in at least 3 different formats**

Storing data in multiple formats, such as SQLite3 databases, CSV files, and Excel spreadsheets, offers versatility and accessibility across different platforms and use cases. SQLite3 databases provide efficient querying and manipulation capabilities, ideal for applications requiring frequent data retrieval. CSV files ensure compatibility with various tools and platforms, facilitating easy data exchange and processing. Excel spreadsheets offer advanced formatting and visualization options, suitable for creating reports and dashboards. By employing these formats, we enhance data accessibility, interoperability, and usability, catering to diverse user preferences and requirements in data analysis, visualization, and reporting workflows.

In [31]:
#| label: cell-datastore
# Save as CSV
merged_data.to_csv('cleaned_data.csv', index=False)

In [32]:
#| label: cell-datastore1
# Save as Excel
merged_data.to_excel('cleaned_data.xlsx', index=False)

In [33]:
#| label: cell-datastore2
import sqlite3
conn = sqlite3.connect('merged_data.db')
merged_data.to_sql('merged_data_table', conn, index=False, if_exists='replace')
conn.commit()
conn.close()

print("Data successfully stored in SQLite database.")


Data successfully stored in SQLite database.
