In [17]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Store CSV into Dataframe 

In [18]:
eruptions = "Resources/eruptions.csv"
eruptions_data = pd.read_csv(eruptions)

volcano = "Resources/volcano.csv"
volcano_df = pd.read_csv(volcano)

events = "Resources/events.csv"
events_df = pd.read_csv(events)

damage = "Resources/damage.csv"
damage_df = pd.read_csv(damage)


Transform eruptions dataframe

In [20]:
eruptions_data.dropna(subset=['start_year'], how='all', inplace=True)
eruptions_data['start_month'] = eruptions_data['start_month'].fillna(1)
eruptions_data['start_day'] = eruptions_data['start_day'].fillna(1)

eruptions_data.drop(eruptions_data[eruptions_data.start_year < 1800].index, inplace=True)
eruptions_data

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_category,area_of_activity,vei,start_year,start_month,start_day,evidence_method_dating,end_year,end_month,end_day,latitude,longitude
0,266030,Soputan,22354,Confirmed Eruption,,,2020.0,3.0,23.0,Historical Observations,2020.0,4.0,2.0,1.112,124.737
1,343100,San Miguel,22355,Confirmed Eruption,,,2020.0,2.0,22.0,Historical Observations,2020.0,2.0,22.0,13.434,-88.269
2,233020,"Fournaise, Piton de la",22343,Confirmed Eruption,,,2020.0,2.0,10.0,Historical Observations,2020.0,4.0,6.0,-21.244,55.708
3,345020,Rincon de la Vieja,22346,Confirmed Eruption,,,2020.0,1.0,31.0,Historical Observations,2020.0,4.0,17.0,10.830,-85.324
4,353010,Fernandina,22347,Confirmed Eruption,,,2020.0,1.0,12.0,Historical Observations,2020.0,1.0,12.0,-0.370,-91.550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6216,252080,Witori,15022,Confirmed Eruption,,4.0,1800.0,0.0,0.0,Tephrochronology,,,,-5.576,150.516
6217,241030,Taranaki,13348,Confirmed Eruption,,,1800.0,0.0,0.0,Tephrochronology,,,,-39.300,174.070
6218,390030,Deception Island,13182,Confirmed Eruption,N side caldera bay (near Telefon Bay),,1800.0,0.0,0.0,Historical Observations,,,,-63.001,-60.652
6219,344120,Concepcion,11046,Uncertain Eruption,,2.0,1800.0,0.0,0.0,,,,,11.538,-85.622


In [21]:
eruptions_data.dropna(subset=['end_year'], how='all', inplace=True)
eruptions_data['end_month'] = eruptions_data['end_month'].fillna(1)
eruptions_data['end_day'] = eruptions_data['end_day'].fillna(1)

In [22]:
#Converting start date columns to strings and concatenating them to a datetime value.

eruptions_data['start_year'] = eruptions_data['start_year'].apply(lambda x: int(x))
eruptions_data['start_month'] = eruptions_data['start_month'].apply(lambda x: int(x))
eruptions_data['start_day'] = eruptions_data['start_day'].apply(lambda x: int(x))

eruptions_data['start_year'] = eruptions_data['start_year'].astype(str)
eruptions_data['start_month'] = eruptions_data['start_month'].astype(str)
eruptions_data['start_day'] = eruptions_data['start_day'].astype(str)

eruptions_data["start date"] = eruptions_data["start_year"] + '.' + eruptions_data["start_month"] + '.' + eruptions_data["start_day"]

In [23]:
#Converting end date columns to strings and concatenating them to a datetime value.
eruptions_data['end_year'] = eruptions_data['end_year'].apply(lambda x: int(x))
eruptions_data['end_month'] = eruptions_data['end_month'].apply(lambda x: int(x))
eruptions_data['end_day'] = eruptions_data['end_day'].apply(lambda x: int(x))

eruptions_data['end_year'] = eruptions_data['end_year'].astype(str)
eruptions_data['end_month'] = eruptions_data['end_month'].astype(str)
eruptions_data['end_day'] = eruptions_data['end_day'].astype(str)

eruptions_data["end date"] = eruptions_data["end_year"] + '.' + eruptions_data["end_month"] + '.' + eruptions_data["end_day"]

In [28]:
#Dropping uneccessary columns
clean_eruption_data = eruptions_data.drop(columns=['area_of_activity', 'evidence_method_dating','vei','eruption_category','start_year','start_month','start_day','end_year','end_month','end_day'])
clean_eruption_data

Unnamed: 0,volcano_number,volcano_name,eruption_number,latitude,longitude,start date,end date
0,266030,Soputan,22354,1.112,124.737,2020.3.23,2020.4.2
1,343100,San Miguel,22355,13.434,-88.269,2020.2.22,2020.2.22
2,233020,"Fournaise, Piton de la",22343,-21.244,55.708,2020.2.10,2020.4.6
3,345020,Rincon de la Vieja,22346,10.830,-85.324,2020.1.31,2020.4.17
4,353010,Fernandina,22347,-0.370,-91.550,2020.1.12,2020.1.12
...,...,...,...,...,...,...,...
6202,233020,"Fournaise, Piton de la",14252,-21.244,55.708,1800.11.2,1800.11.8
6203,273030,Mayon,13603,13.257,123.685,1800.10.30,1800.10.31
6204,353020,Wolf,11682,0.020,-91.350,1800.8.21,1800.8.21
6205,382030,San Jorge,12993,38.650,-28.080,1800.6.24,1800.6.25


Transform volcano dataframe

In [25]:
# Step 1: Getting all data from 1800 onwards ( last eruption date after 1800)

#there are some string "Unknown" in last eruption year, change it into 0
volcano_df2=volcano_df.replace(to_replace="Unknown",value="0")

#convert the last eruption year to integer 
volcano_df2["last_eruption_year"] = volcano_df2["last_eruption_year"].astype(int)

#then choose years greater than 1800 
volcano_df3=volcano_df2.loc[volcano_df2['last_eruption_year'] > 1800]

In [26]:
# Step 2: clean data--Remove columns we dont need and set primary key Volcano_number as index

# set primary key Volcano_number as index
volcano_df3.set_index("volcano_number", inplace=True)

#remove irrelevant columns
volcano_df4=volcano_df3.drop(columns=["primary_volcano_type","subregion","elevation","evidence_category","tectonic_settings","major_rock_1",  "major_rock_2","major_rock_3","major_rock_4","major_rock_5" ,"minor_rock_1","minor_rock_2","minor_rock_3","minor_rock_4","minor_rock_5"])


In [27]:
# Step 3: Getting average population by country within differnt distance 
country_5km=(volcano_df4.groupby('country')['population_within_5_km'].sum()).to_frame()
country_10km=(volcano_df4.groupby('country')['population_within_10_km'].sum()).to_frame()
country_30km=(volcano_df4.groupby('country')['population_within_30_km'].sum()).to_frame()
country_100km=(volcano_df4.groupby('country')['population_within_100_km'].sum()).to_frame()
#getting result into a dataframe then drop rows with all 0/NAN 
population_df=pd.concat([country_5km, country_10km, country_30km,country_100km]).fillna(0.0)


Connect to local database

Load DataFrames into database

In [None]:
#volcano_df4.to_sql(name='volcano', con=engine, if_exists='append', index=True)
#population_df.to_sql(name='population_by_country', con=engine, if_exists='append', index=True)