In [1]:
# Dependencies and Setup

import pandas as pd
import numpy as np

# Locations of the three source CSVs (relative paths, resolved against the
# working directory the notebook is run from).
eruptions, events, volcano = (
    "Resources/eruptions.csv",
    "Resources/events.csv",
    "Resources/volcano.csv",
)

# Load each file into its own DataFrame.
eruptions_data = pd.read_csv(eruptions)
events_df = pd.read_csv(events)
volcano_df = pd.read_csv(volcano)


In [7]:
# Clean the date parts so every remaining eruption has a complete start and
# end date that can later be turned into a string.
#
# * Rows without a start year or an end year are dropped.
# * A missing month or day is defaulted to 1.
# * A month or day of 0 is also replaced with 1 (presumably 0 marks an unknown
#   value in the source data -- confirm against the dataset documentation).
eruptions_data.dropna(subset=['start_year', 'end_year'], inplace=True)

# The result is assigned back to the frame rather than calling
# `eruptions_data[col].replace(..., inplace=True)`: that form mutates a
# temporary Series, which triggers a chained-assignment warning on recent
# pandas and leaves the frame unchanged when Copy-on-Write is enabled.
date_part_cols = ['start_month', 'start_day', 'end_month', 'end_day']
eruptions_data[date_part_cols] = (
    eruptions_data[date_part_cols]
    .fillna(1)
    .replace(0.0, 1)
)

eruptions_data

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_category,area_of_activity,vei,start_year,start_month,start_day,evidence_method_dating,end_year,end_month,end_day,latitude,longitude
0,266030,Soputan,22354,Confirmed Eruption,,,2020.0,3.0,23.0,Historical Observations,2020.0,4.0,2.0,1.112,124.737
1,343100,San Miguel,22355,Confirmed Eruption,,,2020.0,2.0,22.0,Historical Observations,2020.0,2.0,22.0,13.434,-88.269
2,233020,"Fournaise, Piton de la",22343,Confirmed Eruption,,,2020.0,2.0,10.0,Historical Observations,2020.0,4.0,6.0,-21.244,55.708
3,345020,Rincon de la Vieja,22346,Confirmed Eruption,,,2020.0,1.0,31.0,Historical Observations,2020.0,4.0,17.0,10.830,-85.324
4,353010,Fernandina,22347,Confirmed Eruption,,,2020.0,1.0,12.0,Historical Observations,2020.0,1.0,12.0,-0.370,-91.550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8795,211060,Etna,13619,Confirmed Eruption,,,10.0,1.0,1.0,Historical Observations,20.0,1.0,1.0,37.748,14.999
8811,211060,Etna,13616,Confirmed Eruption,East side ?,,-36.0,7.0,15.0,Historical Observations,-35.0,1.0,1.0,37.748,14.999
8940,211020,Vesuvius,10049,Confirmed Eruption,,3.0,-217.0,1.0,1.0,Historical Observations,-216.0,1.0,1.0,40.821,14.426
9046,211060,Etna,13500,Confirmed Eruption,South flank (Monte Arso),,-425.0,3.0,15.0,Historical Observations,-424.0,1.0,1.0,37.748,14.999


In [8]:
# Remove, in place, every eruption whose start year is earlier than 1800.

started_before_1800 = eruptions_data["start_year"] < 1800
eruptions_data.drop(index=eruptions_data.loc[started_before_1800].index, inplace=True)

In [9]:
# Build a "year.month.day" text column for the eruption start (e.g. "2020.3.23").
# The result is a plain string column, not a datetime.

# The date parts are floats at this point (2020.0, 3.0, ...). Cast them to int
# first so the text reads "2020" rather than "2020.0", then to str so the
# parts can be concatenated. `astype` is vectorized, unlike a per-element
# `.apply(lambda x: int(x))`.
start_parts = ['start_year', 'start_month', 'start_day']
eruptions_data[start_parts] = eruptions_data[start_parts].astype(int).astype(str)

eruptions_data["start date"] = eruptions_data["start_year"] + '.' + eruptions_data["start_month"] + '.' + eruptions_data["start_day"]

In [10]:
# Build a "year.month.day" text column for the eruption end (e.g. "2020.4.2").
# The result is a plain string column, not a datetime.

# The date parts are floats at this point (2020.0, 4.0, ...). Cast them to int
# first so the text reads "2020" rather than "2020.0", then to str so the
# parts can be concatenated. `astype` is vectorized, unlike a per-element
# `.apply(lambda x: int(x))`.
end_parts = ['end_year', 'end_month', 'end_day']
eruptions_data[end_parts] = eruptions_data[end_parts].astype(int).astype(str)

eruptions_data["end date"] = eruptions_data["end_year"] + '.' + eruptions_data["end_month"] + '.' + eruptions_data["end_day"]

In [11]:
# Drop the columns that are not needed further on; the result is a new frame
# and eruptions_data itself is left as it was.
columns_to_drop = [
    'area_of_activity',
    'evidence_method_dating',
    'vei',
    'eruption_category',
    'start_month',
    'start_day',
    'end_year',
    'end_month',
    'end_day',
]
clean_eruption_data = eruptions_data.drop(columns=columns_to_drop)

In [12]:
# Preview the cleaned frame; pandas shortens the display to the first and last rows.
clean_eruption_data

Unnamed: 0,volcano_number,volcano_name,eruption_number,start_year,latitude,longitude,start date,end date
0,266030,Soputan,22354,2020,1.112,124.737,2020.3.23,2020.4.2
1,343100,San Miguel,22355,2020,13.434,-88.269,2020.2.22,2020.2.22
2,233020,"Fournaise, Piton de la",22343,2020,-21.244,55.708,2020.2.10,2020.4.6
3,345020,Rincon de la Vieja,22346,2020,10.830,-85.324,2020.1.31,2020.4.17
4,353010,Fernandina,22347,2020,-0.370,-91.550,2020.1.12,2020.1.12
...,...,...,...,...,...,...,...,...
6202,233020,"Fournaise, Piton de la",14252,1800,-21.244,55.708,1800.11.2,1800.11.8
6203,273030,Mayon,13603,1800,13.257,123.685,1800.10.30,1800.10.31
6204,353020,Wolf,11682,1800,0.020,-91.350,1800.8.21,1800.8.21
6205,382030,San Jorge,12993,1800,38.650,-28.080,1800.6.24,1800.6.25


In [13]:
# Number of eruptions recorded per volcano, most frequent first
# (value_counts sorts by count in descending order by default).
(
    clean_eruption_data["volcano_name"]
    .value_counts()
    .to_frame()
)

Unnamed: 0,volcano_name
"Fournaise, Piton de la",118
Etna,110
Asosan,76
Klyuchevskoy,65
Kilauea,60
...,...
Mocho-Choshuenco,1
Visoke,1
Pantelleria,1
Mayotte,1


In [14]:
# Number of eruptions per start year, newest year first.
# NOTE: start_year holds year strings at this point, so the index sort is
# lexicographic; it matches chronological order only because every remaining
# year has four digits.
(
    clean_eruption_data["start_year"]
    .value_counts()
    .to_frame()
    .sort_index(ascending=False)
)

Unnamed: 0,start_year
2020,7
2019,29
2018,39
2017,35
2016,35
...,...
1804,1
1803,6
1802,4
1801,2


In [16]:
# Convert the "year.month.day" strings (e.g. "2020.3.23") into datetime64 columns.
# The format is given explicitly so parsing never depends on pandas guessing
# the layout from the data: the order is always year, month, day.
date_format = "%Y.%m.%d"
clean_eruption_data['start date'] = pd.to_datetime(clean_eruption_data['start date'], format=date_format)
clean_eruption_data['end date'] = pd.to_datetime(clean_eruption_data['end date'], format=date_format)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3932 entries, 0 to 6209
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   volcano_number   3932 non-null   int64         
 1   volcano_name     3932 non-null   object        
 2   eruption_number  3932 non-null   int64         
 3   start_year       3932 non-null   object        
 4   latitude         3932 non-null   float64       
 5   longitude        3932 non-null   float64       
 6   start date       3932 non-null   datetime64[ns]
 7   end date         3932 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int64(2), object(2)
memory usage: 276.5+ KB
