In [43]:
import pandas as pd
import datetime as dt

# Suicide Data

In [2]:
suicide_data = pd.read_csv('suicide_data.csv') # suicide deaths per 100k

In [3]:
df = pd.DataFrame(suicide_data)

In [4]:
df.head()

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,ATG,Antigua and Barbuda,Year,2019,...,,,0.0,,0.0,0 [0 – 0],,,EN,2021-02-09T06:00:00.000Z
1,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,BRB,Barbados,Year,2019,...,,,0.11,,0.22,0.16 [0.11 – 0.22],,,EN,2021-02-09T06:00:00.000Z
2,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,BRB,Barbados,Year,2019,...,,,0.22,,0.42,0.31 [0.22 – 0.42],,,EN,2021-02-09T06:00:00.000Z
3,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,ATG,Antigua and Barbuda,Year,2019,...,,,0.22,,0.45,0.32 [0.22 – 0.45],,,EN,2021-02-09T06:00:00.000Z
4,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,BRB,Barbados,Year,2019,...,,,0.34,,0.65,0.49 [0.34 – 0.65],,,EN,2021-02-09T06:00:00.000Z


## Data Cleaning

In [5]:
column_names = list(df.columns)

# print unique values for each column to get a sense of what they hold
for x in column_names:
    print(f"{x}   {df[x].unique()}")

IndicatorCode   ['MH_12']
Indicator   ['Age-standardized suicide rates (per 100 000 population)']
ValueType   ['numeric']
ParentLocationCode   ['AMR' 'EMR' 'WPR' 'SEAR' 'EUR' 'AFR']
ParentLocation   ['Americas' 'Eastern Mediterranean' 'Western Pacific' 'South-East Asia'
 'Europe' 'Africa']
Location type   ['Country']
SpatialDimValueCode   ['ATG' 'BRB' 'GRD' 'VCT' 'VEN' 'KWT' 'SYR' 'BRN' 'MDV' 'JOR' 'HND' 'PAN'
 'JAM' 'ARM' 'CYP' 'MMR' 'OMN' 'STP' 'IDN' 'TUR' 'BHS' 'PHL' 'SYC' 'PER'
 'GRC' 'LCA' 'AZE' 'BGD' 'LBN' 'QAT' 'COL' 'GHA' 'BLZ' 'TUN' 'DOM' 'DZA'
 'SAU' 'NIC' 'PNG' 'CRI' 'MAR' 'NGA' 'NER' 'CUB' 'NZL' 'GTM' 'BRA' 'CAN'
 'SSD' 'AUT' 'GBR' 'UGA' 'MLI' 'VNM' 'BGR' 'MWI' 'PRK' 'MKD' 'SEN' 'KEN'
 'HRV' 'DNK' 'IND' 'SLV' 'ISL' 'HTI' 'SLE' 'AUS' 'COM' 'PRT' 'COG' 'HUN'
 'SUR' 'LUX' 'UZB' 'ECU' 'DJI' 'EST' 'BDI' 'MDA' 'SRB' 'JPN' 'GIN' 'SWE'
 'GNB' 'COD' 'NLD' 'AGO' 'ROU' 'SGP' 'BEN' 'DEU' 'LKA' 'FJI' 'TTO' 'GAB'
 'FSM' 'TCD' 'GMB' 'MDG' 'CHL' 'NOR' 'FIN' 'KOR' 'ZWE' 'ARG' 'BIH' 'KGZ'
 '

In [6]:
suicide_df = df[['ParentLocation',
         'SpatialDimValueCode', 
         'Location',
         'Period',
         'Dim1',
         'FactValueNumeric',
         'FactValueNumericLow',
         'FactValueNumericHigh']]

In [7]:
suicide_df.head()

Unnamed: 0,ParentLocation,SpatialDimValueCode,Location,Period,Dim1,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
0,Americas,ATG,Antigua and Barbuda,2019,Male,0.0,0.0,0.0
1,Americas,BRB,Barbados,2019,Female,0.16,0.11,0.22
2,Americas,BRB,Barbados,2019,Both sexes,0.31,0.22,0.42
3,Americas,ATG,Antigua and Barbuda,2019,Both sexes,0.32,0.22,0.45
4,Americas,BRB,Barbados,2019,Male,0.49,0.34,0.65


# Earthquake Data

In [34]:
earthquake_data = pd.read_csv('Earthquakes/earthquake_data_all_years.csv')
earthquake_df = pd.DataFrame(earthquake_data)

In [35]:
earthquake_df.head()

Unnamed: 0,time,place,mag,type,geometry
0,1697495094131,Mariana Islands region,4.5,earthquake,"{'type': 'Point', 'coordinates': [147.5323, 17..."
1,1697482144060,"62 km S of Vinchina, Argentina",4.3,earthquake,"{'type': 'Point', 'coordinates': [-68.2561, -2..."
2,1697481199358,,5.0,earthquake,"{'type': 'Point', 'coordinates': [-67.1198, -2..."
3,1697471537407,"66 km SSE of Akutan, Alaska",4.3,earthquake,"{'type': 'Point', 'coordinates': [-165.4331, 5..."
4,1697471294804,Fiji region,4.1,earthquake,"{'type': 'Point', 'coordinates': [-177.6213, -..."


## Cleaning Data

In [36]:
earthquake_df.columns

Index(['time', 'place', 'mag', 'type', 'geometry'], dtype='object')

In [37]:
earthquake_df.dropna(subset = 'place', inplace = True)
earthquake_df.drop(columns = ['type', 'geometry'], inplace = True)
earthquake_df.reset_index(drop=True, inplace = True)

In [82]:
years = []

for row in range(earthquake_df.shape[0]):
    date = earthquake_df['time'][row]
    date = str(date)[:10]
    
    year = dt.date.fromtimestamp(int(date)).year
    years.append(year)

earthquake_df['year'] = years

In [83]:
earthquake_df.head()

Unnamed: 0,time,place,mag,locations,year
0,1697482144060,"62 km S of Vinchina, Argentina",4.3,Argentina,2023
1,1697471537407,"66 km SSE of Akutan, Alaska",4.3,United States of America,2023
2,1697469540781,"37 km W of Masachapa, Nicaragua",4.2,Nicaragua,2023
3,1697463051725,"36 km NNE of Hirara, Japan",4.7,Japan,2023
4,1697459227890,"159 km NNE of Cruz Bay, U.S. Virgin Islands",3.9,U.S. Virgin Islands,2023


In [77]:
dt.date.fromtimestamp(1697481199358)

OSError: [Errno 22] Invalid argument

In [79]:
str(1697507762)[:10]

'1697507762'

In [74]:
# dt.datetime(1697507762)
dt.date.fromtimestamp(1697507762).year


2023

In [38]:
locations = []
drop_index = []

for row in range(earthquake_df.shape[0]):
    place = earthquake_df['place'][row]
    
    if ("region" in place) or ("Ridge" in place):
        drop_index.append(row)
    else:

        location = str(place).split(",")[-1]
        location = location.strip()

        try:
            region_index = location.index('region')
            location = location[0: region_index].strip()
        except: ValueError

        locations.append(location)    
    
earthquake_df.drop(drop_index, inplace = True)
earthquake_df.reset_index(drop=True, inplace = True)

In [39]:
us_states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Washington DC', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

# Specific weird locations
for x in range(len(locations)):
    location = locations[x]
    
    if "Afghanistan" in location:
        locations[x] = "Afghanistan"
    elif "Fiji" in location:
        locations[x] = "Fiji"
    elif "Ecuador" in location:
        locations[x] = "Ecuador"
    elif "New Zealand" in location:
        locations[x] = "New Zealand"
    elif "Taiwan" in location:
        locations[x] = "Taiwan"
    elif "Alaska" in location or "WA" in location:
        locations[x] = "United States of America"
    elif "Guatemala" in location:
        locations[x] = "Guatemala"
    elif "Sumatra" in location:
        locations[x] = "Sumatra"
    elif "Bay of Bengal" in location:
        locations[x] = "India"
    elif "Italy" in location:
        locations[x] = "Italy"
        
    # Changing US states to United States of America
    
    elif location in us_states:
        locations[x] = 'United States of America'

In [40]:
earthquake_df['locations'] = locations

In [41]:
[print(x) for x in earthquake_df['locations'].unique()]

Argentina
United States of America
Nicaragua
Japan
U.S. Virgin Islands
CA
Fiji
Russia
Nepal
Canada
Dominican Republic
Chile
El Salvador
Puerto Rico
Indonesia
Afghanistan
Iran
New Zealand
Honduras
India
Papua New Guinea
Turkey
Taiwan
Greece
Mexico
Peru
Tajikistan
Timor Leste
Ecuador
Philippines
Solomon Islands
Banda Sea
Mongolia
Laos
east of the South Sandwich Islands
Tonga
Greenland Sea
Northern Mariana Islands
Panama
China
Guatemala
Kuril Islands
Micronesia
Vanuatu
south of Panama
South Africa
Iceland
off the coast of Central America
Bosnia and Herzegovina
off the coast of Oregon
Colombia
Kyrgyzstan
Spain
Morocco
west of Macquarie Island
Portugal
Wallis and Futuna
Indian Ocean Triple Junction
Davis Strait
Virgin Islands
Djibouti
Sumatra
Poland
Pakistan
Guam
Rwanda
Myanmar
New Caledonia
southern East Pacific Rise
Bolivia
Central Peru
Vietnam
eastern Greenland
south of the Kermadec Islands
central East Pacific Rise
southeast of the Loyalty Islands
Libya
West Chile Rise
Italy
Tanzania
Ca

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [42]:
earthquake_df.head()

Unnamed: 0,time,place,mag,locations
0,1697482144060,"62 km S of Vinchina, Argentina",4.3,Argentina
1,1697471537407,"66 km SSE of Akutan, Alaska",4.3,United States of America
2,1697469540781,"37 km W of Masachapa, Nicaragua",4.2,Nicaragua
3,1697463051725,"36 km NNE of Hirara, Japan",4.7,Japan
4,1697459227890,"159 km NNE of Cruz Bay, U.S. Virgin Islands",3.9,U.S. Virgin Islands
