In [1]:
import pandas as pd
import datetime as dt

# Suicide Data

In [2]:
# Age-standardized suicide rates (suicide deaths per 100k population).
suicide_data = pd.read_csv(filepath_or_buffer='suicide_data.csv')

In [3]:
# read_csv already returns a DataFrame, and wrapping a DataFrame in
# pd.DataFrame() does not copy by default. Take an explicit copy so the
# cleaning steps below work on data independent of the raw `suicide_data`.
df = suicide_data.copy()

In [4]:
# Preview the first five rows of the suicide table.
df.head(n=5)

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,ATG,Antigua and Barbuda,Year,2019,...,,,0.0,,0.0,0 [0 – 0],,,EN,2021-02-09T06:00:00.000Z
1,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,BRB,Barbados,Year,2019,...,,,0.11,,0.22,0.16 [0.11 – 0.22],,,EN,2021-02-09T06:00:00.000Z
2,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,BRB,Barbados,Year,2019,...,,,0.22,,0.42,0.31 [0.22 – 0.42],,,EN,2021-02-09T06:00:00.000Z
3,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,ATG,Antigua and Barbuda,Year,2019,...,,,0.22,,0.45,0.32 [0.22 – 0.45],,,EN,2021-02-09T06:00:00.000Z
4,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,BRB,Barbados,Year,2019,...,,,0.34,,0.65,0.49 [0.34 – 0.65],,,EN,2021-02-09T06:00:00.000Z


## Data Cleaning

In [5]:
column_names = df.columns.tolist()

# Survey the distinct values of every column to get a sense of what it holds.
for column in column_names:
    distinct_values = df[column].unique()
    print(f"{column}   {distinct_values}")

IndicatorCode   ['MH_12']
Indicator   ['Age-standardized suicide rates (per 100 000 population)']
ValueType   ['numeric']
ParentLocationCode   ['AMR' 'EMR' 'WPR' 'SEAR' 'EUR' 'AFR']
ParentLocation   ['Americas' 'Eastern Mediterranean' 'Western Pacific' 'South-East Asia'
 'Europe' 'Africa']
Location type   ['Country']
SpatialDimValueCode   ['ATG' 'BRB' 'GRD' 'VCT' 'VEN' 'KWT' 'SYR' 'BRN' 'MDV' 'JOR' 'HND' 'PAN'
 'JAM' 'ARM' 'CYP' 'MMR' 'OMN' 'STP' 'IDN' 'TUR' 'BHS' 'PHL' 'SYC' 'PER'
 'GRC' 'LCA' 'AZE' 'BGD' 'LBN' 'QAT' 'COL' 'GHA' 'BLZ' 'TUN' 'DOM' 'DZA'
 'SAU' 'NIC' 'PNG' 'CRI' 'MAR' 'NGA' 'NER' 'CUB' 'NZL' 'GTM' 'BRA' 'CAN'
 'SSD' 'AUT' 'GBR' 'UGA' 'MLI' 'VNM' 'BGR' 'MWI' 'PRK' 'MKD' 'SEN' 'KEN'
 'HRV' 'DNK' 'IND' 'SLV' 'ISL' 'HTI' 'SLE' 'AUS' 'COM' 'PRT' 'COG' 'HUN'
 'SUR' 'LUX' 'UZB' 'ECU' 'DJI' 'EST' 'BDI' 'MDA' 'SRB' 'JPN' 'GIN' 'SWE'
 'GNB' 'COD' 'NLD' 'AGO' 'ROU' 'SGP' 'BEN' 'DEU' 'LKA' 'FJI' 'TTO' 'GAB'
 'FSM' 'TCD' 'GMB' 'MDG' 'CHL' 'NOR' 'FIN' 'KOR' 'ZWE' 'ARG' 'BIH' 'KGZ'
 '

In [6]:
# Keep only the rows whose 'Dim1' value is 'Both sexes'.
# The rows to discard are selected by index *label* with a vectorized
# comparison, so this does not depend on the frame having a 0..n-1 index
# and avoids a per-row Python loop.
drop_index = df.index[df['Dim1'] != 'Both sexes']

df.drop(index=drop_index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [7]:
# Columns needed for the merge with the earthquake data.
SUICIDE_COLUMNS = [
    'Location',
    'Period',
    'FactValueNumeric',
    'FactValueNumericLow',
    'FactValueNumericHigh',
]

# .copy() makes suicide_df an independent frame rather than a slice of `df`,
# so modifying it later does not raise SettingWithCopyWarning.
suicide_df = df[SUICIDE_COLUMNS].copy()

In [8]:
# Rename 'Period' to 'year' so it can serve as a merge key.
# Rebinding the result (instead of inplace=True) avoids mutating a frame
# that may be a slice of `df`, which is what triggers SettingWithCopyWarning.
suicide_df = suicide_df.rename(columns={"Period": "year"})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  suicide_df.rename(columns = {"Period": "year"}, inplace = True)


In [9]:
# Preview the first five rows of the trimmed suicide table.
suicide_df.head(n=5)

Unnamed: 0,Location,year,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
0,Barbados,2019,0.31,0.22,0.42
1,Antigua and Barbuda,2019,0.32,0.22,0.45
2,Grenada,2019,0.64,0.45,0.88
3,Saint Vincent and the Grenadines,2019,1.01,0.7,1.41
4,Jordan,2019,1.98,1.29,2.9


# Earthquake Data

In [10]:
# Raw earthquake records as read from disk.
earthquake_data = pd.read_csv('Earthquakes/earthquake_data_all_years.csv')

# read_csv already returns a DataFrame, and pd.DataFrame(frame) does not copy
# by default. Work on an explicit copy so cleaning never aliases the raw data.
earthquake_df = earthquake_data.copy()

In [11]:
# Preview the first five raw earthquake records.
earthquake_df.head(n=5)

Unnamed: 0,time,place,mag,type,geometry
0,1697495094131,Mariana Islands region,4.5,earthquake,"{'type': 'Point', 'coordinates': [147.5323, 17..."
1,1697482144060,"62 km S of Vinchina, Argentina",4.3,earthquake,"{'type': 'Point', 'coordinates': [-68.2561, -2..."
2,1697481199358,,5.0,earthquake,"{'type': 'Point', 'coordinates': [-67.1198, -2..."
3,1697471537407,"66 km SSE of Akutan, Alaska",4.3,earthquake,"{'type': 'Point', 'coordinates': [-165.4331, 5..."
4,1697471294804,Fiji region,4.1,earthquake,"{'type': 'Point', 'coordinates': [-177.6213, -..."


## Cleaning Data

In [12]:
# List the column labels available in the earthquake table.
earthquake_df.columns

Index(['time', 'place', 'mag', 'type', 'geometry'], dtype='object')

In [13]:
# Discard records without a 'place', remove the columns this analysis does not
# use, and renumber the rows.
# errors='ignore' keeps the cell re-runnable: on a second run 'type' and
# 'geometry' are already gone and a plain drop would raise KeyError.
# `subset` is given as a list, the form accepted by every pandas version.
earthquake_df = (
    earthquake_df
    .dropna(subset=['place'])
    .drop(columns=['type', 'geometry'], errors='ignore')
    .reset_index(drop=True)
)

In [14]:
# Derive the calendar year of each event from its 'time' value, which is
# treated as epoch milliseconds (the same assumption the data preview supports).
# The conversion is done numerically and in UTC:
#   * slicing the first 10 characters of the number is only correct for
#     13-digit values, so any timestamp before 2001-09-09 gave a wrong year;
#   * a local-time conversion makes the year depend on the machine's timezone.
years = (
    pd.to_datetime(earthquake_df['time'], unit='ms', utc=True)
    .dt.year
    .astype('int64')  # keep the integer dtype used for the merge key
)

earthquake_df['year'] = years

In [15]:
# Preview the earthquake records with the derived 'year' column.
earthquake_df.head(n=5)

Unnamed: 0,time,place,mag,year
0,1697495094131,Mariana Islands region,4.5,2023
1,1697482144060,"62 km S of Vinchina, Argentina",4.3,2023
2,1697471537407,"66 km SSE of Akutan, Alaska",4.3,2023
3,1697471294804,Fiji region,4.1,2023
4,1697469540781,"37 km W of Masachapa, Nicaragua",4.2,2023


In [16]:
# Build `locations`: one entry per retained earthquake row, holding the text
# after the last comma of 'place' (e.g. "62 km S of Vinchina, Argentina" ->
# "Argentina"). Rows whose place mentions "region" or "Ridge" are dropped.
locations = []
drop_index = []

# Iterate by index label so the labels collected in drop_index are valid for
# DataFrame.drop whatever the current index looks like.
for row_label, place in earthquake_df['place'].items():
    place = str(place)

    if ("region" in place) or ("Ridge" in place):
        drop_index.append(row_label)
        continue

    # No further "region" stripping is needed here: any place containing
    # "region" was skipped above, so the extracted suffix cannot contain it.
    # (The previous try / bare-except around that step never did anything
    # except hide unrelated errors.)
    locations.append(place.split(",")[-1].strip())

earthquake_df.drop(index=drop_index, inplace=True)
earthquake_df.reset_index(drop=True, inplace=True)

In [17]:
us_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Washington DC', 'Delaware', 'Florida', 'Georgia', 'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina',
    'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
    'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
    'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin',
    'Wyoming',
]

# Ordered substring rules for irregular location strings: the first rule with
# any keyword contained in the location wins and replaces it with the canonical
# name on the right. Order matters, so keep it as is.
# NOTE(review): "Sumatra" is kept as "Sumatra" and "Bay of Bengal" becomes
# "India" -- confirm these are the names wanted in the 'Location' merge key.
keyword_rules = [
    (("Afghanistan",), "Afghanistan"),
    (("Fiji",), "Fiji"),
    (("Ecuador",), "Ecuador"),
    (("New Zealand",), "New Zealand"),
    (("Taiwan",), "Taiwan"),
    (("Alaska", "WA"), "United States of America"),
    (("Guatemala",), "Guatemala"),
    (("Sumatra",), "Sumatra"),
    (("Bay of Bengal",), "India"),
    (("Italy",), "Italy"),
]

for position, raw_location in enumerate(locations):
    for keywords, canonical_name in keyword_rules:
        if any(keyword in raw_location for keyword in keywords):
            locations[position] = canonical_name
            break
    else:
        # No keyword rule applied: an exact US state name becomes the country.
        # NOTE(review): 'Georgia' is in us_states, so a location that is
        # exactly "Georgia" is always treated as the US state -- verify.
        if raw_location in us_states:
            locations[position] = 'United States of America'

In [18]:
# Attach the normalized location names as the 'Location' column
# (`locations` holds one entry per remaining row).
earthquake_df = earthquake_df.assign(Location=locations)

In [19]:
# Preview the earthquake records with the new 'Location' column.
earthquake_df.head(n=5)

Unnamed: 0,time,place,mag,year,Location
0,1697482144060,"62 km S of Vinchina, Argentina",4.3,2023,Argentina
1,1697471537407,"66 km SSE of Akutan, Alaska",4.3,2023,United States of America
2,1697469540781,"37 km W of Masachapa, Nicaragua",4.2,2023,Nicaragua
3,1697463051725,"36 km NNE of Hirara, Japan",4.7,2023,Japan
4,1697459227890,"159 km NNE of Cruz Bay, U.S. Virgin Islands",3.9,2023,U.S. Virgin Islands


In [20]:
# 'time' and 'place' have been condensed into 'year' and 'Location', so they
# are no longer needed. errors='ignore' keeps the cell re-runnable: a second
# run would otherwise raise KeyError because the columns are already gone.
earthquake_df = earthquake_df.drop(columns=['time', 'place'], errors='ignore')

In [21]:
# Preview the final earthquake table.
earthquake_df.head(n=5)

Unnamed: 0,mag,year,Location
0,4.3,2023,Argentina
1,4.3,2023,United States of America
2,4.2,2023,Nicaragua
3,4.7,2023,Japan
4,3.9,2023,U.S. Virgin Islands


# Combining Earthquake and Suicide Data

In [22]:
# Outer-join the two tables on country name and year: rows found in only one
# table are kept, with NaN in the columns that come from the other table.
merge_keys = ['Location', 'year']
earthquake_suicide_df = pd.merge(
    earthquake_df,
    suicide_df,
    how='outer',
    on=merge_keys,
)

In [23]:
# Display the merged table (pandas abbreviates long frames in the output).
earthquake_suicide_df

Unnamed: 0,mag,year,Location,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
0,4.3,2023,Argentina,,,
1,4.4,2023,Argentina,,,
2,4.3,2023,Argentina,,,
3,6.0,2023,Argentina,,,
4,4.4,2023,Argentina,,,
...,...,...,...,...,...,...
4309,,2000,Argentina,9.20,7.90,10.64
4310,,2000,Niger,9.51,5.15,16.28
4311,,2000,Guinea,9.73,5.57,15.96
4312,,2000,Ghana,9.75,6.46,14.30


In [24]:
# Which distinct years are present in the earthquake table?
earthquake_df.loc[:, 'year'].unique()

array([2023], dtype=int64)

In [25]:
# Spot-check the merged rows for a single country.
earthquake_suicide_df.loc[earthquake_suicide_df["Location"].eq('Japan')]

Unnamed: 0,mag,year,Location,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
75,4.7,2023,Japan,,,
76,5.7,2023,Japan,,,
77,5.2,2023,Japan,,,
78,4.6,2023,Japan,,,
79,4.8,2023,Japan,,,
80,4.9,2023,Japan,,,
81,5.0,2023,Japan,,,
82,4.4,2023,Japan,,,
83,5.1,2023,Japan,,,
84,4.5,2023,Japan,,,


In [26]:
# Print each row's magnitude next to its suicide rate to eyeball how many
# rows actually carry both values.
magnitudes = earthquake_suicide_df["mag"]
suicide_rates = earthquake_suicide_df["FactValueNumeric"]

for magnitude, suicide_rate in zip(magnitudes, suicide_rates):
    print(f'{magnitude}   {suicide_rate}')

4.3   nan
4.4   nan
4.3   nan
6.0   nan
4.4   nan
5.6   nan
4.2   nan
4.6   nan
4.2   nan
4.4   nan
4.0   nan
4.4   nan
4.4   nan
4.4   nan
4.1   nan
4.3   nan
4.5   nan
4.7   nan
4.3   nan
4.3   nan
6.4   nan
6.7   nan
3.6   nan
3.8   nan
5.6   nan
3.9   nan
4.2   nan
4.0   nan
4.34   nan
3.6   nan
3.9   nan
4.0   nan
4.1   nan
4.2   nan
4.1   nan
4.4   nan
3.9   nan
4.0   nan
4.0   nan
3.8   nan
4.3   nan
3.6   nan
3.6   nan
4.2   nan
3.6   nan
3.7   nan
4.6   nan
4.7   nan
3.9   nan
4.0   nan
3.7   nan
3.68   nan
3.9   nan
5.0   nan
4.0   nan
3.6   nan
3.8   nan
3.9   nan
3.9   nan
3.8   nan
4.1   nan
5.0   nan
3.7   nan
5.4   nan
3.7   nan
4.3   nan
4.4   nan
3.6   nan
3.8   nan
3.9   nan
3.7   nan
3.9   nan
4.2   nan
4.2   nan
4.6   nan
4.7   nan
5.7   nan
5.2   nan
4.6   nan
4.8   nan
4.9   nan
5.0   nan
4.4   nan
5.1   nan
4.5   nan
4.6   nan
4.4   nan
4.6   nan
4.5   nan
4.4   nan
4.4   nan
4.5   nan
4.5   nan
4.5   nan
4.5   nan
4.3   nan
4.4   nan
4.4   nan
4.6   nan
4.4   na