In [1]:
import pandas as pd
import datetime as dt

# Suicide Data

In [2]:
suicide_data = pd.read_csv('suicide_data.csv') # suicide deaths per 100k

In [3]:
suicide_data.shape

(10980, 34)

In [4]:
df = pd.DataFrame(suicide_data)

In [5]:
df.head()

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,ATG,Antigua and Barbuda,Year,2019,...,,,0.0,,0.0,0 [0 – 0],,,EN,2021-02-09T06:00:00.000Z
1,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,BRB,Barbados,Year,2019,...,,,0.11,,0.22,0.16 [0.11 – 0.22],,,EN,2021-02-09T06:00:00.000Z
2,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,BRB,Barbados,Year,2019,...,,,0.22,,0.42,0.31 [0.22 – 0.42],,,EN,2021-02-09T06:00:00.000Z
3,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,ATG,Antigua and Barbuda,Year,2019,...,,,0.22,,0.45,0.32 [0.22 – 0.45],,,EN,2021-02-09T06:00:00.000Z
4,MH_12,Age-standardized suicide rates (per 100 000 po...,numeric,AMR,Americas,Country,BRB,Barbados,Year,2019,...,,,0.34,,0.65,0.49 [0.34 – 0.65],,,EN,2021-02-09T06:00:00.000Z


## Data Cleaning

In [6]:
column_names = list(df.columns)

# print unique values for each column to get a sense of what they hold
for x in column_names:
    print(f"{x}   {df[x].unique()}")

IndicatorCode   ['MH_12']
Indicator   ['Age-standardized suicide rates (per 100 000 population)']
ValueType   ['numeric']
ParentLocationCode   ['AMR' 'EMR' 'WPR' 'SEAR' 'EUR' 'AFR']
ParentLocation   ['Americas' 'Eastern Mediterranean' 'Western Pacific' 'South-East Asia'
 'Europe' 'Africa']
Location type   ['Country']
SpatialDimValueCode   ['ATG' 'BRB' 'GRD' 'VCT' 'VEN' 'KWT' 'SYR' 'BRN' 'MDV' 'JOR' 'HND' 'PAN'
 'JAM' 'ARM' 'CYP' 'MMR' 'OMN' 'STP' 'IDN' 'TUR' 'BHS' 'PHL' 'SYC' 'PER'
 'GRC' 'LCA' 'AZE' 'BGD' 'LBN' 'QAT' 'COL' 'GHA' 'BLZ' 'TUN' 'DOM' 'DZA'
 'SAU' 'NIC' 'PNG' 'CRI' 'MAR' 'NGA' 'NER' 'CUB' 'NZL' 'GTM' 'BRA' 'CAN'
 'SSD' 'AUT' 'GBR' 'UGA' 'MLI' 'VNM' 'BGR' 'MWI' 'PRK' 'MKD' 'SEN' 'KEN'
 'HRV' 'DNK' 'IND' 'SLV' 'ISL' 'HTI' 'SLE' 'AUS' 'COM' 'PRT' 'COG' 'HUN'
 'SUR' 'LUX' 'UZB' 'ECU' 'DJI' 'EST' 'BDI' 'MDA' 'SRB' 'JPN' 'GIN' 'SWE'
 'GNB' 'COD' 'NLD' 'AGO' 'ROU' 'SGP' 'BEN' 'DEU' 'LKA' 'FJI' 'TTO' 'GAB'
 'FSM' 'TCD' 'GMB' 'MDG' 'CHL' 'NOR' 'FIN' 'KOR' 'ZWE' 'ARG' 'BIH' 'KGZ'
 '

In [7]:
drop_index = []

for row in range(df.shape[0]):
    gender = df['Dim1'][row]
    
    if gender != 'Both sexes':
        drop_index.append(row) 
    
df.drop(drop_index, inplace = True)
df.reset_index(drop=True, inplace = True)

In [8]:
suicide_df = df[['Location',
         'Period',
         'FactValueNumeric',
         'FactValueNumericLow',
         'FactValueNumericHigh']]

In [9]:
suicide_df.rename(columns = {"Period": "year"}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  suicide_df.rename(columns = {"Period": "year"}, inplace = True)


In [10]:
suicide_df.head()

Unnamed: 0,Location,year,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
0,Barbados,2019,0.31,0.22,0.42
1,Antigua and Barbuda,2019,0.32,0.22,0.45
2,Grenada,2019,0.64,0.45,0.88
3,Saint Vincent and the Grenadines,2019,1.01,0.7,1.41
4,Jordan,2019,1.98,1.29,2.9


# Earthquake Data

In [11]:
earthquake_data = pd.read_csv('Earthquakes/earthquake_data_all_years.csv')
earthquake_df = pd.DataFrame(earthquake_data)

In [12]:
earthquake_df.head()

Unnamed: 0,time,place,mag,type,geometry
0,978216704470,"50 km SW of Ashk?sham, Afghanistan",4.6,earthquake,"{'type': 'Point', 'coordinates': [71.126, 36.3..."
1,978216085570,"124 km S of Kokopo, Papua New Guinea",4.1,earthquake,"{'type': 'Point', 'coordinates': [152.303, -5...."
2,978208040890,"118 km S of Kokopo, Papua New Guinea",4.3,earthquake,"{'type': 'Point', 'coordinates': [152.377, -5...."
3,978206160040,"297 km SSE of Alo, Wallis and Futuna",4.0,earthquake,"{'type': 'Point', 'coordinates': [-177.261, -1..."
4,978205372710,"16 km S of Yonakuni, Japan",5.1,earthquake,"{'type': 'Point', 'coordinates': [122.974, 24...."


In [13]:
min(earthquake_df['time'])

946689566990

## Cleaning Data

In [14]:
earthquake_df.columns

Index(['time', 'place', 'mag', 'type', 'geometry'], dtype='object')

In [15]:
earthquake_df.dropna(subset = 'place', inplace = True)
earthquake_df.drop(columns = ['type', 'geometry'], inplace = True)
earthquake_df.reset_index(drop=True, inplace = True)

In [16]:
years = []

for row in range(earthquake_df.shape[0]):
    date = earthquake_df['time'][row]
    
    if date > 1000000000000:
        date = str(date)[:10]
    else:
        date = str(date)[:9]
    
    year = dt.date.fromtimestamp(int(date)).year
    years.append(year)

earthquake_df['year'] = years

In [17]:
earthquake_df.head()

Unnamed: 0,time,place,mag,year
0,978216704470,"50 km SW of Ashk?sham, Afghanistan",4.6,2000
1,978216085570,"124 km S of Kokopo, Papua New Guinea",4.1,2000
2,978208040890,"118 km S of Kokopo, Papua New Guinea",4.3,2000
3,978206160040,"297 km SSE of Alo, Wallis and Futuna",4.0,2000
4,978205372710,"16 km S of Yonakuni, Japan",5.1,2000


In [18]:
suicide_countries = suicide_df['Location'].unique()

In [19]:
locations = []
drop_index = []

for row in range(earthquake_df.shape[0]):
    place = earthquake_df['place'][row]
    
    if "Ridge" in place:
        drop_index.append(row)
    else:

        location = str(place).split(",")[-1]
        location = location.strip()

        try:
            region_index = location.index('region')
            location = location[0: region_index].strip()
        except: ValueError

        locations.append(location)    
    
earthquake_df.drop(drop_index, inplace = True)
earthquake_df.reset_index(drop=True, inplace = True)

In [20]:
us_states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Washington DC', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

# Specific weird locations
for x in range(len(locations)):
    location = locations[x]
    
    if "Afghanistan" in location:
        locations[x] = "Afghanistan"
    elif "Fiji" in location:
        locations[x] = "Fiji"
    elif "Ecuador" in location:
        locations[x] = "Ecuador"
    elif "New Zealand" in location:
        locations[x] = "New Zealand"
    elif "Taiwan" in location:
        locations[x] = "Taiwan"
    elif "Alaska" in location or "WA" in location:
        locations[x] = "United States of America"
    elif "Guatemala" in location:
        locations[x] = "Guatemala"
    elif "Sumatra" in location:
        locations[x] = "Sumatra"
    elif "Bay of Bengal" in location:
        locations[x] = "India"
    elif "Italy" in location:
        locations[x] = "Italy"
        
    # Changing US states to United States of America
    
    elif location in us_states:
        locations[x] = 'United States of America'

In [21]:
earthquake_df['Location'] = locations

In [22]:
drop_index = []
not_suicidal = []

for row in range(earthquake_df.shape[0]):
    location = earthquake_df['Location'][row]
    
    if not(location in suicide_countries):
            drop_index.append(row)
            not_suicidal.append(location)
            
earthquake_df.drop(drop_index, inplace = True)
earthquake_df.reset_index(drop=True, inplace = True)

In [23]:
earthquake_df.head()

Unnamed: 0,time,place,mag,year,Location
0,978216704470,"50 km SW of Ashk?sham, Afghanistan",4.6,2000,Afghanistan
1,978216085570,"124 km S of Kokopo, Papua New Guinea",4.1,2000,Papua New Guinea
2,978208040890,"118 km S of Kokopo, Papua New Guinea",4.3,2000,Papua New Guinea
3,978205372710,"16 km S of Yonakuni, Japan",5.1,2000,Japan
4,978199568420,Fiji region,3.9,2000,Fiji


In [24]:
earthquake_df.drop(columns = ['time', 'place'], inplace = True)

In [25]:
earthquake_df.head()

Unnamed: 0,mag,year,Location
0,4.6,2000,Afghanistan
1,4.1,2000,Papua New Guinea
2,4.3,2000,Papua New Guinea
3,5.1,2000,Japan
4,3.9,2000,Fiji


# Combining both

In [26]:
earthquake_suicide_df = earthquake_df.merge(right =  suicide_df, how = 'outer', on = ['Location', 'year'])
# earthquake_suicide_df.dropna()

In [27]:
earthquake_suicide_df

Unnamed: 0,mag,year,Location,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
0,4.6,2000,Afghanistan,7.71,4.44,12.48
1,4.0,2000,Afghanistan,7.71,4.44,12.48
2,4.2,2000,Afghanistan,7.71,4.44,12.48
3,4.2,2000,Afghanistan,7.71,4.44,12.48
4,3.6,2000,Afghanistan,7.71,4.44,12.48
...,...,...,...,...,...,...
263649,,2000,Liberia,8.79,4.82,15.48
263650,,2000,Niger,9.51,5.15,16.28
263651,,2000,Guinea,9.73,5.57,15.96
263652,,2000,Ghana,9.75,6.46,14.30


In [29]:
earthquake_suicide_df[earthquake_suicide_df["Location"] == 'Japan']

Unnamed: 0,mag,year,Location,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
1420,5.1,2000,Japan,18.08,17.45,18.61
1421,3.8,2000,Japan,18.08,17.45,18.61
1422,4.6,2000,Japan,18.08,17.45,18.61
1423,4.0,2000,Japan,18.08,17.45,18.61
1424,3.9,2000,Japan,18.08,17.45,18.61
...,...,...,...,...,...,...
255528,4.3,2022,Japan,,,
255529,4.4,2022,Japan,,,
255530,4.8,2022,Japan,,,
255531,4.6,2022,Japan,,,


In [31]:
earthquake_suicide_df.dropna(inplace = True)