# Suicide Rate Analysis

## Introduction

Close to 800 000 people die due to suicide every year, which is one person every 40 seconds. Suicide is a global phenomenon and occurs throughout the lifespan. Effective and evidence-based interventions can be implemented at population, sub-population and individual levels to prevent suicide and suicide attempts. There are indications that for each adult who died by suicide there may have been more than 20 others attempting suicide.

Suicide is a complex issue and therefore suicide prevention efforts require coordination and collaboration among multiple sectors of society, including the health sector and other sectors such as education, labour, agriculture, business, justice, law, defense, politics, and the media. These efforts must be comprehensive and integrated as no single approach alone can make an impact on an issue as complex as suicide.

## Objective

* This project aims to assess mental health in general by analyzing global suicide-rate data
* Predicting sentiment and Suicide intentions in tweets from Twitter.com

## Install & Import Libraries

In [None]:
pip install chart_studio

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas_profiling import ProfileReport
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
sns.set_palette("pastel")
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

from chart_studio import plotly
import chart_studio.plotly as py
import plotly.graph_objs as go 
import plotly.express as px 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio.tools as ctls
%matplotlib inline

# Log in to plotly (Chart Studio).
# Credentials are read from the environment so real secrets never have to be
# saved inside the notebook; the masked placeholders remain the fallback.
import os

username = os.environ.get("PLOTLY_USERNAME", "***")
api_key = os.environ.get("PLOTLY_API_KEY", "***")

ctls.set_credentials_file(username = username, api_key = api_key)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Dataset Preview


In [None]:
# Load the age-standardized suicide-rate table and preview its first rows.
age_csv_path = "../input/mental-health-and-suicide-rates/Age-standardized suicide rates.csv"
df_age = pd.read_csv(age_csv_path)
df_age.head()

In [None]:
# Profile the age-standardized table: render it as widgets, then export it to HTML.
age_title = "Age Standardized Suicide Rate"
age_report = ProfileReport(df_age, title=age_title)
age_report.to_widgets()
age_report.to_file(age_title + ".html")

In [None]:
# Load the 1985-2016 suicide-rates overview table and preview its first rows.
master_csv_path = "../input/suicide-rates-overview-1985-to-2016/master.csv"
df_sr = pd.read_csv(master_csv_path)
df_sr.head()

In [None]:
# Profile the overview table: render it as widgets, then export it to HTML.
sr_title = "Suicide Rate"
sr_report = ProfileReport(df_sr, title=sr_title)
sr_report.to_widgets()
sr_report.to_file(sr_title + ".html")

In [None]:
# Second, independent load of the same master.csv used for df_sr.
overview_csv_path = "../input/suicide-rates-overview-1985-to-2016/master.csv"
df_sro = pd.read_csv(overview_csv_path)
df_sro.head()

In [None]:
# Profile df_sro: render it as widgets, then export it to HTML.
sro_report = ProfileReport(title="Suicide Rate overview", df=df_sro) if False else ProfileReport(df_sro, title="Suicide Rate overview")
sro_report.to_widgets()
sro_html_name = "Suicide Rate Overview.html"
sro_report.to_file(sro_html_name)

## Exploratory Data Analysis

In [None]:
# Strip the leading blank from the three sex labels, wherever they occur in the frame.
sex_label_fixes = {
    " Both sexes": "Both sexes",
    " Male": "Male",
    " Female": "Female",
}
df_age.replace(sex_label_fixes, inplace=True)

In [None]:
# "Both sexes" rows only.
# Take an explicit copy: a "Sum" column is added to this frame later, and
# assigning into a filtered slice of df_age triggers chained-assignment warnings.
df_age_both = df_age[df_age["Sex"] == "Both sexes"].copy()
df_age_both.head()

In [None]:
# Male rows only.
# Take an explicit copy: a "Sum" column is added to this frame later, and
# assigning into a filtered slice of df_age triggers chained-assignment warnings.
df_age_male = df_age[df_age["Sex"] == "Male"].copy()
df_age_male.head()

In [None]:
# Female rows only.
# Take an explicit copy: a "Sum" column is added to this frame later, and
# assigning into a filtered slice of df_age triggers chained-assignment warnings.
df_age_female = df_age[df_age["Sex"] == "Female"].copy()
df_age_female.head()

In [None]:
# Add up the "Both sexes" rates of all countries per survey year and plot the trend.
survey_years = ["2000", "2010", "2015", "2016"]
both_sexes_by_year = df_age_both[survey_years]

# NOTE: despite its name, global_mean holds column sums, not means.
global_mean = both_sexes_by_year.sum()

plt.figure(figsize=(10, 5))
plt.title("Global Sum Suicide Rate by Year")
global_mean.plot(marker="o")

In [None]:
# Per-year totals over all countries, one line for each sex.
plt.figure(figsize=(10, 5))
plt.title("Global Sum of Suicide Rate by  Gender")
for sex_frame, point_marker in ((df_age_male, "o"), (df_age_female, "x")):
    yearly_total = sex_frame[["2000", "2010", "2015", "2016"]].sum()
    yearly_total.plot(marker=point_marker)
plt.legend(["Male", "Female"])

In [None]:
# Pull the Nigeria rows out of the female and male frames.
target_country = "Nigeria"
nigf = df_age_female.loc[df_age_female["Country"] == target_country]
nigm = df_age_male.loc[df_age_male["Country"] == target_country]

In [None]:
# Nigeria: per-year totals, male line first, then female.
plt.figure(figsize=(10, 5))
plt.title("Nigeria Sum of Suicide Rate by Year")
for nigeria_frame, point_marker in ((nigm, "o"), (nigf, "x")):
    nigeria_frame[["2000", "2010", "2015", "2016"]].sum().plot(marker=point_marker)
plt.legend(["Male", "Female"])

In [None]:
df_age.head()

In [None]:
# Row-wise total of the four survey years.
# Summing only the year columns keeps the string columns (Country, Sex) out of
# the reduction and stops an existing "Sum" column from being added into
# itself when the cell is re-run.
df_age["Sum"] = df_age[["2000", "2010", "2015", "2016"]].sum(axis = 1)
df_age.head()

In [None]:
# Row-wise total of the four survey years for the male rows.
# Only the year columns are summed (string columns cannot be summed), and
# assign() returns a new frame instead of writing into a slice of df_age.
df_age_male = df_age_male.assign(
    Sum=df_age_male[["2000", "2010", "2015", "2016"]].sum(axis = 1)
)
df_age_male.head()

In [None]:
df_age_male_sort = df_age_male.sort_values("Sum", ascending = False)
df_age_male_sort.head()

In [None]:
# Row-wise total of the four survey years for the female rows.
# Only the year columns are summed (string columns cannot be summed), and
# assign() returns a new frame instead of writing into a slice of df_age.
df_age_female = df_age_female.assign(
    Sum=df_age_female[["2000", "2010", "2015", "2016"]].sum(axis = 1)
)
df_age_female.head()

In [None]:
df_age_female_sort = df_age_female.sort_values("Sum", ascending = False)
df_age_female_sort.head()

In [None]:
# Row-wise total of the four survey years for the "Both sexes" rows.
# Only the year columns are summed (string columns cannot be summed), and
# assign() returns a new frame instead of writing into a slice of df_age.
df_age_both = df_age_both.assign(
    Sum=df_age_both[["2000", "2010", "2015", "2016"]].sum(axis = 1)
)
df_age_both.head()

In [None]:
df_age_both_sort = df_age_both.sort_values("Sum", ascending = False)

In [None]:
df_age_both_sort.head()

In [None]:
# Names of the ten countries at the top of the male ranking.
male_lst = df_age_male_sort["Country"].iloc[:10].tolist()
male_lst

In [None]:
# Plot the summed suicide rate for the ten countries with the highest male
# total, one line per sex group.
plt.figure(figsize=(20, 7))
plt.title("Top 10 country with highest Sum Suicide Rate")
sns.set_style("white")

# All three lines must describe the SAME countries. Taking each frame's own
# top-10 slice and labelling it with male_lst pairs values with the wrong
# country names, so select the rows by country instead.
data = df_age_both_sort[df_age_both_sort["Country"].isin(male_lst)]
data2 = df_age_male_sort[df_age_male_sort["Country"].isin(male_lst)]
data3 = df_age_female_sort[df_age_female_sort["Country"].isin(male_lst)]

# Explicit labels tie each legend entry to its own line, independent of draw order.
sns.lineplot(x="Country", y="Sum", data=data, marker="o", label="Both")
sns.lineplot(x="Country", y="Sum", data=data2, marker="o", label="Male")
sns.lineplot(x="Country", y="Sum", data=data3, marker="o", label="Female")
plt.legend()

plt.show()

In [None]:
df_count = df_age[df_age["Sex"] != "Both sexes"]
df_count.head()

In [None]:
df_age_male_sort.head()

In [None]:
df_age_female_sort.head()

In [None]:
df_age_both_sort.head()

In [None]:
data=df_count.sort_values("Sum", ascending = False)
data.head()

In [None]:
# Overlay female totals on male totals for the 50 countries with the highest
# male summed rate.
plt.figure(figsize=(20, 15))
plt.title("World Suicide Rate")

male_top = df_age_male_sort[:50]
country_order = list(male_top["Country"].unique())

# Use the same countries for the female bars. The female top-50 is a
# different set of countries, so overlaying it would draw female bars under
# the wrong country labels.
female_same = df_age_female_sort[df_age_female_sort["Country"].isin(country_order)]

# `order` pins both layers to identical bar positions; the same estimator and
# no error bars are used for both so the layers are directly comparable.
sns.barplot(y="Country", x="Sum", data=male_top, order=country_order,
            color = "lightblue", estimator=sum, ci=None)
sns.barplot(y="Country", x="Sum", data=female_same, order=country_order,
            color = "darkblue", estimator=sum, ci=None)

top_bar = mpatches.Patch(color='darkblue', label='Female')
bottom_bar = mpatches.Patch(color='lightblue', label='Male')
plt.legend(handles=[top_bar, bottom_bar])



Comparison of top 10 countries with highest male suicide rate

In [None]:
df_sr = pd.read_csv("../input/suicide-rates-overview-1985-to-2016/master.csv")
df_sr.head()

In [None]:
# Total number of suicides for every (year, age group) pair.
by_year_and_age = df_sr.groupby(["year", "age"])
df_srs = by_year_and_age["suicides_no"].sum()
df_srs.head()

In [None]:
# Flatten the grouped result into columns and leave out the 2016 rows.
df_srs = df_srs.reset_index()
keep_rows = df_srs["year"] != 2016
df_srs = df_srs.loc[keep_rows]
df_srs.tail()

In [None]:
# One line per age group: suicides_no against year.
plt.figure(figsize=(20,10))
plt.title("Suicide Rate by Age Group")

age_line_options = dict(hue="age", style="age", linewidth=2.5, markers=True, dashes=False)
sns.lineplot(x="year", y=df_srs.suicides_no, data=df_srs, **age_line_options)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Total suicides and population for every (year, generation) pair.
# Several columns must be selected with a list (double brackets); the bare
# tuple form ["a", "b"] is rejected by current pandas.
df_g = df_sr.groupby(["year", "generation"])[["suicides_no", "population"]].sum()
df_gen = df_g.copy().reset_index()
df_gen.head()

In [None]:
# One line per generation: suicides as a percentage of population, by year.
plt.figure(figsize=(15,6))
plt.title("Suicide Rate by Generation")

suicides_pct_of_population = df_gen.suicides_no*100/df_gen.population
sns.lineplot(
    data=df_gen,
    x="year",
    y=suicides_pct_of_population,
    hue="generation",
    style="generation",
    linewidth=2.5,
    markers=True,
    dashes=False,
)
plt.xticks(rotation = 90)
plt.show()

## Simple Linear Regression

In [None]:
df_age

In [None]:
# Regression input: df_age ordered by descending "Sum", with a fresh 0..n-1 index.
df_reg = df_age.sort_values(by="Sum", ascending=False).reset_index(drop=True)
df_reg.head()

In [None]:
# One-hot encode "Sex" (first category dropped) and append the indicator columns.
sex = pd.get_dummies(df_reg["Sex"], drop_first=True)
frames_to_join = [df_reg, sex]
df_reg = pd.concat(frames_to_join, axis=1)
df_reg.head()

In [None]:
# Keep only the "Male" indicator.
# NOTE(review): "Both sexes" rows are still in df_reg, so they presumably end up
# with Male == 0 alongside the female rows — confirm that this is intended.
df_reg.drop(columns=["Sex", "Female"], inplace=True)

In [None]:
# Reshape to long format: one row per (Country, Male, Year) with the rate in "SR".
df_reg_melt = pd.melt(df_reg, id_vars = ["Country", "Male", "Sum"], var_name = "Year", value_name = "SR")
df_reg_melt.drop(columns = "Sum", inplace = True)

# melt() takes the year from the column headers, so "Year" arrives as text.
# It is used as a numeric regression feature afterwards, so make it an integer.
df_reg_melt["Year"] = df_reg_melt["Year"].astype(int)

In [None]:
df_reg_melt

In [None]:
# Separate the feature columns from the target column.
feature_names = ['Male', 'Year']
x = df_reg_melt[feature_names]
y = df_reg_melt['SR']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
model_lr = LinearRegression()

In [None]:
model_lr.fit(x_train, y_train)
pred = model_lr.predict(x_test)

In [None]:
# Persist the fitted regression model to the file "Linear_reg" with pickle.
with open('Linear_reg', 'wb') as model_file:
    pickle.dump(model_lr, model_file)

In [None]:
model_lr.coef_

In [None]:
# Error metrics of the predictions against the held-out targets.
MAE = mean_absolute_error(y_test, pred)
MSE = mean_squared_error(y_test, pred)
RMSE = np.sqrt(MSE)  # root of the mean squared error computed just above
r2 = r2_score(y_test, pred)

In [None]:
# Print each metric on its own line as "<name>: <value>".
for metric_label, metric_value in (("MAE:", MAE), ("MSE:", MSE), ("RMSE:", RMSE), ("r2:", r2)):
    print(metric_label, metric_value)

In [None]:
# Table of fitted coefficients, one row per feature column of x.
coeff = pd.DataFrame(data=model_lr.coef_, index=x.columns, columns=['Coefficient'])
coeff

The r2 score isn't high enough to draw any solid conclusions, but the results indicate the following:

Being male increases suicide rate by 8.08 points

For every year we go back from this dataset (2016,2015,2010,2000) the suicide rate goes up by 0.154

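The formula for estimating the suicide rate from the fitted model can therefore be expressed as:

### y = m1·x1 + m2·x2 + c
### SR = 8.08 · Male − 0.154 · Year + c

where `Male` is 1 for males and 0 otherwise, `Year` is the calendar year, and `c` is the model intercept (`model_lr.intercept_`).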

considering all things being equal!

## Worldwide Suicide Map 2016

In [None]:
# 2016 "Both sexes" rates in long format (Country, Sex, Year, sr).
# Select the needed columns explicitly: df_age_both also carries a "Sum"
# column by this point, and merely dropping the other years would let melt()
# emit an extra "Sum" row per country that then leaks into the map values.
df_2016 = df_age_both[['Country', 'Sex', '2016']]
df_2016 = pd.melt(df_2016, id_vars=['Country', 'Sex'], var_name='Year', value_name='sr')


In [None]:
# Choropleth trace: one entry per country name, coloured by the "sr" value.
cloro = {
    'type': 'choropleth',
    'locations': df_2016['Country'],
    'locationmode': 'country names',
    'z': df_2016['sr'],
    'text': df_2016['Country'],
    'colorscale': 'blues_r',
    'reversescale': True,
    'colorbar': {'title': 'suicide rate per 100.000 humans'},
}

# Map layout: title plus a framed Miller projection with coastlines.
geo_options = {
    'showframe': True,
    'showcoastlines': True,
    'projection': {'type': 'miller'},
}
layout = {'title': 'Worldwide Suicide rate in 2016', 'geo': geo_options}

choromap3 = go.Figure(data=[cloro], layout=layout)

mapplot = iplot(choromap3)
mapplot

#py.plot(mapplot, filename = 'Worldwide Suicide rate in 2016', auto_open = False)

In [None]:
choromap3.write_html("suicidemap.html")

In [None]:
df_sro = pd.read_csv("../input/suicide-rates-overview-1985-to-2016/master.csv")
df_sro.head()

In [None]:
# Country / sex / suicide-count columns only.
sex_columns = ["country", "sex", "suicides_no"]
df_sro_sex = pd.DataFrame(df_sro[sex_columns], columns=sex_columns)
df_sro_sex.head()

In [None]:
df_sro_m = df_sro_sex[df_sro_sex["sex"] == "male"]
df_sro_m.head()

In [None]:
df_sro_f = df_sro_sex[df_sro_sex["sex"] == "female"]
df_sro_f.head()

In [None]:
msum = df_sro_m["suicides_no"].sum()
msum

In [None]:
fsum = df_sro_f["suicides_no"].sum()
fsum

In [None]:
# Pie chart of total suicides by sex, 1985-2016.

data = [msum, fsum]
labels = ["Male", "Female"]
colors = sns.color_palette('pastel')[0:5]

plt.figure(figsize=(10, 7))
# The totals come from the unfiltered 1985-to-2016 table, so the title states
# that range (it previously read 1985-2015).
plt.title("World Suicide by Gender (1985-2016)", fontsize=14)
plt.pie(data, labels = labels, colors = colors, explode=[0.01]*2, autopct='%.1f%%')
plt.show()

In [None]:
df_sro.head()

In [None]:
df_sro_age = df_sro[["age", "suicides_no"]]
df_sro_age.head()

In [None]:
# One sub-frame per age group, in the order the groups first appear.
lst = [
    df_sro_age[df_sro_age["age"] == age_group]
    for age_group in df_sro_age["age"].unique()
]
# Print once, after the list is complete. Printing inside the loop dumped the
# growing list again on every iteration.
print(lst)

In [None]:
# Pie chart of total suicides by age group, 1985-2016.

# Derive totals and labels from the same grouped result so a wedge can never
# be paired with the wrong age label. sort=False keeps the groups in
# first-appearance order.
age_totals = df_sro_age.groupby("age", sort=False)["suicides_no"].sum()
data = age_totals.tolist()
labels = age_totals.index.tolist()
colors = sns.color_palette('pastel')[0:len(data)]

plt.figure(figsize=(10, 7))
# The totals come from the unfiltered 1985-to-2016 table, so the title states
# that range (it previously read 1985-2015).
plt.title("World Suicide by Age (1985-2016)", fontsize=14)
plt.pie(data, labels = labels, colors = colors, explode=[0.01]*len(data), autopct='%.1f%%')
plt.show()

In [None]:
df_sro.head()

In [None]:
# Year / age / rate columns, with the rate column shortened to "sr/100".
# rename() without inplace returns a new frame, which avoids mutating a
# column slice of df_sro (chained-assignment warning).
df_sro_year = df_sro[["year", "age", "suicides/100k pop"]].rename(
    columns = {"suicides/100k pop": "sr/100"}
)
df_sro_year

In [None]:
df_sro_year["year"].unique()

In [None]:
# Sum of the "sr/100" values for each year (2 decimals), in first-appearance order.
collect = [
    round(df_sro_year.loc[df_sro_year["year"] == yr, "sr/100"].sum(), 2)
    for yr in df_sro_year["year"].unique()
]
print(collect)

In [None]:
len(collect)

In [None]:
# Two-row frame: row 0 holds the distinct years, row 1 the matching sums.
distinct_years = df_sro_year["year"].unique()
year = pd.DataFrame([distinct_years, collect])
year.head()

In [None]:
# Turn the two rows into two named columns.
year = year.T.rename(columns={0: "year", 1: "sr/100k"})

In [None]:
# Line chart of the yearly value, limited to the first 31 years in ascending order.
df = year.sort_values("year").iloc[:31]
fig = px.line(
    df,
    x="year",
    y="sr/100k",
    title='Suicide Rate per 100k pop',
    markers=True,
)

fig.show()

#py.plot(fig, filename = 'Suicide Rate per 100k pop', auto_open = False)

In [None]:
df_sro.head()

In [None]:
# Year / sex / rate columns, with the rate column shortened to "sr/100".
# rename() without inplace returns a new frame, which avoids mutating a
# column slice of df_sro (chained-assignment warning).
df_sro_gend = df_sro[["year", "sex", "suicides/100k pop"]].rename(
    columns = {"suicides/100k pop": "sr/100"}
)
df_sro_gend.head()

In [None]:
df_sro_m = df_sro_gend[df_sro_gend["sex"] == "male"]
df_sro_m.head()

In [None]:
# Sum of the male "sr/100" values for each year (2 decimals), in first-appearance order.
collectm = [
    round(df_sro_m.loc[df_sro_m["year"] == yr, "sr/100"].sum(), 2)
    for yr in df_sro_m["year"].unique()
]
print(collectm)

In [None]:
# Male yearly sums as a two-column frame ("year", "sr/100k").
# The year axis comes from df_sro_m, the same frame collectm was computed
# from, so years and values cannot drift apart (it was taken from df_sro_year).
ma = pd.DataFrame([df_sro_m["year"].unique(), collectm])
ma = ma.T
ma.rename(columns = {0 : "year", 1 : "sr/100k"}, inplace = True)

In [None]:
ma.head()

In [None]:
# Line chart of the male yearly value, first 31 years in ascending order.
df = ma.sort_values("year").iloc[:31]
fig = px.line(
    df,
    x="year",
    y="sr/100k",
    title='Suicide Rate per 100k pop Male',
    markers=True,
)

fig.show()

#py.plot(fig, filename = 'Suicide Rate per 100k pop Male', auto_open = False)

In [None]:
df_sro_f = df_sro_gend[df_sro_gend["sex"] == "female"]
df_sro_f.head()

In [None]:
# Sum of the female "sr/100" values for each year (2 decimals), in first-appearance order.
collectf = [
    round(df_sro_f.loc[df_sro_f["year"] == yr, "sr/100"].sum(), 2)
    for yr in df_sro_f["year"].unique()
]
print(collectf)

In [None]:
# Female yearly sums as a two-column frame ("year", "sr/100k").
fe = pd.DataFrame([df_sro_f["year"].unique(), collectf])
fe = fe.T
fe.rename(columns = {0 : "year", 1 : "sr/100k"}, inplace = True)
# head must be called; the bare attribute only shows the bound method.
fe.head()

In [None]:
# Line chart of the female yearly value, first 31 years in ascending order.
df = fe.sort_values("year").iloc[:31]
fig = px.line(
    df,
    x="year",
    y="sr/100k",
    title='Suicide Rate per 100k pop Female',
    markers=True,
)

fig.show()

#py.plot(fig, filename = 'Suicide Rate per 100k pop Female', auto_open = False)

In [None]:
# Male and female yearly values on one chart (first 31 years, ascending).
ma = ma.sort_values("year")[:31]
fe = fe.sort_values("year")[:31]
md = ma["sr/100k"]
fd = fe["sr/100k"]

# Create traces. Each trace uses the year column of its own frame, so the
# female values are never plotted against the male year axis.
fig = go.Figure()
fig.add_trace(go.Scatter(x=ma["year"], y=md,
                    mode='lines+markers',
                    name='male'))
fig.add_trace(go.Scatter(x=fe["year"], y=fd,
                    mode='lines+markers',
                    name='female'))
# The plotted column is the per-100k rate; the title previously said "per 100 pop".
fig.update_layout(title='Suicide rate per 100k pop by Gender')

fig.show()

#py.plot(fig, filename = 'Suicide Rate per 100k pop by Gender', auto_open = False)

In [None]:
df_sro.head()

In [None]:
df_sro.country.unique()

In [None]:
# NOTE(review): the variable is named `nig`, yet the filter selects Iceland — confirm intent.
is_iceland = df_sro["country"] == "Iceland"
nig = df_sro[is_iceland]
nig

In [None]:
# Load the WHO crude suicide-rate table and display it.
crude_csv_path = "../input/who-worldhealth-statistics-2020-complete/crudeSuicideRates.csv"
df_cru = pd.read_csv(crude_csv_path)
df_cru

In [None]:
# Rows for the individual sexes (everything except "Both sexes").
# Take an explicit copy: columns are dropped from and added to this frame
# afterwards, which would otherwise operate on a slice of df_cru.
both = df_cru[df_cru["Dim1"]  != "Both sexes"].copy()
both.head()

In [None]:
# Remove the "Location" column.
# Rebinding to the returned frame avoids an in-place drop on a filtered
# slice of df_cru (chained-assignment warning).
both = both.drop(columns = ["Location"])
both.head()

In [None]:
# One-hot encode "Dim1" (first category dropped) and append the indicator column(s).
gend = pd.get_dummies(both["Dim1"], drop_first=True)
pieces = [both, gend]
both = pd.concat(pieces, axis=1)
both.head()

In [None]:
# Label-based slice: keep the rows from index label 3 onwards, all columns.
both = both.loc[3:, :]
both.head()

In [None]:
both.drop("Dim1", axis = 1, inplace = True)

In [None]:
both["Male"] = both["Male"].astype(float)

In [None]:
both.head()

In [None]:
# Feature matrix for the fitted model. The model was trained on columns named
# "Male" and "Year", so "Period" is renamed to match; scikit-learn validates
# feature names at predict time when it was fitted on a DataFrame.
both_x = both[["Male", "Period"]].rename(columns = {"Period": "Year"})

In [None]:
pred2 = model_lr.predict(both_x)

In [None]:
pred2

In [None]:
both["Prediction"] = pred2
both

In [None]:
df_age_both.head()

In [None]:
# Per-period totals of the numeric columns.
# numeric_only=True keeps text columns such as "Indicator" out of the
# aggregation; summing strings would concatenate them.
both = both.groupby("Period").sum(numeric_only = True)
both

In [None]:
both_df = both.copy()
both_df = both_df.reset_index()


In [None]:
both_df.head()

In [None]:
# Drop the "Male" column and give the remaining columns readable names.
readable_names = {"Period": "Year", "First Tooltip": "Suicide Rate"}
both_df = both_df.drop(columns="Male").rename(columns=readable_names)
both_df.head()

In [None]:
# Rows of the crude-rate table whose Location is Nigeria.
is_nigeria = df_cru["Location"] == "Nigeria"
nig = df_cru.loc[is_nigeria]
nig

In [None]:
nig = nig[["Location", "Period", "Dim1", "First Tooltip"]]
nig.head()

In [None]:
# Give the Nigeria columns lower-case, descriptive names.
# Rebinding to the returned frame avoids an in-place rename on a column
# slice (chained-assignment warning).
nig = nig.rename(columns = {"Location": "country", "Period" : "year", "Dim1": "sex", "First Tooltip" : "suicide rate"})
nig.head()

In [None]:
nig = nig[nig.sex != "Both sexes"]
nig.head()

In [None]:
# Nigeria line chart, one line per sex; the first two rows of nig are skipped.
df = nig.iloc[2:]
fig = px.line(
    df,
    x="year",
    y="suicide rate",
    color="sex",
    title='Nigeria Suicide Rate per 100k pop',
    markers=True,
)

fig.show()

py.plot(fig, filename = 'Nigeria Suicide Rate per 100k pop', auto_open = False)

In [None]:
# Remove the "Indicator" column from df_cru in place.
df_cru.drop(columns="Indicator", inplace=True)

In [None]:
# "Both sexes" rows of the crude-rate table.
# Take an explicit copy: a column is dropped from this frame afterwards,
# which would otherwise operate on a slice of df_cru.
df_cru_m = df_cru[df_cru["Dim1"] == "Both sexes"].copy()
df_cru_m.head()

In [None]:
# Remove the "Dim1" column. Rebinding to the returned frame avoids an
# in-place drop on a filtered slice of df_cru (chained-assignment warning).
df_cru_m = df_cru_m.drop(columns = "Dim1")

In [None]:
df_cru_m = df_cru_m[df_cru_m.Period != 2016]

In [None]:
df_cru_m.head()

In [None]:
df_cru_m.sort_values("Period")

In [None]:
df_cru_m.Location.unique()

In [None]:
count_lst = ["Afghanistan", "Nigeria", "United States of America", "Liberia", "Canada", "Lesotho", "Zambia", "Ghana", "Russian Federation"]

In [None]:
# Boolean mask of the rows whose Location is in count_lst, then the matching rows.
df_c = df_cru_m['Location'].isin(count_lst)
df_country = df_cru_m.loc[df_c]
df_country

In [None]:
# One line per selected country: "First Tooltip" against Period.
fig2 = px.line(
    df_country,
    x="Period",
    y='First Tooltip',
    color="Location",
)
fig2



In [None]:
py.plot(fig2, filename = 'Suicide Rate per 100k pop for selected countries', auto_open = False)