In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import chart_studio.plotly as py
%matplotlib inline
from plotly.offline import download_plotlyjs, init_notebook_mode,plot,iplot
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
init_notebook_mode(connected=True)

In [2]:
suicide_data = pd.read_csv('master.csv')

I first of all want to divide the data into developed and under developed countries so as to see the trend in suicide rate in both divisions

In [3]:
suicide_data.loc[suicide_data['gdp_per_capita ($)']>= 5000, 'Development'] = 'Developed'
suicide_data.loc[suicide_data['gdp_per_capita ($)']< 5000, 'Development'] = 'Under_developed/developing'

In [4]:
suicide_data.head(5)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation,Development
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X,Under_developed/developing
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent,Under_developed/developing
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X,Under_developed/developing
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation,Under_developed/developing
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers,Under_developed/developing


In [5]:
suicide_data.rename(columns = {'suicides/100k pop':'suicides_per_pop'}, inplace = True)

In [6]:
suicide_datadev = suicide_data.groupby('Development').suicides_per_pop.sum()

Now i want to visualize the data using suicide per population cause according to bros Haggai, na wetin we suppose use

In [7]:
px.bar(suicide_datadev)

From the chart above we can see that the suicide rate in developed countries Doubles that of under developed countries
The question is what then is the cause? Having a high gdp per capita and high HDI(Human Development Index) as a consequence ideally shows that individuals living in these countries are in better living conditions than others with lower gdp per capita.
From the Above chart, I believe there is a sort of correlation between gdp and suicide rate. As opposed to what one would generally expect, this relationship is negative. Now lets see the countries with the highest suicide per population

In [8]:
number_ofsuicides_perCountry = suicide_data.groupby('country').suicides_per_pop.sum()

Basically what i want to do is to find the country with the highest suicide per population. I am doing it manually because i do not know the code to use to check it.

In [9]:
number_ofsuicides_perCountrysort = number_ofsuicides_perCountry.sort_values()

In [10]:
number_ofsuicides_perCountrysort.head(1)

country
Dominica    0.0
Name: suicides_per_pop, dtype: float64

In [11]:
number_ofsuicides_perCountrysort.tail(10)

country
Belgium                7900.50
Finland                7924.11
Japan                  8025.23
Ukraine                8931.66
Austria                9076.23
Republic of Korea      9350.45
Kazakhstan             9519.52
Hungary               10156.07
Lithuania             10588.88
Russian Federation    11305.13
Name: suicides_per_pop, dtype: float64

Using suicide per population, it turns out US is not the second when it comes to suicide. But Russia remains first though.
Lets now see if the suicide rate in the US has reduced or worsened

In [12]:
United_States_data = suicide_data[suicide_data.country == 'United States']

In [13]:
United_States_data_1 = United_States_data.copy() 

In [14]:
Suicide_by_yearUS = suicide_data.groupby('year').suicides_per_pop.sum() 

In [15]:
px.bar(Suicide_by_yearUS)

From the chart above, as opposed to what i assumed and initially postulated, and following the advice of Haggai, it turns out the suicide rate in the US has greatly reduced. But let me do a pie chart for the other metrics(age and sex)

In [16]:
px.pie(United_States_data, values='suicides_per_pop', names='age', title='suicide rate between ages')

That means suicide rate is higher among older adults than younger adults in the US and i dare say, in the world.

In [17]:
px.pie(United_States_data, values='suicides_per_pop', names='sex', title='suicide rate amongst genders')


Wow!! 83.1% is a huge number.


Now let us (or let me) analyze the 10 countries with the highest suicide per population

In [18]:
px.line(United_States_data,x=United_States_data.year,y=United_States_data.suicides_per_pop)

From the line plot above, we can see that the suicides per population has actually reduced in the US

I am creating a new dataframe for the countries with the highest number of suicides per population

In [19]:
top_10 = ['Belgium','Finland','Japan','Ukraine','Austria','Republic of Korea','Kazakhstan','Hungary','Lithuania','Russian Federation']

In [20]:
top_10SuicideCountries = suicide_data[suicide_data.country.isin(top_10)]

Now a plot of the suicide rate with respect to age, then sex followed by year.

In [21]:
px.bar(top_10SuicideCountries,x =top_10SuicideCountries.country, y = top_10SuicideCountries.suicides_per_pop,color=top_10SuicideCountries.age, barmode = 'group')

In [22]:
px.bar(top_10SuicideCountries,x =top_10SuicideCountries.country, y = top_10SuicideCountries.suicides_per_pop,color=top_10SuicideCountries.sex, barmode = 'group')# by sex

In [24]:
by_year = top_10SuicideCountries.groupby('year').suicides_per_pop.sum()

In [25]:
px.bar(by_year)

In [26]:
px.bar(top_10SuicideCountries,x =top_10SuicideCountries.country, y = top_10SuicideCountries.suicides_per_pop, barmode = 'group')

In [27]:
px.pie(top_10SuicideCountries, values='suicides_per_pop', names='country', title='suicide rate amongst top 10 countries')


### FINAL ANALYSIS
I tried analyzing the suicide rate for under developed and developed countries. As away of distinguishing both of them, i used their GDP per capita. Those with GDP per capita greater than $4999 were developed and those lesser were under developed. Contrary to expections, countries with low GDP and possibly low life expetancy rate tend to commit suicides less than countries with higher GDP. Further analysis showed that although the suicide rate has increased (as a result of the population), the suicide per population has dropped down drastically in the United States. That means contrary to what I initially thought, the world is better now than it was a couple of decades ago.
As expected, the number suicides were mostly adult men (estimated to be 83 percent in the US and somewhat in that range in other countries) which reinstates the fact that life as a whole is difficult for men.
Russia with a very large population has the highest number of suicides per population with a total of over 11000. Analysis was done on the ten countries with the highest suicide rate and a common occurence was that people between the ages of 35 upwards were more likely to kill themselves. There are many reasons to this. Could be middle age crisis, giving that they no longer enjoy life or maybe there's nothing else to live for or they may have been burdened with responsibilities they couldn't handle(the most likely cause).
In conclusion, the rate of suicides in men needs to be looked at as it is disproportionately larger than that of women. Mens mental health should be taken into cognizance and as much as we need to protect women and their rights, we also need to protect men. Same should be done for older adults. 


## PREDICTION
### Decision Tree Regression
I am now going to attempt to predict the suicide number in the United States. The coulmns I am going to be using are 'age','gender','population' and maybe GDP per capita/GDP per year, but I am going to check if there are any null values in both of them. If there are, I'm just going to use only 'population','sex' and 'age' with suicide number as my target value.

In [28]:
United_States_data.isnull().sum()

country                 0
year                    0
sex                     0
age                     0
suicides_no             0
population              0
suicides_per_pop        0
country-year            0
HDI for year          252
 gdp_for_year ($)       0
gdp_per_capita ($)      0
generation              0
Development             0
dtype: int64

Great!!!! Since there are no null values present there I am going to include it in the model building. Now let me removing the unnecessary columns.

In [29]:
columns = ['country', 'year','suicides_per_pop','country-year', 'HDI for year','generation','Development']
United_States_DATA = United_States_data.drop(columns = columns)

In [30]:
United_States_DATA.head(1)

Unnamed: 0,sex,age,suicides_no,population,gdp_for_year ($),gdp_per_capita ($)
26848,male,75+ years,2177,4064000,4346734000000,19693


So my data looks alright with suicides_no as my target value. Now let me transform the age and the sex column to numbers so I can use decision tree regression on them

In [31]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [32]:
label = le.fit_transform(United_States_DATA['age'])
label_1 = le.fit_transform(United_States_DATA['sex'])

In [33]:
label.shape

(372,)

In [34]:
United_States_DATA.drop(['age','sex'], axis = 1, inplace = True)
United_States_DATA['age'] = label
United_States_DATA['sex'] = label_1
United_States_DATA

Unnamed: 0,suicides_no,population,gdp_for_year ($),gdp_per_capita ($),age,sex
26848,2177,4064000,4346734000000,19693,5,1
26849,5302,17971000,4346734000000,19693,4,1
26850,5134,20986000,4346734000000,19693,1,1
26851,6053,26589000,4346734000000,19693,2,1
26852,4267,19962000,4346734000000,19693,0,1
...,...,...,...,...,...,...
27215,1444,21555712,18120714000000,60387,1,0
27216,1132,21633813,18120714000000,60387,0,0
27217,540,11778666,18120714000000,60387,5,0
27218,255,21273987,18120714000000,60387,3,1


Now to divide the data into training data and out of sample data

In [35]:
length = len(United_States_DATA)
main_data = length*0.8

In [36]:
print ('The total length of the data is ',length)
print ('The length of the main data is ',main_data)
print ('The number of observations for the main data is ', round(main_data))
print ('The number of observations for the out of sample data is ', (round(length-main_data)))

The total length of the data is  372
The length of the main data is  297.6
The number of observations for the main data is  298
The number of observations for the out of sample data is  74


In [37]:
US_DATA = United_States_DATA.iloc[:298]
sample_US_DATA = United_States_DATA.iloc[298:]
US_DATA_1= United_States_data_1.iloc[:298]
sample_US_DATA_1 = United_States_data_1.iloc[298:]

Now to train and make prediction

In [38]:
x = US_DATA[['population','gdp_per_capita ($)','age','sex']]
y = US_DATA['suicides_no']

In [39]:
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 2)

In [40]:
trees = DecisionTreeRegressor(max_depth = 4).fit(x_train,y_train)

In [41]:
print('the training score is Decision Tree is ',(trees.score(x_train, y_train)))
print('the testing score is for Decision Tree is',(trees.score(x_test, y_test)))

the training score is Decision Tree is  0.9478577594194981
the testing score is for Decision Tree is 0.9615891880046905


In [42]:
pred_tree = trees.predict(US_DATA[['population','gdp_per_capita ($)','age','sex']])
pred_sampletree = trees.predict(sample_US_DATA[['population','gdp_per_capita ($)','age','sex']])

Now I want to put the predicted values in the original data then make comparisons with plots

In [43]:
# Main data
US_DATA_1['predicted_suicide_no'] = pred_tree
# Out of sample data
sample_US_DATA_1['predicted_suicide_no'] = pred_sampletree



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### VISUALIZATION OF RESULTS FOR DECISION TREE

In [44]:
Decision_Tree = US_DATA_1['predicted_suicide_no']
Actual_values = US_DATA_1['suicides_no']
frame = pd.DataFrame([Decision_Tree, Actual_values])

In [45]:
df = frame.transpose()
dff = df.sum()

In [46]:
px.bar(dff)

In [49]:
# bar chart of actual and predicted values for the main data
US_DATAactual_1 = US_DATA_1.groupby('year').suicides_no.sum()
US_DATApred_1 = US_DATA_1.groupby('year').predicted_suicide_no.sum()
px.bar(US_DATAactual_1)# actual plot

In [50]:
px.bar(US_DATApred_1)#predicted plot

In [52]:
# bar chart of actual and predicted values for the out of sample data
sample_US_DATAactual_1 = sample_US_DATA_1.groupby('year').suicides_no.sum()
sample_US_DATApred_1 = sample_US_DATA_1.groupby('year').predicted_suicide_no.sum()
px.bar(sample_US_DATAactual_1)# actual plot

In [53]:
px.bar(sample_US_DATApred_1)# predicted plot

### Metrics Evaluation For Decision Tree

In [54]:
mae = mean_absolute_error((US_DATA['suicides_no']), pred_tree)
mse = mean_squared_error((US_DATA['suicides_no']), pred_tree)
r2score_1 = r2_score((US_DATA['suicides_no']), pred_tree)

In [55]:
print ('The mean absolute error using Decision tree is ', mae)
print ('The mean squared error using Decision tree is ', mse)
print ('The r2score using Decision tree is ', r2score_1)

The mean absolute error using Decision tree is  375.44464373850224
The mean squared error using Decision tree is  311797.61904911796
The r2score using Decision tree is  0.952240571494481


## Random Forest

In [56]:
regr = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(x_train,y_train)
pred_Randomtree = regr.predict(US_DATA[['population','gdp_per_capita ($)','age','sex']])
pred_sampleRandomtree = regr.predict(sample_US_DATA[['population','gdp_per_capita ($)','age','sex']])

In [57]:
print('the training score is',(regr.score(x_train, y_train)))
print('the testing score is',(regr.score(x_test, y_test)))

the training score is 0.9990303099787864
the testing score is 0.9955403926391365


In [58]:
# Main data
US_DATA_1['predictedRandom_suicide_no'] = pred_Randomtree
# Out of sample data
sample_US_DATA_1['predictedRandom_suicide_no'] = pred_sampleRandomtree



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### VISUALIZATION OF RESULTS FOR RANDOM FOREST ENSEMBLE

In [59]:
Random_Forest = US_DATA_1['predictedRandom_suicide_no']
frame_2 = pd.DataFrame([Random_Forest, Actual_values])

In [60]:
df_1 = frame_2.transpose()
dff_1 = df_1.sum()

In [62]:
px.bar(dff_1)

In [63]:
# bar chart of actual and predicted values for the main data
US_DATAactual_1 = US_DATA_1.groupby('year').suicides_no.sum()
US_DATApred_2 = US_DATA_1.groupby('year').predictedRandom_suicide_no.sum()
px.bar(US_DATAactual_1)# actual plot

In [65]:
px.bar(US_DATApred_2)

Wow!!! This is pretty accurate

In [66]:
# bar chart of actual and predicted values for the out of sample data
sample_US_DATAactual_1 = sample_US_DATA_1.groupby('year').suicides_no.sum()
sample_US_DATApred_2 = sample_US_DATA_1.groupby('year').predictedRandom_suicide_no.sum()
px.bar(sample_US_DATAactual_1)# actual plot

In [67]:
px.bar(sample_US_DATApred_2)

### Metrics Evaluation For Random Forest Ensemble

In [68]:
maeR = mean_absolute_error((US_DATA['suicides_no']), pred_Randomtree)
mseR = mean_squared_error((US_DATA['suicides_no']), pred_Randomtree)
r2score_1R = r2_score((US_DATA['suicides_no']), pred_Randomtree)

In [69]:
print ('The mean absolute error using Random Forest is ', maeR)
print ('The mean squared error using Random Forest is ', mseR)
print ('The r2score using Random Forest ', r2score_1R)

The mean absolute error using Random Forest is  66.1936577181208
The mean squared error using Random Forest is  13076.809062080534
The r2score using Random Forest  0.9979969669768954


From this analysis, it seems random forest gives better predictions than Decision Tree on its own (well atleast using this dataset).

## Gradient Boosted Tree

In [70]:
GBR = GradientBoostingRegressor(n_estimators = 3, max_depth = 3, learning_rate = 1).fit(x_train,y_train)
pred_GBR = GBR.predict(US_DATA[['population','gdp_per_capita ($)','age','sex']])
pred_sampleGBR = GBR.predict(sample_US_DATA[['population','gdp_per_capita ($)','age','sex']])

In [71]:
print('the training score is',(GBR.score(x_train, y_train)))
print('the testing score is',(GBR.score(x_test, y_test)))

the training score is 0.9831407621764111
the testing score is 0.9541056090203517


In [72]:
US_DATA_1['predictedGBR_suicide_no'] = pred_GBR
# Out of sample data
sample_US_DATA_1['predictedGBR_suicide_no'] = pred_sampleGBR



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### VISUALIZATION OF RESULTS FOR GRADIENT BOOSTING MACHINE

In [73]:
Gradient_Boosting_Tree = US_DATA_1['predictedGBR_suicide_no']
frame_3 = pd.DataFrame([Gradient_Boosting_Tree, Actual_values])

In [74]:
df_2 = frame_3.transpose()
dff_2 = df_2.sum()

In [75]:
px.bar(dff_2)


In [77]:
# actual data plot
US_DATApred_3= US_DATA_1.groupby('year').predictedGBR_suicide_no.sum()
px.bar(US_DATAactual_1)# actual plot

In [78]:
px.bar(US_DATApred_3) # predicted data plot

In [79]:
# bar chart of actual and predicted values for the out of sample data
sample_US_DATApred_3 = sample_US_DATA_1.groupby('year').predictedGBR_suicide_no.sum()
px.bar(sample_US_DATAactual_1)# actual plot

In [80]:
px.bar(sample_US_DATApred_3)

### Metrics Evaluation For Gradient Boosting Tree 

In [81]:
maeG = mean_absolute_error((US_DATA['suicides_no']), pred_GBR)
mseG = mean_squared_error((US_DATA['suicides_no']), pred_GBR)
r2score_1G = r2_score((US_DATA['suicides_no']), pred_GBR)

In [82]:
print ('The mean absolute error using Gradient Boosting Machine is ', maeG)
print ('The mean squared error using Gradient Boosting is ', mseG)
print ('The r2score using Random Forest ', r2score_1G)

The mean absolute error using Gradient Boosting Machine is  253.5752532314886
The mean squared error using Gradient Boosting is  165866.8242754853
The r2score using Random Forest  0.9745934405798827


## Summary of Results

In [83]:
result = (pd.DataFrame([Actual_values,Decision_Tree,Random_Forest,Gradient_Boosting_Tree])).transpose()

In [84]:
result.head(10)

Unnamed: 0,suicides_no,predicted_suicide_no,predictedRandom_suicide_no,predictedGBR_suicide_no
26848,2177.0,3789.605263,2252.59,2285.908689
26849,5302.0,3789.605263,5297.36,5160.15336
26850,5134.0,4791.882353,5110.27,4819.932426
26851,6053.0,6745.333333,6342.98,7797.913507
26852,4267.0,3802.2,4260.86,4432.567781
26853,2105.0,2160.25,2131.26,1970.320806
26854,1568.0,1270.086957,1555.31,1636.207863
26855,466.0,298.866667,515.04,468.306761
26856,1242.0,1270.086957,1198.44,1792.053507
26857,854.0,779.8,846.71,693.12159


In [85]:
Metrics = {'MODEL':['Decision_Tree','Random_Forest', 'Gradient_Boosting'],'Mean_Absolute_Error':[mae,maeR,maeG],'Mean_Squared_Error':[mse,mseR,mseG],'r2score':[r2score_1,r2score_1R,r2score_1G]}

In [86]:
Metric_Evaluation = pd.DataFrame(Metrics)
Metric_Evaluation = Metric_Evaluation.set_index('MODEL')
Metric_Evaluation.sort_values('r2score', inplace=True)
Metric_Evaluation

Unnamed: 0_level_0,Mean_Absolute_Error,Mean_Squared_Error,r2score
MODEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Decision_Tree,375.444644,311797.619049,0.952241
Gradient_Boosting,253.575253,165866.824275,0.974593
Random_Forest,66.193658,13076.809062,0.997997


In [87]:
MAE = Metric_Evaluation.groupby('MODEL').Mean_Absolute_Error.sum()

In [88]:
MSE = Metric_Evaluation.groupby('MODEL').Mean_Squared_Error.sum()

In [89]:
R2SCORE = Metric_Evaluation.groupby('MODEL').r2score.sum()

In [91]:
px.bar(MAE,title = 'Mean Absolute Error Comparison')

In [93]:
px.bar(MSE, title = 'Mean Squared Error Comparison')

In [95]:
px.bar(R2SCORE,title = 'r2score Comparison')

## Analysis of results
I attempted making predictions on the suicide data gotten from kaggle. Emphasis was made on the United States. I chose it for no particular reason. 
I used decision tree, random forest and gradient boosted tree. They all gave very accurate predictions(maybe its because it was in the US as suggested by Haggai). But upon completion of this project and from the summary in the previous section, you can clearly see that random forest gave us a more accurate prediction. Although that is for this project alone, it could be different for another dataset. So although generally speaking, we cannot say which of the models (I think the ensembles are better than the decision tree on its own though) will give the best prediction, but for this project, RANDOM FOREST won the day.

Learning how to use it in functions and classes