In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's 'darkgrid' style for the seaborn/matplotlib figures below.
sns.set_style('darkgrid')

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings raised by the libraries used in this
# notebook. Consider narrowing the filter to specific categories.
warnings.simplefilter(action='ignore')

import plotly.io as pio
# Make the dark template the default for plotly figures.
pio.templates.default = "plotly_dark"

In [None]:
# Location of the raw CSV inside the Kaggle input directory.
MASTER_CSV_PATH = '/kaggle/input/suicide-rates-overview-1985-to-2016/master.csv'

# Read the raw table and preview its first rows.
data = pd.read_csv(MASTER_CSV_PATH)
data.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
data.shape

* The dataset has 12 columns and a total of 27,820 samples.
* We can see there are NaN values in the HDI column.
* country-year is just the combination of country and year.
* Since we already have separate country and year columns, country-year is not needed for the analysis, although it could still be used as a groupby key.

In [None]:
# Print each column's dtype and non-null count.
data.info()

## Numeric Features
> Integer
* year
* suicides_no
* population
* gdp_per_capita 

> Float  
* suicides/100k pop
* HDI_for_year
 
## Categorical Features
* country
* sex
* age
* country-year
* gdp_for_year (read in as a text/object column, not as a number)
* generation

## Univariate Analysis

### Numerical Features

In [None]:
# Names of the columns stored as 64-bit integers or floats.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
num = numeric_frame.columns


In [None]:
def num_plot(df, columns, shape):
    """Draw one density histogram (with a KDE overlay) per column on a grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    columns : iterable of str
        Names of the numeric columns to plot, one subplot each.
    shape : tuple of int
        Grid layout as ``(n_rows, n_cols)``.

    Raises
    ------
    ValueError
        If there are more columns than cells in the grid.
    """
    n_rows, n_cols = shape[0], shape[1]
    columns = list(columns)
    if len(columns) > n_rows * n_cols:
        # Fail before any figure is created instead of midway through the loop.
        raise ValueError(
            f"{len(columns)} columns do not fit in a {n_rows}x{n_cols} grid"
        )

    fig = plt.figure(figsize=(n_cols * 10, n_rows * 7))
    for position, col in enumerate(columns, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        # sns.distplot is deprecated; this is the replacement recommended by
        # seaborn for the same "density histogram + KDE" picture. Missing
        # values are dropped explicitly so sparse columns still plot.
        sns.histplot(
            df[col].dropna(),
            kde=True,
            stat='density',
            kde_kws=dict(cut=3),
            ax=ax,
        )
        ax.set_xlabel(col, fontweight='bold', size=16)
        ax.set_ylabel('Density', fontweight='bold', size=16)
        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.tick_params(axis='both', which='minor', labelsize=14)


In [None]:
# Plot every numeric column on a grid of 2 rows by 3 columns.
num_plot(data, num, shape=(2, 3))

In [None]:
# Summary statistics of the numeric columns, rendered with three decimals
# instead of scientific notation.
(
    data[num]
    .describe()
    .apply(lambda stats_column: stats_column.map('{0:.3f}'.format))
)

* The range of year is 1985-2016. The density plot shows that the data is not uniformly distributed across years.
* The minimum suicides_no is 0 and the maximum is 22338, with a mean of 242, so a large share of the values lie close to 0.
* 75% of the suicides_no values are less than 131; the plot shows the same.
* Population also spans a large range (278 to 4 x 10^7), with more values near the lower end (278).
* suicides/100k pop ranges from 0 to 224, with a mean of 12 and 75% of the values below 16.
* HDI has a lot of missing values, around 66%.
* gdp_per_capita ranges from 251 to 1.2 x 10^5 with a mean of 2.4 x 10^4. Its distribution is also right-skewed.


### Further Analysis (will do afterwards)

* population vs suicides_no: do countries with larger populations have a higher suicides_no?
* population vs suicides/100k pop: do countries with larger populations also have a higher rate?
* gdp_per_capita vs suicides_no: does a lower gdp_per_capita go with a higher suicides_no?
* gdp_per_capita vs population: do countries with larger populations have a higher gdp_per_capita?
* gdp_per_capita vs suicides_no: have countries with a high gdp_per_capita improved over the years in terms of suicides_no?

### Categorical Features Analysis

In [None]:
# Names of the text (object-dtype) columns.
object_frame = data.select_dtypes(include=['object'])
cat_column = object_frame.columns
cat_column

In [None]:
# Drop the two columns that are not used in the rest of the analysis.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError because
# the columns are already gone.
data = data.drop(columns=['country-year', ' gdp_for_year ($) '], errors='ignore')

# Refresh the list of text columns now that two of them were removed.
cat_column = data.select_dtypes(['object']).columns
cat_column

In [None]:
# Number of rows per country, largest first, as a two-column frame
# ('country', 'count').
rows_per_country = data['country'].value_counts().sort_values(ascending=False)
country_wise = rows_per_country.rename_axis('country').reset_index(name='count')

In [None]:
# Bar chart of the 25 countries with the most rows in the dataset.
fig, ax = plt.subplots(figsize=(20, 6))
most_rows = country_wise.head(25)
most_rows.plot.bar(x='country', y='count', ax=ax)
plt.xticks(rotation=60)
ax.set_xlabel('country', fontweight='bold', size=14)
ax.set_ylabel('count', fontweight='bold', size=14)

In [None]:
# Bar chart of the 25 countries with the fewest rows in the dataset.
fig, ax = plt.subplots(figsize=(20, 6))
fewest_rows = country_wise.tail(25)
fewest_rows.plot.bar(x='country', y='count', ax=ax)
plt.xticks(rotation=60, ha='right')
ax.set_xlabel('country', fontweight='bold', size=14)
ax.set_ylabel('count', fontweight='bold', size=14)

In [None]:
# One grid row per categorical column: a count plot on the left and a pie
# chart of the same value counts (with a legend) on the right.
fig, ax = plt.subplots(3, 2, figsize=(16, 24))
for (bar_ax, pie_ax), col in zip(ax, ['sex', 'age', 'generation']):
    sns.countplot(data=data, x=col, ax=bar_ax)
    bar_ax.set_xlabel(col, fontweight='bold', size=14)
    bar_ax.set_ylabel('count', fontweight='bold', size=14)

    counts = data[col].value_counts()
    # pie() returns (wedges, texts, autotexts); only the wedges feed the legend.
    wedges = pie_ax.pie(
        counts,
        autopct='%.1f%%',
        textprops=dict(color="w", size=14),
    )[0]
    pie_ax.legend(
        wedges,
        counts.index,
        title="Legend",
        bbox_to_anchor=(.75, .1, .5, 1),
    )


* Data is not present for every country in every year.
* Some countries have around 380 rows, while others have only about 20.
* The data has an equal number of samples for men and women.
* The data is also distributed equally across the age groups.
* From the two points above we can assume that, whenever data exists for a country-year combination, it is broken down by sex and age group.
* After some searching, I found that generations are defined by year of birth as follows:
> * G.I. Generation: 1901 - 1927
> * Silent: 1928 - 1945
> * Boomers: 1946 - 1964
> * Generation X: 1965 - 1980
> * Millennials: 1981 - 1996
> * Generation Z: 1997 - 2013
* More data is available for Generation X and Silent, while Generation Z has the least.


### Further Analysis (will do afterwards)
* country vs. suicides_no: for some countries very little data is present. Is that because no deaths were recorded during those periods?
* sex vs. suicides_no: is suicides_no also the same for men and women?
* age vs. suicides_no: why are the samples divided equally across the age groups?
* generation vs. suicides_no: is suicides_no also higher for Generation X and Silent?

## Conclusions:

* The range of year is 1985-2016. The density plot shows that the data is not uniformly distributed across years.
* The minimum suicides_no is 0 and the maximum is 22338, with a mean of 242, so a large share of the values lie close to 0.
* 75% of the suicides_no values are less than 131; the plot shows the same.
* Population also spans a large range (278 to 4 x 10^7), with more values near the lower end (278).
* suicides/100k pop ranges from 0 to 224, with a mean of 12 and 75% of the values below 16.
* HDI has a lot of missing values, around 66%.
* gdp_per_capita ranges from 251 to 1.2 x 10^5 with a mean of 2.4 x 10^4. Its distribution is also right-skewed.
* Data is not present for every country in every year.
* Some countries have around 380 rows, while others have only about 20.
* The data has an equal number of samples for men and women.
* The data is also distributed equally across the age groups.
* From the two points above we can assume that, whenever data exists for a country-year combination, it is broken down by sex and age group.
* More data is available for Generation X and Silent, while Generation Z has the least.


### Further Analysis Required:
* population vs suicides_no: do countries with larger populations have a higher suicides_no?
* population vs suicides/100k pop: do countries with larger populations also have a higher rate?
* gdp_per_capita vs suicides_no: does a lower gdp_per_capita go with a higher suicides_no?
* gdp_per_capita vs population: do countries with larger populations have a higher gdp_per_capita?
* gdp_per_capita vs suicides_no: have countries with a high gdp_per_capita improved over the years in terms of suicides_no?
* country vs. suicides_no: for some countries very little data is present. Is that because no deaths were recorded during those periods?
* sex vs. suicides_no: is suicides_no also the same for men and women?
* age vs. suicides_no: why are the samples divided equally across the age groups?
* generation vs. suicides_no: is suicides_no also higher for Generation X and Silent?

## Analysis

In [None]:
# Pairwise correlation between the numeric columns. They are selected
# explicitly because the frame still holds text columns (country, sex, age,
# generation): on pandas >= 2.0 a bare data.corr() no longer drops those
# silently and raises instead.
data.select_dtypes(include='number').corr()

# 1) population vs suicides_no

In [None]:
import plotly.express as px

# Per-country mean of population and suicides_no.
# Several columns must be selected from a groupby with a list (double
# brackets); the tuple form gb['a', 'b'] is deprecated since pandas 1.0 and
# raises on pandas >= 2.0.
df = data.groupby('country')[['population', 'suicides_no']].mean()

# One point per country, coloured by country name.
px.scatter(df, x='population', y='suicides_no', color=df.index)

* There is definitely a positive correlation between 'population' and 'suicides_no'. We can see it in the scatter plot as well as in the correlation matrix.
* Despite this positive correlation, the Russian Federation (Russia), which has about half the population of the United States, has about double its suicides_no.
* Some of the reasons are:
 - Lack of economic prospects, as well as cultural and spiritual emptiness, are the two leading causes of suicide, according to Temyr Hagurov, a leading researcher at the Russian Academy of Sciences’ Institute of Sociology.

 - "Demand for meaning is highly characteristic of Russian culture; a Russian person cannot live without meaning." - Hagurov

# 2) population vs suicides/100k pop

In [None]:
# Per-country mean of population and suicides/100k pop.
# A list (double brackets) is needed to select several groupby columns; the
# tuple form is deprecated since pandas 1.0 and raises on pandas >= 2.0.
df = data.groupby(['country'])[['population', 'suicides/100k pop']].mean()

# One point per country, coloured by country name.
px.scatter(df, x='population', y='suicides/100k pop', color=df.index)

 * No specific pattern appears between these two variables. Even the correlation value between these two features is only 0.008.
 * Lithuania has the highest 'suicides/100k pop' despite having a small population.
  * Social and financial problems in Lithuania are thought to be important factors behind the high rate of suicides. [more reasons](https://en.wikipedia.org/wiki/Suicide_in_Lithuania)
 * Sri Lanka also has a high 'suicides/100k pop'.
  * Poverty is the major factor for this.


# 3) gdp_per_capita vs suicides/100k pop

In [None]:
# Per-country mean of gdp_per_capita ($) and suicides/100k pop.
# A list (double brackets) is needed to select several groupby columns; the
# tuple form is deprecated since pandas 1.0 and raises on pandas >= 2.0.
df = data.groupby(['country'])[['gdp_per_capita ($)', 'suicides/100k pop']].mean()

# One point per country, coloured by country name.
px.scatter(df, x='gdp_per_capita ($)', y='suicides/100k pop', color=df.index)

 * There is no clear relationship between 'suicides/100k pop' and GDP per capita.
 * Countries like the United Arab Emirates and Qatar have few suicides and a high GDP per capita.

# 4) gdp_per_capita vs population

In [None]:
# Per-country mean of gdp_per_capita ($) and population.
# A list (double brackets) is needed to select several groupby columns; the
# tuple form is deprecated since pandas 1.0 and raises on pandas >= 2.0.
df = data.groupby(['country'])[['gdp_per_capita ($)', 'population']].mean()

# One point per country, coloured by country name.
px.scatter(df, x='population', y='gdp_per_capita ($)', color=df.index)

* There is no clear relationship between GDP per capita and population.