In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KernelDensity


# 1. Get the suicide dataset

## 1.1 Load all death data

In [None]:
# Load the raw records from the local CSV file. low_memory=False makes pandas
# parse the file in a single pass instead of chunks, avoiding mixed-dtype columns.
df_raw = pd.read_csv("2020.csv",low_memory=False)

In [None]:
# Collect the column names of the raw frame and print them on one line.
features = list(df_raw.columns)
print(features, end="，")

## 1.2 Fill the NaN 

In [None]:
# Replace every missing value with 0; the following steps treat 0 as "no data".
df = df_raw.fillna(0)

In [None]:
# Summary statistics of the numeric columns after filling missing values.
df.describe()

## 1.3 Drop those columns with no data

In [None]:
# Keep only the columns that contain at least one value different from 0.
has_data = df.ne(0).any(axis=0)
df = df.loc[:, has_data]

In [None]:
# Summary statistics after the all-zero columns were removed.
df.describe()

In [None]:
# Number of records whose manner-of-death code equals 7.
(df['mandeath'] == 7).sum()

## 1.4 Drop the data points that do not specify the manner of death

In [None]:
# Missing `mandeath` values were filled with 0 earlier; keep only the rows
# that carry an actual manner-of-death code.
manner_known = df['mandeath'] != 0
df = df.loc[manner_known]

In [None]:
# Summary statistics of the records that specify a manner of death.
df.describe()

#### Manner of Death: 1. accident 2. suicide 3. homicide 4. pending investigation 5. could not determine 6. self-inflicted 7. natural NaN: not specified

In [None]:
# Turn `mandeath` into a binary target: code 2 (suicide) becomes 1 and every
# other code becomes 0. The result is assigned back explicitly because an
# in-place `replace` on a column selected from a filtered frame raises
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
df = df.assign(
    mandeath=df["mandeath"].replace({2: 1, 1: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0})
)

Now we have the modified death data: mandeath == 1 means suicide, and mandeath == 0 means the death had any other manner.

In [None]:
# `suicide` is a second name for the same DataFrame object as `df`; no copy is made here.
suicide = df

In [None]:
# Display the working frame (pandas truncates the rendering of large frames).
suicide

#扔数据
年龄定义在12岁之后，因为之前的年龄都不一定

In [None]:
# Drop the rows whose age code contains '999' (age not specified).
# The string view is built as a separate Series: wrapping the frame with
# pd.DataFrame(...) does not copy it, so converting `age` to str there also
# rewrote the column of `suicide` (and `df`) as a side effect.
age_codes = suicide['age'].astype(str)
age_NotSpecified = list(age_codes[age_codes.str.contains('999')].index)
suicide = suicide.drop(index=age_NotSpecified)
# Keep the remaining age codes as integers for the decoding steps below.
suicide['age'] = suicide['age'].astype(int)

## 1.5 Decoding the age:
#### 1. If the first digit is 1, the last three digits give the age in years.
#### 2. If the first digit is 2, the last three digits give the age in months.
#### 3. If the first digit is 4, the last three digits give the age in days.
#### 4. If the first digit is 5, the last three digits give the age in hours.
#### 5. If the first digit is 6, the last three digits give the age in minutes.
#### 6. If the code contains the digits 999, the age is not specified.

Here we filter out the index of keys with unspecified age, and get the index list to drop at next step.

In [None]:
# Codes in [1000, 2000) carry the age in years in their last three digits.
# Cast to float first: the month/day/hour/minute conversions that follow
# write fractional years, and storing floats in an integer column relies on
# silent upcasting, which pandas >= 2.1 deprecates.
suicide['age'] = suicide['age'].astype(float)
in_years = (suicide['age'] >= 1000) & (suicide['age'] < 2000)
suicide.loc[in_years, 'age'] = suicide.loc[in_years, 'age'] - 1000

In [None]:
# Codes in [2000, 3000): last three digits are months; convert to years.
age_code = suicide['age']
in_months = age_code.ge(2000) & age_code.lt(3000)
suicide.loc[in_months, 'age'] = age_code.sub(2000).div(12)

In [None]:
# Codes in [4000, 5000): last three digits are days; convert to years.
age_code = suicide['age']
in_days = age_code.ge(4000) & age_code.lt(5000)
suicide.loc[in_days, 'age'] = age_code.sub(4000).div(365)

Although some people died when they were only a couple of hours old, we can ignore those records: someone that young cannot possibly "commit suicide".

In [None]:
# Codes in [5000, 6000): last three digits are hours; convert to years.
# (The bare row filter that used to precede this line was evaluated and
# discarded, since only a cell's last expression is displayed.)
in_hours = (suicide['age'] >= 5000) & (suicide['age'] < 6000)
suicide.loc[in_hours, 'age'] = (suicide['age'] - 5000) / 24 / 365

I am so sorry that you died so young babe!

In [None]:
# Codes in [6000, 7000): last three digits are minutes; convert to years.
# (The bare row filter that used to precede this line was evaluated and
# discarded, since only a cell's last expression is displayed.)
in_minutes = (suicide['age'] >= 6000) & (suicide['age'] < 7000)
suicide.loc[in_minutes, 'age'] = (suicide['age'] - 6000) / 60 / 24 / 365

We throw away rows with an unspecified education level, marital status, weekday, place of death, or `injury` value.

In [None]:
# Keep a row only if none of these fields carries its "unknown" marker
# (9 for the numeric codes, 'U' for the letter codes).
fully_specified = (
    (suicide['educ2003'] != 9)
    & (suicide['marstat'] != 'U')
    & (suicide['weekday'] != 9)
    & (suicide['placdth'] != 9)
    & (suicide['injury'] != 'U')
)
suicide = suicide.loc[fully_specified]

One hot Encoding for Sex

In [None]:
# One indicator column per distinct `sex` value, appended to the frame.
sex_dummies = pd.get_dummies(suicide[["sex"]])
suicide = suicide.join(sex_dummies)

One hot Encoding for Marital Status

In [None]:
# One indicator column per distinct `marstat` value, appended to the frame.
marstat_dummies = pd.get_dummies(suicide[["marstat"]])
suicide = suicide.join(marstat_dummies)

可能要先encode一下race 然后再去one hot encoding；

In [None]:
# Show the current columns, then remove the ones the analysis does not use.
print(suicide.columns)
unused_columns = [
    'educflag', 'econds_1', 'year', 'methdisp', 'autopsy',
    'ageflag', 'ager52', 'ager27', 'ager12', 'ager22',
    'brace', 'raceimp', 'racer3', 'racer5', 'hspanicr', 'race40',
]
suicide = suicide.drop(columns=unused_columns)

1. ranum: Number of Entity-Axis Conditions 这个列出来了有多少个conditions
2. record_n: position number of nth condition
3. record_n: Sequence of condition within part/line

The record axis codes are assigned in terms of the set of codes that best describe the overall medical certification portion of the death certificate.

Refer to https://www.cdc.gov/nchs/data/datalinkage/underlying_and_multiple_cause_of_death_codes.pdf



In [None]:
# Keep a copy of the original `race` codes before `race` is replaced by text labels.
suicide["raceforvis"]= suicide["race"]

In [None]:
# Replace the numeric race codes with readable labels. The result is assigned
# back explicitly: an in-place `replace` on a selected column raises
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
RACE_LABELS = {
    0: "Other Race", 1: "White", 2: "Black", 3: "American Indian",
    4: "Chinese", 5: "Japanese", 6: "Hawaiian", 7: "Filipino",
    8: "Other Asian or Pacific Islander", 18: "Asian Indian",
    28: "Korean", 38: "Samoan", 48: "Vietnamese", 58: "Guamanian",
    68: "Other Asian", 78: "Combined Other Asian",
}
suicide["race"] = suicide["race"].replace(RACE_LABELS)

In [None]:
# One indicator column per distinct `race` label, appended to the frame.
race_dummies = pd.get_dummies(suicide[["race"]])
suicide = suicide.join(race_dummies)

出一个自杀情况下的 ICD-10 编码热力图 横轴（A,B,C,...) 纵轴(10,20,30,...),热力是数量
    这样我们就可以清楚地看到自杀的人的疾病记录大概率集中在哪个部分。
    

Let's define a function to split the ICD code.

In [None]:
def concat_split(x, width=1):
    """Return the text of ``x`` rebuilt from ``width``-sized chunks, minus its last character.

    ``str(x)`` is read in consecutive slices of ``width`` characters until an
    empty slice is met; the slices are concatenated and the final character of
    the concatenation is dropped.

    NOTE(review): for any positive ``width`` this equals ``str(x)[:-1]``, so
    the last character of every code is discarded. Confirm that is intended.
    """
    text = str(x)
    pieces = []
    offset = 0
    chunk = text[offset:offset + width]
    while chunk:
        pieces.append(chunk)
        offset += width
        chunk = text[offset:offset + width]
    return ''.join(pieces)[:-1]

In [None]:
# Shorten every underlying-cause code with concat_split, then split the
# result on the empty pattern with one output column per piece.
# NOTE(review): the split appears to yield one character per column, with an
# empty first column; confirm on the installed pandas version.
shortened_codes = suicide['ucod'].map(concat_split)
ICD_Split = shortened_codes.str.split('', expand=True)

In [None]:
# Name the positional columns ICD_0, ICD_1, ...
ICD_Split.columns = [f'ICD_{position}' for position in ICD_Split.columns]

In [None]:
# ICD_3 / ICD_4 are converted to integers. Short codes leave these pieces as
# '' or missing, so both columns default such entries to '0' first
# (previously only ICD_4 handled the missing case).
for digit_col in ('ICD_3', 'ICD_4'):
    ICD_Split[digit_col] = ICD_Split[digit_col].replace({'': '0', None: '0'})
    ICD_Split[digit_col] = ICD_Split[digit_col].apply(int)
# Attach the four code pieces to the main frame (aligned on the row index).
suicide = suicide.join(ICD_Split[['ICD_1','ICD_2','ICD_3','ICD_4']])

## 2 Descriptive Analysis

Pie Chart for 自杀人数占总死亡人数百分比

### 2.1 Underlying Cause of Suicide Analysis

#### 2.1.1 Heatmap of ICD-10 encoding system for Suicide cases.

In [None]:
# Number of suicides (sum of the 0/1 target) per first letter (rows) and
# second character (columns) of the code.
# aggfunc='sum' replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
heattable = suicide.pivot_table(index='ICD_1', columns='ICD_2', values='mandeath', aggfunc='sum')
plt.figure(figsize=(10, 10))
sns.heatmap(data=heattable, cmap='YlGnBu', annot=True, fmt="g")

In [None]:
# Plain-text view of the same table that the heatmap visualises.
print(heattable)

上图解释：X60-X84 这一块是ICD10系统里的X60-X84 故意自害，make sense
于是之后我们可以再dig into X7 这一块儿看看他们自杀的方式有哪些：具体可以查阅
https://zh.m.wikipedia.org/wiki/ICD-10_第二十章：疾病和死亡的外因
这里有一个假设，或许我们可以分析一下某个有自杀倾向的人，会倾向于用什么样的方式自杀，然后可以让他们远离这种东西。

From the figure above we can see that, for most suicide cases, the underlying cause of death belongs to block X7*.

### 2.1.2 Deeper insight into X7

We can dig into block X7 to see the main methods used in these suicides.

In [None]:
# Restrict to codes starting with "X7", then count suicides per third (rows)
# and fourth (columns) code piece.
# aggfunc='sum' replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
suicide_X = suicide[suicide['ICD_1'] == "X"]
suicide_X7 = suicide_X[suicide_X['ICD_2'] == '7']
heattable_X7 = suicide_X7.pivot_table(index='ICD_3', columns='ICD_4', values='mandeath', aggfunc='sum')
plt.figure(figsize=(10, 10))
sns.heatmap(data=heattable_X7, cmap='YlGnBu', annot=True, fmt="g")

From the Figure above we can see they did not further define 

### 2.1.3 Other underlying cause of death

Let's remove block X7 and see what the other underlying causes of death among the suicide cases are.

In [None]:
# Hide the X7 cell by label so the colour scale reveals the other blocks.
# The earlier mask compared every cell against a hard-coded count (199),
# presumably the X7 count of one particular run, which silently stops
# working (or hides other cells) as soon as the data changes.
hide_x7 = pd.DataFrame(False, index=heattable.index, columns=heattable.columns)
if 'X' in hide_x7.index and '7' in hide_x7.columns:
    hide_x7.loc['X', '7'] = True
plt.figure(figsize=(10, 10))
sns.heatmap(data=heattable, cmap='coolwarm', annot=True, fmt='g', mask=hide_x7)

### Referring to the ICD system, we can see that:
#### 1. In 12 cases the underlying cause of death (ucod) is "Intentional self-harm by jumping from a high place"
#### 2. In 6 cases the ucod is "Intentional self-poisoning by and exposure to nonopioid analgesics, antipyretics and antirheumatics"
#### 3. In 1 case the ucod is "Type 1 Diabetes"
#### 4. In 1 case the ucod is "Malignant neoplasms"

## 2.2 Trends

In [None]:
# Suicide counts per month of death (rows) and weekday (columns).
# aggfunc='sum' replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
heaptable_trends = suicide.pivot_table(index='monthdth', columns='weekday', values='mandeath', aggfunc='sum')

plt.figure(figsize=(10, 10))
sns.heatmap(data=heaptable_trends, cmap='YlGnBu', annot=True, fmt="g")


The figure above shows the number of suicide cases by month and weekday. 重灾区在1月的周六，8月的周日

In [None]:
# Suicide counts per month of death, one line per sex.
# aggfunc='sum' replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
monthly_trends = suicide.pivot_table(index='monthdth', columns='sex', values='mandeath', aggfunc='sum')
monthly_trends.plot()
plt.xticks(np.arange(1, 13, step=1))  # one tick per month, 1..12
plt.show()

In [None]:
# Suicide counts per weekday, one line per sex.
# aggfunc='sum' replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
weekly_trends = suicide.pivot_table(index='weekday', columns='sex', values='mandeath', aggfunc='sum')
weekly_trends.plot()
plt.xticks(np.arange(1, 8, step=1))  # one tick per weekday code, 1..7
plt.show()

Everyone Enjoys Friday

## 2.3 Gender & Age Analysis

Age Category Referred to :https://integrisok.com/resources/on-your-health/2015/october/stages-of-life-health-for-every-age

In [None]:
def get_age_group(x):
    """Map a record's age in years to a life-stage label.

    Parameters
    ----------
    x : mapping or pandas.Series
        A record exposing the age in years under the key ``"age"``.

    Returns
    -------
    str
        "Infant" for [0, 2), "Toddler" for [2, 5), "Child" for [5, 13),
        "Teen" for [13, 20), "Adult" for [20, 40), "Mid-age Adult" for
        [40, 60) and "Senior" otherwise.

    The bins are half-open and contiguous, so fractional ages (ages decoded
    from months or days are not whole numbers) land in the bin that contains
    them instead of falling through to "Senior". Whole-number ages are
    classified exactly as before; values matching no bin (negative or NaN)
    still return "Senior".
    """
    age = x["age"]
    if 0 <= age < 2:
        return "Infant"
    if 2 <= age < 5:
        return "Toddler"
    if 5 <= age < 13:
        return "Child"
    if 13 <= age < 20:
        return "Teen"
    if 20 <= age < 40:
        return "Adult"
    if 40 <= age < 60:
        return "Mid-age Adult"
    return "Senior"
    
# Label every record with its age group. Only the `age` column is handed to
# the row-wise apply (get_age_group reads nothing else), which avoids building
# a full-width row object for every record.
suicide["Age Group"] = suicide[["age"]].apply(get_age_group, axis=1)

In [None]:
# Suicide counts per sex (rows) and age group (columns).
# aggfunc='sum' replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
heaptable_genderage = suicide.pivot_table(index='sex', columns='Age Group', values='mandeath', aggfunc='sum')

plt.figure(figsize=(10, 10))
sns.heatmap(data=heaptable_genderage, cmap='YlGnBu', annot=True, fmt="g")

From the figure above we can see that there are more suicides among males than among females.
Moreover, the Adult and Senior groups account for more suicides than the other age groups.

## 2.3 Gender & Marital Status Analysis

In [None]:
# Suicide counts per sex (rows) and marital status (columns).
# aggfunc='sum' replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
heaptable_gendermarstat = suicide.pivot_table(index='sex', columns='marstat', values='mandeath', aggfunc='sum')

plt.figure(figsize=(10, 10))
sns.heatmap(data=heaptable_gendermarstat, cmap='YlGnBu', annot=True, fmt="g")

The counts above need to be converted into ratios: suicides divided by all deaths in the same gender and marital-status group.

In [None]:
# Turn each suicide count into a ratio by dividing it by the number of
# records (suicide or not) in the same sex / marital-status group.
percentage_gendermarstat = heaptable_gendermarstat.copy()
gender_list = ["F", "M"]
marstat_list = ["W", "M", "S", "D"]

for sex in gender_list:
    is_sex = suicide["sex"] == sex
    for marstat in marstat_list:
        group_size = len(suicide.loc[is_sex & (suicide["marstat"] == marstat)])
        suicide_count = percentage_gendermarstat.loc[sex, marstat]
        percentage_gendermarstat.loc[sex, marstat] = suicide_count / group_size
    

In [None]:
# Heatmap of the suicide ratio per sex and marital status, annotated with four decimals.
plt.figure(figsize=(10, 10))
sns.heatmap(data = percentage_gendermarstat,cmap='YlGnBu',annot = True, fmt=".4f")

# As suggested by Giri, here is a figure representing the ratio of suicide cases by gender and marital status

# Conclusion: Single Male, in the Hell; (and why the HELL are WIDOWED people less likely to commit suicide???)

## 2.4 Gender Race Analysis

In [None]:
# Suicide counts per sex (rows) and race label (columns).
# aggfunc='sum' replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
heaptable_genderrace = suicide.pivot_table(index='sex', columns='race', values='mandeath', aggfunc='sum')

plt.figure(figsize=(10, 10))
sns.heatmap(data=heaptable_genderrace, cmap='YlGnBu', annot=True, fmt="g")

## 2.5 Education Level Analysis

In [None]:
# Suicide counts per education code, one line per sex.
# aggfunc='sum' replaces np.sum, which triggers a FutureWarning in pandas >= 2.1.
heaptable_educlevel = suicide.pivot_table(index='educ2003', columns='sex', values='mandeath', aggfunc='sum')
heaptable_educlevel.plot()
plt.xticks(np.arange(1, 10, step=1))
plt.show()

### Make a kernel density estimate of the distribution of the target values and interpret the distribution. 


In [None]:
# Kernel density estimate of the binary target (1 = suicide, 0 = other manner of death).
sns.kdeplot(data=suicide["mandeath"])
plt.xlabel("target")
plt.show()

From this distribution, we can see that the target variable is skewed: its mean does not coincide with its median.

### Compute the correlation between each feature and the target and find the 3 most correlated features.


In [None]:
# Pairwise correlations of the numeric / boolean columns only. The frame also
# holds string columns (sex, marstat, race, ICD pieces, ...), and since
# pandas 2.0 `.corr()` raises on them instead of silently dropping them.
suicide.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Correlation of every numeric / boolean feature with the target. String
# columns are excluded up front (pandas >= 2.0 raises on them), and the
# correlations are computed once and reused for the filter.
numeric_features = suicide.select_dtypes(include=['number', 'bool'])
correlation = numeric_features.corrwith(suicide['mandeath'])
correlationmost = correlation.loc[correlation > 0.01]
correlationmost.sort_values()

In [None]:
# Features negatively correlated with the target. The filter reuses the
# `correlation` Series instead of recomputing corrwith over the whole frame.
correlationleast = correlation.loc[correlation < -0.01]
correlationleast.sort_values()

In [None]:
# List every column currently present in the working frame.
suicide.columns.tolist()

In [None]:
# Correlation heatmap over all numeric / boolean columns; string columns are
# excluded explicitly because `.corr()` raises on them since pandas 2.0.
plt.figure(figsize=(58, 58))
sns.heatmap(suicide.select_dtypes(include=['number', 'bool']).corr(), annot=True, cmap='coolwarm', fmt='.2f')

In [None]:
# Correlation heatmap for a hand-picked subset of features.
dfhalf = suicide[['mandeath','educ2003','sex','age','placdth',
 'sex_F', 'sex_M', 'marstat_D', 'marstat_M','marstat_S', 'marstat_W',
'race_American Indian', 'race_Black', 'race_Chinese', 'race_Filipino', 'race_Guamanian',
'race_Hawaiian', 'race_Japanese', 'race_Korean', 'race_White','ranum','hispanic','restatus']]
plt.figure(figsize=(20,20))
# The subset contains the string column `sex`; restrict to numeric / boolean
# columns so `.corr()` does not raise on pandas >= 2.0.
sns.heatmap(dfhalf.select_dtypes(include=['number', 'bool']).corr(), annot=True, cmap='coolwarm', fmt='.3f')

In [None]:
# Display the target together with the demographic columns and their indicator columns.
suicide[['mandeath','educ2003','sex','age','placdth',
 'sex_F', 'sex_M', 'marstat_D', 'marstat_M','marstat_S', 'marstat_W',
'race_American Indian', 'race_Black', 'race_Chinese', 'race_Filipino', 'race_Guamanian',
'race_Hawaiian', 'race_Japanese', 'race_Korean', 'race_White']]

In [None]:
# Display the numeric race codes kept for visualisation.
suicide['raceforvis']

In [None]:
# Remap the two-digit race codes onto 8..14 so the plotted codes are
# contiguous. Assigned back explicitly: an in-place `replace` on a selected
# column raises SettingWithCopyWarning and has no effect under pandas
# Copy-on-Write.
# NOTE(review): 18 is mapped onto 8, which is itself an existing code.
# Confirm that merging the two is intended.
suicide["raceforvis"] = suicide["raceforvis"].replace(
    {18: 8, 28: 9, 38: 10, 48: 11, 58: 12, 68: 13, 78: 14}
)
suicide["raceforvis"].describe()

In [None]:
# Store the 0/1 target as integers, then display it.
suicide['mandeath'] =  suicide['mandeath'].astype(int)
suicide['mandeath']

In [None]:
# Histogram of the compacted race codes over all retained records.
plt.hist(suicide['raceforvis'])

In [None]:
# Histogram of the binary target over all retained records.
plt.hist(suicide['mandeath'])

In [None]:
# Summary statistics of the decoded age (in years).
suicide["age"].describe()

In [None]:
# Histogram of age over all retained records.
plt.hist(suicide['age'])

In [None]:
# Histogram of sex over all retained records.
plt.hist(suicide['sex'])

In [None]:
# Histogram of the education code over all retained records.
plt.hist(suicide['educ2003'])

In [None]:
# Histogram of marital status over all retained records.
plt.hist(suicide['marstat'])

In [None]:
# Histogram of the place-of-death code over all retained records.
plt.hist(suicide['placdth'])

In [None]:
# Histogram of `ranum` over all retained records, with integer ticks 0..14.
ranum_values = suicide['ranum']
plt.hist(ranum_values)
plt.xticks(np.arange(0, 15))

In [None]:
# Histogram of `restatus` over all retained records, with ticks at codes 1 to 4.
plt.hist(suicide['restatus'])
plt.xticks([1,2,3,4])

In [None]:
# Histogram of the `hispanic` code over all retained records.
plt.hist(suicide['hispanic'])

In [None]:
# Subset of records whose target is 1, i.e. the suicide cases only.
suicide_people=suicide.loc[suicide['mandeath']==1]

In [None]:
# Histogram of sex among the suicide cases.
plt.hist(suicide_people['sex'])

In [None]:
# Log-scale histogram of the compacted race codes among the suicide cases.
plt.hist(suicide_people['raceforvis'], log=True)
# np.arange(stop) excludes `stop`; add 1 so the largest code also gets a tick.
plt.xticks(np.arange(suicide_people['raceforvis'].max() + 1))

In [None]:
# Share of records with raceforvis == 1 among the suicide cases.
# NOTE(review): despite the "_all" suffix this ratio is computed on
# `suicide_people`; the two ratio names look swapped.
is_code_one = suicide_people['raceforvis'] == 1
white_suicide_ratio_all = is_code_one.sum() / len(suicide_people)
white_suicide_ratio_all

In [None]:
# Share of records with raceforvis == 1 among all retained records.
# NOTE(review): despite the "_suicide" suffix this ratio is computed on the
# full `suicide` frame; the two ratio names look swapped.
is_code_one = suicide['raceforvis'] == 1
white_suicide_ratio_suicide = is_code_one.sum() / len(suicide)
white_suicide_ratio_suicide

In [None]:
# Histogram of the education code among the suicide cases.
plt.hist(suicide_people['educ2003'])
# np.arange(stop) excludes `stop`; add 1 so the largest code also gets a tick.
plt.xticks(np.arange(suicide_people['educ2003'].max() + 1))

In [None]:
# Histogram of age among the suicide cases.
plt.hist(suicide_people['age'])

In [None]:
# Histogram of marital status among the suicide cases.
plt.hist(suicide_people['marstat'])

In [None]:
# Histogram of the place-of-death code among the suicide cases.
plt.hist(suicide_people['placdth'])

In [None]:
# Histogram of `ranum` among the suicide cases.
plt.hist(suicide_people['ranum'])

In [None]:
# Histogram of `restatus` among the suicide cases.
plt.hist(suicide_people['restatus'])

In [None]:
# Histogram of the `hispanic` code among the suicide cases.
plt.hist(suicide_people['hispanic'])