## 0. Load dataset/libraries and clean

In [1]:
# Load libraries and dataset
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from os.path import isfile
from itertools import product

from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

sns.set(style='ticks', color_codes=True)

data_path = './datasets/Suicide_Rates.csv'

suicides = pd.DataFrame()
if not isfile(data_path):
    print("Dataset not found. Please check that the dataset exists and the path is correct.")
    
else:
    suicides = pd.read_csv(data_path)
    
    
# GDP per year is recorded as strings and not ints so fix that real quick
suicides = suicides.rename(columns={' gdp_for_year ($) ': 'gdp_for_year ($)'})

suicides['gdp_for_year ($)'] = suicides['gdp_for_year ($)'].str.replace(',', '')
suicides['gdp_for_year ($)'] = suicides['gdp_for_year ($)'].apply(int)

# I want things alphabetical later (for my own sanity), so let's lowercase HDI
suicides = suicides.rename(columns={'HDI for year': 'hdi for year'})

suicides_org = suicides.copy()

display(suicides)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,hdi for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers
...,...,...,...,...,...,...,...,...,...,...,...,...
27815,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,Uzbekistan2014,0.675,63067077179,2309,Generation X
27816,Uzbekistan,2014,female,75+ years,9,348465,2.58,Uzbekistan2014,0.675,63067077179,2309,Silent
27817,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,Uzbekistan2014,0.675,63067077179,2309,Generation Z
27818,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,Uzbekistan2014,0.675,63067077179,2309,Generation Z


In [2]:
# Impute selected feat to mean
def mean_impute(df, feat):
    if not type(feat) == list:
        this_mean = df[feat].mean()
        df[feat] = df[feat].fillna(this_mean)
        
    else:
        for f in feat:
            df = mean_impute(df, f)
    
    return df

# One-hot encode the feat
def encode_onehot(df, feat):
    if not type(feat) == list:
        _df = pd.get_dummies(df[feat], prefix='', prefix_sep='').max(level=0, axis=1).add_prefix(feat + ' - ')
        df = pd.concat([df, _df], axis=1)
        df = df.drop([feat], axis=1)
        
    else:
        for f in feat:
            df = encode_onehot(df, f)

    return df

# Drop extra columns
def drop_extra(df, feat):
    if not type(feat) == list:
        feat = [feat]
        
    df  = df.drop(columns=feat)
    return df

# Reorganize columns alphabetically
def reorganize(df):
    return df.reindex(sorted(df.columns), axis=1)

# Apply data cleaning measures
mean_imputable = ['hdi for year']
suicides = mean_impute(suicides, mean_imputable)

one_hotable = ['sex', 'age', 'generation']
suicides = encode_onehot(suicides, one_hotable)

extra_feats = ['country', 'suicides_no', 'population', 'country-year',
               'gdp_for_year ($)', 'gdp_per_capita ($)', 'hdi for year', 'year']
suicides = drop_extra(suicides, extra_feats)

suicides = reorganize(suicides)

In [3]:
display(suicides.head())

Unnamed: 0,age - 15-24 years,age - 25-34 years,age - 35-54 years,age - 5-14 years,age - 55-74 years,age - 75+ years,generation - Boomers,generation - G.I. Generation,generation - Generation X,generation - Generation Z,generation - Millenials,generation - Silent,sex - female,sex - male,suicides/100k pop
0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,6.71
1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,5.19
2,1,0,0,0,0,0,0,0,1,0,0,0,1,0,4.83
3,0,0,0,0,0,1,0,1,0,0,0,0,0,1,4.59
4,0,1,0,0,0,0,1,0,0,0,0,0,0,1,3.28


In [4]:
# Split data
def split_XYdata(df, target='suicides/100k pop'):
    X = suicides.loc[:, df.columns != target].values
    Y = suicides.loc[:, df.columns == target].values
    
    return X, Y

# Mean Squared Error
def mse(y, y_pred):
    return (1/len(y)) * np.sum((y_pred - y)**2)

model = LinearRegression(n_jobs=4)

test = np.array([[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]])


## 1. Perform multiple linear regression with one-hot encoded variables.

In [5]:
# Split the data, fit with all, predict on test
X, Y  = split_XYdata(suicides)
model = LinearRegression()
model.fit(X, Y)

y_pred = model.predict(test)

In [6]:
# Show the predicted suicide rate for male 20 year old Generation X'ers
print(f"Suicide rate for 20 year old, male, Generation X: {y_pred[0][0]}")

Suicide rate for 20 year old, male, Generation X: 16.96875


Seems a tad high.

In [7]:
mse(Y, y_pred)

376.77049971400976

![Joey](https://media.giphy.com/media/11ykUODgXjAXZu/giphy.gif "Joey")

. . . or *really* high