## 0. Load libraries/dataset and clean dataset

#### Load

In [1]:
# Load libraries
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from os.path import isfile
from itertools import product

from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error as sk_mae, mean_squared_error as sk_mse
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

sns.set(style='ticks', color_codes=True)

# Load dataset
data_path = './datasets/Suicide_Rates.csv'

suicides = pd.DataFrame()
if not isfile(data_path):
    print("Dataset not found. Please check that the dataset exists and the path is correct.")
    
else:
    suicides = pd.read_csv(data_path)
    
display(suicides)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers
...,...,...,...,...,...,...,...,...,...,...,...,...
27815,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,Uzbekistan2014,0.675,63067077179,2309,Generation X
27816,Uzbekistan,2014,female,75+ years,9,348465,2.58,Uzbekistan2014,0.675,63067077179,2309,Silent
27817,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,Uzbekistan2014,0.675,63067077179,2309,Generation Z
27818,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,Uzbekistan2014,0.675,63067077179,2309,Generation Z


#### Clean

In [2]:
# Adjust dataset to taste    
# GDP per year has a weird clerical error and is recorded as strings instead of ints
suicides = suicides.rename(columns={' gdp_for_year ($) ': 'gdp_for_year ($)'})

suicides['gdp_for_year ($)'] = suicides['gdp_for_year ($)'].str.replace(',', '')
suicides['gdp_for_year ($)'] = suicides['gdp_for_year ($)'].apply(int)

# Lowercase HDI for uniformity
suicides = suicides.rename(columns={'HDI for year': 'hdi for year'})

# Define the target feat
target_feat = 'suicides/100k pop'

In [3]:
# Impute HDI
hdi_mean = suicides['hdi for year'].mean()
suicides['hdi for year'] = suicides['hdi for year'].fillna(hdi_mean)

# Drop extraneous features
extra_feats = ['country', 'suicides_no', 'population', 'country-year',
               'gdp_for_year ($)', 'gdp_per_capita ($)', 'year', 'hdi for year']
suicides.drop(columns=extra_feats, inplace=True)

# Reorganize columns alphabetically
reorg = lambda df: df.reindex(sorted(df.columns), axis=1)
suicides = reorg(suicides)

display(suicides)

Unnamed: 0,age,generation,sex,suicides/100k pop
0,15-24 years,Generation X,male,6.71
1,35-54 years,Silent,male,5.19
2,15-24 years,Generation X,female,4.83
3,75+ years,G.I. Generation,male,4.59
4,25-34 years,Boomers,male,3.28
...,...,...,...,...
27815,35-54 years,Generation X,female,2.96
27816,75+ years,Silent,female,2.58
27817,5-14 years,Generation Z,male,2.17
27818,5-14 years,Generation Z,female,1.67


In [4]:
# Create test df
test_df = pd.DataFrame(data=[['15-24 years', 'male', 'Generation X']], columns=['age', 'sex', 'generation'])
test_df = reorg(test_df)

display(test_df)

Unnamed: 0,age,generation,sex
0,15-24 years,Generation X,male


## 1. Perform multiple linear regression with one-hot encoded variables.

__*Summary of Results*__

The number of linear coefficients is 13 (the number of the independent variables).

The predicted suicide rate for 20-year old, Generation-X males is around 16.76 / 100k pop.

The MAE for this prediction is 14.32.

In [5]:
# Copy the dataframes
suic_oh = suicides.copy()
test_oh = test_df.copy()

# I did this the long way around, but at least I can just directly build the 
# test set from a test df (which we will be doing for question 4)
def onehot_encode(data_df, test_df, extra_classes={}):
    for feat in data_df.loc[:, data_df.columns != target_feat]:
        lb = LabelBinarizer()

        # fit the binarizer
        lb = lb.fit(data_df[feat])
        
        # append any specified classes
        if feat in extra_classes:
                classes = lb.classes_.tolist()
                classes += extra_classes[feat]
                
                lb.classes_ = np.array(classes)

        # get the column names
        cols = lb.classes_
        cols = [feat + ' - ' + col for col in cols]

        # get the transforms
        data_trans = lb.transform(data_df[feat])
        test_trans = lb.transform(test_df[feat])

        # LabelBinarizer doesn't split 'sex' into an array :(
        # I did it with pd.get_dummies and this doesn't seem to effect it too bad (+/- 0.2 on the prediction)
        if feat == 'sex':
            data_df[feat] = data_trans
            test_df[feat] = test_trans
            continue

        # tranform the feat column
        data_trans = pd.DataFrame(data_trans, columns=cols)
        data_df = pd.concat([data_df, data_trans], axis=1)
        data_df.drop(columns=[feat], inplace=True)

        # do the same to the test
        test_trans = pd.DataFrame(test_trans, columns=cols)
        test_df = pd.concat([test_df, test_trans], axis=1)
        test_df.drop(columns=[feat], inplace=True)
        
    return data_df, test_df

# Run the encoder
suic_oh, test_oh = onehot_encode(suic_oh, test_oh)

# Re-sort the dataframes
suic_oh = reorg(suic_oh)
test_oh = reorg(test_oh)

display(suic_oh, test_oh)

Unnamed: 0,age - 15-24 years,age - 25-34 years,age - 35-54 years,age - 5-14 years,age - 55-74 years,age - 75+ years,generation - Boomers,generation - G.I. Generation,generation - Generation X,generation - Generation Z,generation - Millenials,generation - Silent,sex,suicides/100k pop
0,1,0,0,0,0,0,0,0,1,0,0,0,1,6.71
1,0,0,1,0,0,0,0,0,0,0,0,1,1,5.19
2,1,0,0,0,0,0,0,0,1,0,0,0,0,4.83
3,0,0,0,0,0,1,0,1,0,0,0,0,1,4.59
4,0,1,0,0,0,0,1,0,0,0,0,0,1,3.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27815,0,0,1,0,0,0,0,0,1,0,0,0,0,2.96
27816,0,0,0,0,0,1,0,0,0,0,0,1,0,2.58
27817,0,0,0,1,0,0,0,0,0,1,0,0,1,2.17
27818,0,0,0,1,0,0,0,0,0,1,0,0,0,1.67


Unnamed: 0,age - 15-24 years,age - 25-34 years,age - 35-54 years,age - 5-14 years,age - 55-74 years,age - 75+ years,generation - Boomers,generation - G.I. Generation,generation - Generation X,generation - Generation Z,generation - Millenials,generation - Silent,sex
0,1,0,0,0,0,0,0,0,1,0,0,0,1


#### Some useful functionalization

In [6]:
# Split data
def split_XYdata(df, target=target_feat):
    X = df.loc[:, df.columns != target].values
    Y = df.loc[:, df.columns == target].values
    
    return X, Y

# Mean Absolute Error (from module07 notebook)
def mae(_y, _y_pred):
    return (len(_y)**-1) * np.sum(np.abs(_y_pred-_y))

# Cross validation alias
sk_mae = make_scorer(sk_mae)
sk_mse = make_scorer(sk_mse)
scorers = {'sk_mae': sk_mae, 'sk_mse': sk_mse}
cross_val = lambda _model, x, y: cross_validate(_model, x, y, n_jobs=4, scoring=scorers)

model = LinearRegression(n_jobs=4, normalize=True)

### Results

In [7]:
# Split the data, fit with all, predict on test
X, Y  = split_XYdata(suic_oh)

model.fit(X, Y)

y_pred = model.predict(test_oh)

In [8]:
# Reduce code reuse
def display_results(model, X, Y, y_pred):
    # Show the predicted suicide rate
    print(f"Predicted Suicide Rate: {y_pred[0][0]}\n")
    
    # Check MAE
    print(f"MAE: {mae(Y, y_pred)}\n")
    
    # Perform Cross Validation
    this_cv = cross_val(model, X, Y)
    print(f"Cross-validation MAE scores:\n{this_cv['test_sk_mae'].tolist()}\n")
    print(f"Cross-validation MSE scores:\n{this_cv['test_sk_mse'].tolist()}\n")
    
    # Check number of and values of coefficients
    print(f"There are {len(model.coef_[0])} coefficients:")
    print(model.coef_[0])

In [9]:
display_results(model, X, Y, y_pred)

Predicted Suicide Rate: 16.765625

MAE: 14.317554322429906

Cross-validation MAE scores:
[10.452355235666786, 9.921695360941992, 11.73944753661934, 9.80943518040079, 9.321210207359814]

Cross-validation MSE scores:
[234.42060520667616, 210.50397712985952, 360.09975953248477, 250.38441121403017, 227.32309670034996]

There are 13 coefficients:
[9.18213205e+13 9.18213205e+13 9.18213205e+13 9.18213205e+13
 9.18213205e+13 9.18213205e+13 1.98042970e+13 1.98042970e+13
 1.98042970e+13 1.98042970e+13 1.98042970e+13 1.98042970e+13
 1.48495270e+01]


## 2. Perform Multiple Linear Regression with numerical variables.

__*Summary of Results*__

The number of linear coefficients is 3 (the number of the independent variables).

The predicted suicide rate for 20-year old, Generation-X males is around 15.18 / 100k pop.

The MAE for this prediction is 13.55.

In [10]:
# Make a new dataframe
suic_num = suicides.copy()
test_num = test_df.copy()

# Label Encode dataset and test val
def label_encode(data_df, test_df, extra_classes={}):
    for feat in data_df.loc[:, data_df.columns != target_feat]:
        le = LabelEncoder()

        le = le.fit(data_df[feat])
        
        # append any specified classes
        if feat in extra_classes:
            classes = le.classes_.tolist() 
            classes += extra_classes[feat]
                               
            le.classes_ = np.array(classes)

        data_df[feat] = le.transform(data_df[feat])
        test_df[feat] = le.transform(test_df[feat])
        
    return data_df, test_df

suic_num, test_num = label_encode(suic_num, test_num)

    
# Re-sort the dataframes
suic_num = reorg(suic_num)
test_num = reorg(test_num)

display(suic_num, test_num)

Unnamed: 0,age,generation,sex,suicides/100k pop
0,0,2,1,6.71
1,2,5,1,5.19
2,0,2,0,4.83
3,5,1,1,4.59
4,1,0,1,3.28
...,...,...,...,...
27815,2,2,0,2.96
27816,5,5,0,2.58
27817,3,3,1,2.17
27818,3,3,0,1.67


Unnamed: 0,age,generation,sex
0,0,2,1


In [11]:
# Split the data, fit with all, predict on test
X, Y = split_XYdata(suic_num)

model.fit(X, Y)

y_pred = model.predict(test_num)

### Results

In [12]:
display_results(model, X, Y, y_pred)

Predicted Suicide Rate: 15.178500202776634

MAE: 13.554067562029791

Cross-validation MAE scores:
[10.85411703495614, 10.56279668035128, 12.7309240899968, 10.30858523429911, 10.323692734594223]

Cross-validation MSE scores:
[256.5655901237479, 243.5716118277855, 402.42963947153413, 281.90917219317447, 271.32204060998185]

There are 3 coefficients:
[ 2.30253066 -0.98903628 14.84646226]


## 3. Are there any changes between the two models?

There is very little change between the models. Label encoding the dataset does seems to produce closer results based on MAE scores.

## 4. What is the prediction for age 33, male and generation Alpha?

Linear regression will not predict with new categorical data, as demonstrated in cells 13 and 14 below (both predict rates of -20790631255667.82, which likely means this is set by sk-learn).

Cells 16 and 17 show that our Linear Model will predict expectable rates based on just 'age' and 'sex' alone (i.e. ignore 'generation').

### Getting Ready

In [13]:
# define the extra class
extra = {'generation': ['Alpha']}

# Create new test df
test_df2 = pd.DataFrame(data=[['25-34 years', 'male', 'Alpha']], columns=['age', 'sex', 'generation'])
test_df2 = reorg(test_df2)
display(test_df2)

Unnamed: 0,age,generation,sex
0,25-34 years,Alpha,male


### One-hot Encoded Linear Regression

In [14]:
# Create new onehot encoded dataset
suic_oh2 = suicides.copy()
test_oh2 = test_df2.copy()

suic_oh2, test_oh2 = onehot_encode(suic_oh2, test_oh2, extra_classes=extra)

suic_oh2 = reorg(suic_oh2)
test_oh2 = reorg(test_oh2)

display(suic_oh2, test_oh2)

# Split the data, fit with all, predict on test
X, Y  = split_XYdata(suic_oh2)

model.fit(X, Y)

y_pred = model.predict(test_oh2)

display_results(model, X, Y, y_pred)

Unnamed: 0,age - 15-24 years,age - 25-34 years,age - 35-54 years,age - 5-14 years,age - 55-74 years,age - 75+ years,generation - Alpha,generation - Boomers,generation - G.I. Generation,generation - Generation X,generation - Generation Z,generation - Millenials,generation - Silent,sex,suicides/100k pop
0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,6.71
1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,5.19
2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,4.83
3,0,0,0,0,0,1,0,0,1,0,0,0,0,1,4.59
4,0,1,0,0,0,0,0,1,0,0,0,0,0,1,3.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27815,0,0,1,0,0,0,0,0,0,1,0,0,0,0,2.96
27816,0,0,0,0,0,1,0,0,0,0,0,0,1,0,2.58
27817,0,0,0,1,0,0,0,0,0,0,1,0,0,1,2.17
27818,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1.67


Unnamed: 0,age - 15-24 years,age - 25-34 years,age - 35-54 years,age - 5-14 years,age - 55-74 years,age - 75+ years,generation - Alpha,generation - Boomers,generation - G.I. Generation,generation - Generation X,generation - Generation Z,generation - Millenials,generation - Silent,sex
0,0,1,0,0,0,0,1,0,0,0,0,0,0,1


Predicted Suicide Rate: -20790631255667.82

MAE: 20790631255680.637

Cross-validation MAE scores:
[10.451560899308053, 9.931224613587348, 11.738138872663551, 9.826889771297626, 9.334378875359455]

Cross-validation MSE scores:
[234.37607490870505, 210.46454683271855, 359.9607863521266, 250.39149267755394, 227.15159860269642]

There are 14 coefficients:
[ 7.19028423e+12  7.19028423e+12  7.19028423e+12  7.19028423e+12
  7.19028423e+12  7.19028423e+12 -3.88342771e+11  2.04022885e+13
  2.04022885e+13  2.04022885e+13  2.04022885e+13  2.04022885e+13
  2.04022885e+13  1.48492191e+01]


### Numerically Encoded Linear Regression

In [15]:
# Create new numerical dataset
suic_num2 = suicides.copy()
test_num2 = test_df2.copy()

suic_num2, test_num2 = label_encode(suic_num2, test_num2, extra_classes=extra)

display(suic_num2, test_num2)

# Split the data, fit with all, predict on test
X, Y  = split_XYdata(suic_oh2)

model.fit(X, Y)

y_pred = model.predict(test_oh2)

display_results(model, X, Y, y_pred)

Unnamed: 0,age,generation,sex,suicides/100k pop
0,0,2,1,6.71
1,2,5,1,5.19
2,0,2,0,4.83
3,5,1,1,4.59
4,1,0,1,3.28
...,...,...,...,...
27815,2,2,0,2.96
27816,5,5,0,2.58
27817,3,3,1,2.17
27818,3,3,0,1.67


Unnamed: 0,age,generation,sex
0,1,6,1


Predicted Suicide Rate: -20790631255667.82

MAE: 20790631255680.637

Cross-validation MAE scores:
[10.451560899308053, 9.931224613587348, 11.738138872663551, 9.826889771297626, 9.334378875359455]

Cross-validation MSE scores:
[234.37607490870505, 210.46454683271855, 359.9607863521266, 250.39149267755394, 227.15159860269642]

There are 14 coefficients:
[ 7.19028423e+12  7.19028423e+12  7.19028423e+12  7.19028423e+12
  7.19028423e+12  7.19028423e+12 -3.88342771e+11  2.04022885e+13
  2.04022885e+13  2.04022885e+13  2.04022885e+13  2.04022885e+13
  2.04022885e+13  1.48492191e+01]


#### How about without 'generation'

Just for our edification how does the Linear Regressor fair without the generational data?

In [16]:
test_df3 = test_df2.drop(columns=['generation'])

### One-hot Encoded Linear Regression w/o 'generation'

In [17]:
# Create new onehot encoded dataset
suic_oh3 = suicides.copy()
test_oh3 = test_df2.copy()

suic_oh3 = suic_oh3.drop(columns=['generation'])
test_oh3 = test_oh3.drop(columns=['generation'])

suic_oh3, test_oh3 = onehot_encode(suic_oh3, test_oh3)

suic_oh3 = reorg(suic_oh3)
test_oh3 = reorg(test_oh3)

display(suic_oh3, test_oh3)

# Split the data, fit with all, predict on test
X, Y  = split_XYdata(suic_oh3)

model.fit(X, Y)

y_pred = model.predict(test_oh3)

display_results(model, X, Y, y_pred)

Unnamed: 0,age - 15-24 years,age - 25-34 years,age - 35-54 years,age - 5-14 years,age - 55-74 years,age - 75+ years,sex,suicides/100k pop
0,1,0,0,0,0,0,1,6.71
1,0,0,1,0,0,0,1,5.19
2,1,0,0,0,0,0,0,4.83
3,0,0,0,0,0,1,1,4.59
4,0,1,0,0,0,0,1,3.28
...,...,...,...,...,...,...,...,...
27815,0,0,1,0,0,0,0,2.96
27816,0,0,0,0,0,1,0,2.58
27817,0,0,0,1,0,0,1,2.17
27818,0,0,0,1,0,0,0,1.67


Unnamed: 0,age - 15-24 years,age - 25-34 years,age - 35-54 years,age - 5-14 years,age - 55-74 years,age - 75+ years,sex
0,0,1,0,0,0,0,1


Predicted Suicide Rate: 19.671875

MAE: 15.895074361969806

Cross-validation MAE scores:
[10.458139244754223, 9.943023819419482, 11.695219266714593, 9.83682667033609, 9.351421470614666]

Cross-validation MSE scores:
[235.476449592175, 211.49690497616646, 358.8651633713657, 248.56652966843478, 228.7557772960715]

There are 7 coefficients:
[1.15584344e+14 1.15584344e+14 1.15584344e+14 1.15584344e+14
 1.15584344e+14 1.15584344e+14 1.48485249e+01]


### Numerically Encoded Linear Regression w/o 'generation'

In [18]:
# Create new numerical encoded dataset
suic_num3 = suicides.copy()
test_num3 = test_df2.copy()

suic_num3 = suic_num3.drop(columns=['generation'])
test_num3 = test_num3.drop(columns=['generation'])

suic_num3, test_num3 = label_encode(suic_num3, test_num3)

suic_num3 = reorg(suic_num3)
test_num3 = reorg(test_num3)

display(suic_num3, test_num3)

# Split the data, fit with all, predict on test
X, Y  = split_XYdata(suic_num3)

model.fit(X, Y)

y_pred = model.predict(test_num3)

display_results(model, X, Y, y_pred)

Unnamed: 0,age,sex,suicides/100k pop
0,0,1,6.71
1,2,1,5.19
2,0,0,4.83
3,5,1,4.59
4,1,1,3.28
...,...,...,...
27815,2,0,2.96
27816,5,0,2.58
27817,3,1,2.17
27818,3,0,1.67


Unnamed: 0,age,sex
0,1,1


Predicted Suicide Rate: 17.124328943360606

MAE: 14.500070329772942

Cross-validation MAE scores:
[10.84864768493618, 10.53896889549874, 12.767071459570394, 10.350638786112022, 10.401754219295547]

Cross-validation MSE scores:
[260.2327726651328, 246.34643876177395, 406.54625096172447, 281.74873339952575, 275.62303000235175]

There are 2 coefficients:
[ 2.07746293 14.84646226]


## 5. Give one advantage when using regression in terms of input data features.




## 6. Give one advantage when using regular numerical values rather than one-hot encoding for regression.

One advantage of numerical encoding is that inputing a new datapoint to predict off of is easier. In cell 5, I defined a complicated method for one-hot encoding the dataset and test, such that I could avoid having to do the nitty-gritty work of creating a testable array of binary values. The method I defined allows us to create a dataframe with just the values we want to test with in an easy, human-readable format. In cell 10, I did the same thing for label encoding and it was both much easier to do and much shorter code-wise.

## 7. Would you recommend the customer a classifier or regressor for this problem?