In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies 
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

Read in CSV File

In [3]:
# Source: https://www.kaggle.com/datasets/russellyates88/suicide-rates-overview-1985-to-2016?resource=download
columns = [
    "country","year","sex","age","suicides_no","population","suicides/100k_pop","country_year",
    "HDI_four_year","gdp_per_capita ($)","generation"
]

target = ["year"]

In [4]:
# Load the data
file_path = ('master.csv')
df = pd.read_csv(file_path)

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df.reset_index(inplace=True, drop=True)

df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1995,male,25-34 years,13,232900,5.58,Albania1995,0.619,2424499009,835,Generation X
1,Albania,1995,male,55-74 years,9,178000,5.06,Albania1995,0.619,2424499009,835,Silent
2,Albania,1995,female,75+ years,2,40800,4.90,Albania1995,0.619,2424499009,835,G.I. Generation
3,Albania,1995,female,15-24 years,13,283500,4.59,Albania1995,0.619,2424499009,835,Generation X
4,Albania,1995,male,15-24 years,11,241200,4.56,Albania1995,0.619,2424499009,835,Generation X
...,...,...,...,...,...,...,...,...,...,...,...,...
8359,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,Uzbekistan2014,0.675,63067077179,2309,Generation X
8360,Uzbekistan,2014,female,75+ years,9,348465,2.58,Uzbekistan2014,0.675,63067077179,2309,Silent
8361,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,Uzbekistan2014,0.675,63067077179,2309,Generation Z
8362,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,Uzbekistan2014,0.675,63067077179,2309,Generation Z


Week 2 Changes: Filtering DataFrame to get Specific Countries

In [5]:
# Week 2: Filter DataFrame to show first world countries 
first_world_countries = ['United States', 'United Kingdom']
df[df["country"].isin(first_world_countries)]

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
7932,United Kingdom,1985,male,75+ years,264,1202838,21.95,United Kingdom1985,0.753,489285164271,9231,G.I. Generation
7933,United Kingdom,1985,male,55-74 years,915,5170113,17.70,United Kingdom1985,0.753,489285164271,9231,G.I. Generation
7934,United Kingdom,1985,male,35-54 years,1208,6899879,17.51,United Kingdom1985,0.753,489285164271,9231,Silent
7935,United Kingdom,1985,male,25-34 years,620,3969689,15.62,United Kingdom1985,0.753,489285164271,9231,Boomers
7936,United Kingdom,1985,female,55-74 years,678,6002096,11.30,United Kingdom1985,0.753,489285164271,9231,G.I. Generation
...,...,...,...,...,...,...,...,...,...,...,...,...
8167,United States,2014,female,25-34 years,1347,21250636,6.34,United States2014,0.915,17427609000000,58531,Millenials
8168,United States,2014,female,15-24 years,990,21691057,4.56,United States2014,0.915,17427609000000,58531,Millenials
8169,United States,2014,female,75+ years,477,11616299,4.11,United States2014,0.915,17427609000000,58531,Silent
8170,United States,2014,male,5-14 years,277,21264881,1.30,United States2014,0.915,17427609000000,58531,Generation Z


Split the Data into Training and Testing

In [6]:
df.dtypes

country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
HDI for year          float64
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
dtype: object

In [7]:
print(df.columns.tolist())

['country', 'year', 'sex', 'age', 'suicides_no', 'population', 'suicides/100k pop', 'country-year', 'HDI for year', ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation']


In [8]:
# Create our features
# Week 2: Create X and y from the filtered df
X = df[["suicides_no"]]

# Create our target
y = df["age"]

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(6273, 1)

Attempt Using Easy Ensemble AdaBoost Classifier

In [10]:
# Resample the training data with the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
suicides_model = EasyEnsembleClassifier(n_estimators = 100, random_state=1)
suicides_model

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [11]:
# Fit the model
suicides_model = suicides_model.fit(X_train, y_train)
y_pred = suicides_model.predict(X_test)

In [12]:
# Check balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.27052388323507776

In [13]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[138,   0,  66,  80,  65,   0],
       [144,   0,  72,  75,  57,   0],
       [113,   0,  63,  63, 109,   0],
       [ 80,   0,   6, 262,   1,   0],
       [101,   0,  58,  86, 103,   0],
       [117,   0,  50, 132,  50,   0]])

In [14]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

15-24 years       0.20      0.40      0.68      0.26      0.52      0.26       349
25-34 years       0.00      0.00      1.00      0.00      0.00      0.00       348
35-54 years       0.20      0.18      0.86      0.19      0.39      0.14       348
 5-14 years       0.38      0.75      0.75      0.50      0.75      0.56       349
55-74 years       0.27      0.30      0.84      0.28      0.50      0.23       348
  75+ years       0.00      0.00      1.00      0.00      0.00      0.00       349

avg / total       0.17      0.27      0.85      0.21      0.36      0.20      2091



Attempt using BalancedRandomForestClassifier

In [15]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
suicides_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)
suicides_model

BalancedRandomForestClassifier(random_state=1)

In [16]:
# Fit the model
suicides_model = suicides_model.fit(X_train, y_train)

In [17]:
# Calculated the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.2767828716969118

In [18]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 64,  82,  36,  92,  42,  33],
       [ 60,  79,  43,  84,  51,  31],
       [ 47,  86,  65,  70,  40,  40],
       [ 21,  31,   7, 270,   9,  11],
       [ 54,  54,  53,  95,  59,  33],
       [ 43,  59,  42, 138,  25,  42]])

In [19]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

15-24 years       0.22      0.18      0.87      0.20      0.40      0.15       349
25-34 years       0.20      0.23      0.82      0.21      0.43      0.18       348
35-54 years       0.26      0.19      0.90      0.22      0.41      0.16       348
 5-14 years       0.36      0.77      0.73      0.49      0.75      0.56       349
55-74 years       0.26      0.17      0.90      0.21      0.39      0.14       348
  75+ years       0.22      0.12      0.92      0.16      0.33      0.10       349

avg / total       0.26      0.28      0.86      0.25      0.45      0.21      2091



Attempting Combination of Over and UnderSampling the Data using SMOTEEN

In [20]:
# Using SMOTEEN
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
smote_eenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_eenn.fit_resample(X, y)

In [21]:
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [22]:
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.19319566577742645

In [23]:
# Confusion matrix
confusion_matrix(y_test, y_pred)

array([[  0,   0, 158,   0,   0, 191],
       [  0,   0, 170,   0,   0, 178],
       [  0,   0, 192,   0,   0, 156],
       [  0,   0,  16,   0,   0, 333],
       [  0,   0, 182,   0,   0, 166],
       [  0,   0, 137,   0,   0, 212]])

In [24]:
# Classification Report 
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

15-24 years       0.00      0.00      1.00      0.00      0.00      0.00       349
25-34 years       0.00      0.00      1.00      0.00      0.00      0.00       348
35-54 years       0.22      0.55      0.62      0.32      0.58      0.34       348
 5-14 years       0.00      0.00      1.00      0.00      0.00      0.00       349
55-74 years       0.00      0.00      1.00      0.00      0.00      0.00       348
  75+ years       0.17      0.61      0.41      0.27      0.50      0.26       349

avg / total       0.07      0.19      0.84      0.10      0.18      0.10      2091



Attempt OverSampling with RandomOverSampler

In [25]:
# Using RandomOverSampler
from imblearn.over_sampling import RandomOverSampler
ros_suicides = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros_suicides.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'5-14 years': 1046,
         '35-54 years': 1046,
         '25-34 years': 1046,
         '15-24 years': 1046,
         '75+ years': 1046,
         '55-74 years': 1046})

In [26]:
# Train the Logistic Regression model using the resampled data
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [27]:
# Calculated the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.2637873398544281

In [28]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 12,  15,  49, 153,  12, 108],
       [ 15,  17,  41, 135,  13, 127],
       [ 20,  23,  83, 119,  19,  84],
       [  0,   1,   0, 314,   1,  33],
       [ 12,  29,  60, 133,  29,  85],
       [  9,  17,  31, 183,  12,  97]])

In [29]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

15-24 years       0.18      0.03      0.97      0.06      0.18      0.03       349
25-34 years       0.17      0.05      0.95      0.08      0.22      0.04       348
35-54 years       0.31      0.24      0.90      0.27      0.46      0.20       348
 5-14 years       0.30      0.90      0.58      0.45      0.73      0.54       349
55-74 years       0.34      0.08      0.97      0.13      0.28      0.07       348
  75+ years       0.18      0.28      0.75      0.22      0.46      0.20       349

avg / total       0.25      0.26      0.85      0.20      0.39      0.18      2091

