In [1]:
import warnings
# Install a blanket 'ignore' filter: no category, message or module is given,
# so it matches every Warning subclass raised from here on in this process.
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies 
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

Read in CSV File

In [3]:
# Source: https://www.kaggle.com/datasets/russellyates88/suicide-rates-overview-1985-to-2016?resource=download
# Column labels for the suicide-rates data, listed one per line.
# NOTE(review): the frame displayed after loading shows headers such as
# "suicides/100k pop", "country-year" and "HDI for year", plus a
# "gdp_for_year ($)" column that has no entry here -- confirm whether these
# labels are meant to match the CSV header. No visible cell reads this list.
columns = [
    "country",
    "year",
    "sex",
    "age",
    "suicides_no",
    "population",
    "suicides/100k_pop",
    "country_year",
    "HDI_four_year",
    "gdp_per_capita ($)",
    "generation",
]

# Label column name, kept as a single-element list.
target = ["year"]

In [4]:
# Read the CSV and clean it in a single chain; each step returns a new frame.
file_path = 'master.csv'
df = (
    pd.read_csv(file_path)
    # Discard columns in which every value is null
    .dropna(axis='columns', how='all')
    # Discard every row that still has a null in any remaining column
    .dropna()
    # Renumber the surviving rows from 0 and throw away the old index
    .reset_index(drop=True)
)

df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1995,male,25-34 years,13,232900,5.58,Albania1995,0.619,2424499009,835,Generation X
1,Albania,1995,male,55-74 years,9,178000,5.06,Albania1995,0.619,2424499009,835,Silent
2,Albania,1995,female,75+ years,2,40800,4.90,Albania1995,0.619,2424499009,835,G.I. Generation
3,Albania,1995,female,15-24 years,13,283500,4.59,Albania1995,0.619,2424499009,835,Generation X
4,Albania,1995,male,15-24 years,11,241200,4.56,Albania1995,0.619,2424499009,835,Generation X
...,...,...,...,...,...,...,...,...,...,...,...,...
8359,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,Uzbekistan2014,0.675,63067077179,2309,Generation X
8360,Uzbekistan,2014,female,75+ years,9,348465,2.58,Uzbekistan2014,0.675,63067077179,2309,Silent
8361,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,Uzbekistan2014,0.675,63067077179,2309,Generation Z
8362,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,Uzbekistan2014,0.675,63067077179,2309,Generation Z


Split the Data into Training and Testing

In [5]:
# Feature matrix: a one-column DataFrame holding "suicides_no"
X = df.loc[:, ["suicides_no"]]

# Target vector: the "year" column as a Series
y = df.loc[:, "year"]

In [6]:
from sklearn.model_selection import train_test_split

# Seeded split, stratified on the target so each year keeps its share in both
# partitions; the split proportion is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)
X_train.shape

(6273, 1)

Using Easy Ensemble AdaBoost Classifier

In [7]:
# Construct the EasyEnsembleClassifier (100 estimators, fixed seed).
# Nothing is fitted or resampled in this cell; it only builds the estimator
# and displays its repr.
from imblearn.ensemble import EasyEnsembleClassifier

suicides_model = EasyEnsembleClassifier(
    random_state=1,
    n_estimators=100,
)
suicides_model

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [8]:
# Train on the training split; the value returned by fit is rebound to the
# same name.
suicides_model = suicides_model.fit(X=X_train, y=y_train)

# Predict a label for every row of the held-out split
y_pred = suicides_model.predict(X=X_test)

In [9]:
# Balanced accuracy of the held-out predictions against the true labels
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.08685727201516674

In [10]:
# Confusion matrix for the held-out split (first argument: true labels,
# second: predicted labels)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[  1,  15,  41,   7,  18,   7,   0,  22,   0,   0],
       [  5,  17,  68,  10,  20,  12,   0,  13,   5,   0],
       [  7,  21,  66,  13,  25,  19,   0,  32,   6,   0],
       [  6,  28, 100,   7,  39,  13,   0,  31,   4,   0],
       [  4,  16,  95,  16,  34,  18,   0,  37,   7,   1],
       [  9,  24,  93,  18,  51,  17,   0,  34,   6,   0],
       [  8,  20,  93,  16,  42,  20,   0,  37,   7,   0],
       [  8,  18,  97,  10,  37,  25,   0,  32,   7,   0],
       [  6,  21,  92,  10,  47,  20,   0,  32,   3,   0],
       [  7,  14,  93,  16,  38,  24,   0,  26,   7,   0]])

In [11]:
# Build the imbalanced classification report, then print it so its line
# breaks and column alignment are rendered.
imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

       1985       0.02      0.01      0.97      0.01      0.09      0.01       111
       1990       0.09      0.11      0.91      0.10      0.32      0.09       150
       1995       0.08      0.35      0.59      0.13      0.46      0.20       189
       2000       0.06      0.03      0.94      0.04      0.17      0.03       228
       2005       0.10      0.15      0.83      0.12      0.35      0.12       228
       2010       0.10      0.07      0.91      0.08      0.25      0.06       252
       2011       0.00      0.00      1.00      0.00      0.00      0.00       243
       2012       0.11      0.14      0.86      0.12      0.34      0.11       234
       2013       0.06      0.01      0.97      0.02      0.11      0.01       231
       2014       0.00      0.00      1.00      0.00      0.00      0.00       225

avg / total       0.06      0.08      0.90      0.06      0.21      0.06      2091

