In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Week 1: Creating a mock-up of the model
# Import dependencies 
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

Read in CSV File

In [3]:
# Week 1: Loading in data
# Source: https://www.kaggle.com/datasets/russellyates88/suicide-rates-overview-1985-to-2016?resource=download
columns = [
    "country","year","sex","age","suicides_no","population","suicides/100k_pop","country_year",
    "HDI_four_year","gdp_per_capita ($)","generation"
]

target = ["generation"]

In [4]:
# Week 2: Fine tune the model. Beginning Preprocessing
# Week 1: Load the data
file_path = ('master.csv')
df = pd.read_csv(file_path)

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df.reset_index(inplace=True, drop=True)

df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1995,male,25-34 years,13,232900,5.58,Albania1995,0.619,2424499009,835,Generation X
1,Albania,1995,male,55-74 years,9,178000,5.06,Albania1995,0.619,2424499009,835,Silent
2,Albania,1995,female,75+ years,2,40800,4.90,Albania1995,0.619,2424499009,835,G.I. Generation
3,Albania,1995,female,15-24 years,13,283500,4.59,Albania1995,0.619,2424499009,835,Generation X
4,Albania,1995,male,15-24 years,11,241200,4.56,Albania1995,0.619,2424499009,835,Generation X
...,...,...,...,...,...,...,...,...,...,...,...,...
8359,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,Uzbekistan2014,0.675,63067077179,2309,Generation X
8360,Uzbekistan,2014,female,75+ years,9,348465,2.58,Uzbekistan2014,0.675,63067077179,2309,Silent
8361,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,Uzbekistan2014,0.675,63067077179,2309,Generation Z
8362,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,Uzbekistan2014,0.675,63067077179,2309,Generation Z


In [5]:
# Week 2: Preprocessing by finding null values 
for column in df.columns:
    print(f"Columns {column} has {df[column].isnull().sum()} null values")

Columns country has 0 null values
Columns year has 0 null values
Columns sex has 0 null values
Columns age has 0 null values
Columns suicides_no has 0 null values
Columns population has 0 null values
Columns suicides/100k pop has 0 null values
Columns country-year has 0 null values
Columns HDI for year has 0 null values
Columns  gdp_for_year ($)  has 0 null values
Columns gdp_per_capita ($) has 0 null values
Columns generation has 0 null values


In [6]:
# Week 2: Preprocessing by find duplicate entries 
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 0


# Week 2 Changes: Filtering DataFrame to get Specific Countries

In [7]:
# Week 2: Filter DataFrame to show first world countries 
first_world_countries = ['United States', 'United Kingdom']
df[df["country"].isin(first_world_countries)]

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
7932,United Kingdom,1985,male,75+ years,264,1202838,21.95,United Kingdom1985,0.753,489285164271,9231,G.I. Generation
7933,United Kingdom,1985,male,55-74 years,915,5170113,17.70,United Kingdom1985,0.753,489285164271,9231,G.I. Generation
7934,United Kingdom,1985,male,35-54 years,1208,6899879,17.51,United Kingdom1985,0.753,489285164271,9231,Silent
7935,United Kingdom,1985,male,25-34 years,620,3969689,15.62,United Kingdom1985,0.753,489285164271,9231,Boomers
7936,United Kingdom,1985,female,55-74 years,678,6002096,11.30,United Kingdom1985,0.753,489285164271,9231,G.I. Generation
...,...,...,...,...,...,...,...,...,...,...,...,...
8167,United States,2014,female,25-34 years,1347,21250636,6.34,United States2014,0.915,17427609000000,58531,Millenials
8168,United States,2014,female,15-24 years,990,21691057,4.56,United States2014,0.915,17427609000000,58531,Millenials
8169,United States,2014,female,75+ years,477,11616299,4.11,United States2014,0.915,17427609000000,58531,Silent
8170,United States,2014,male,5-14 years,277,21264881,1.30,United States2014,0.915,17427609000000,58531,Generation Z


# Split the Data into Training and Testing

In [8]:
# Week 1: Checking data types
df.dtypes

country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
HDI for year          float64
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
dtype: object

In [9]:
# Week 1: Checking Columns
print(df.columns.tolist())

['country', 'year', 'sex', 'age', 'suicides_no', 'population', 'suicides/100k pop', 'country-year', 'HDI for year', ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation']


In [10]:
# Week 2: Create X and y from the filtered df (1st world countries)
X = df[["suicides_no"]]

# Create our target
y = df["sex"]

In [11]:
# Import train, test, split for Preprocess
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(6273, 1)

# Week 1: Attempt Using Easy Ensemble AdaBoost Classifier

In [12]:
# Resample the training data with the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
suicides_model = EasyEnsembleClassifier(n_estimators = 100, random_state=1)
suicides_model

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [13]:
# Fit the model
suicides_model = suicides_model.fit(X_train, y_train)
y_pred = suicides_model.predict(X_test)

In [14]:
# Check balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6050408482530854

In [15]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[779, 266],
       [560, 486]])

In [16]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.58      0.75      0.46      0.65      0.59      0.36      1045
       male       0.65      0.46      0.75      0.54      0.59      0.34      1046

avg / total       0.61      0.60      0.61      0.60      0.59      0.35      2091



# Week 1: Attempt using BalancedRandomForestClassifier

In [17]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
suicides_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)
suicides_model

BalancedRandomForestClassifier(random_state=1)

In [18]:
# Fit the model
suicides_model = suicides_model.fit(X_train, y_train)

In [19]:
# Calculated the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5639419250368229

In [20]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[801, 244],
       [668, 378]])

In [21]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.55      0.77      0.36      0.64      0.53      0.29      1045
       male       0.61      0.36      0.77      0.45      0.53      0.27      1046

avg / total       0.58      0.56      0.56      0.55      0.53      0.28      2091



# Week 1: Attempting Combination of Over and UnderSampling the Data using SMOTEEN

In [22]:
# Using SMOTEEN
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
smote_eenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_eenn.fit_resample(X, y)

In [23]:
#suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

BalancedRandomForestClassifier(random_state=1)

In [24]:
#y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5639419250368229

In [25]:
# Confusion matrix
confusion_matrix(y_test, y_pred)

array([[801, 244],
       [668, 378]])

In [26]:
# Classification Report 
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.55      0.77      0.36      0.64      0.53      0.29      1045
       male       0.61      0.36      0.77      0.45      0.53      0.27      1046

avg / total       0.58      0.56      0.56      0.55      0.53      0.28      2091



# Week 1: Attempt OverSampling with RandomOverSampler

In [27]:
# Using RandomOverSampler
from imblearn.over_sampling import RandomOverSampler
ros_suicides = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros_suicides.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'male': 3137, 'female': 3137})

In [28]:
# Train the Logistic Regression model using the resampled data
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [29]:
# Calculated the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5898149249361889

In [30]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[933, 112],
       [746, 300]])

In [31]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.56      0.89      0.29      0.69      0.51      0.27      1045
       male       0.73      0.29      0.89      0.41      0.51      0.24      1046

avg / total       0.64      0.59      0.59      0.55      0.51      0.26      2091



# Week 2: Comparison of rates between 1st and 3rd world countries

In [32]:
# Week 2: Filter DataFrame to show first world countries 
third_world_countries = ['United States', 'Jamaica']
df[df["country"].isin(third_world_countries)]

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
4044,Jamaica,1985,male,75+ years,1,29000,3.45,Jamaica1985,0.650,2100223150,1032,G.I. Generation
4045,Jamaica,1985,female,75+ years,1,38000,2.63,Jamaica1985,0.650,2100223150,1032,G.I. Generation
4046,Jamaica,1985,male,15-24 years,3,259000,1.16,Jamaica1985,0.650,2100223150,1032,Generation X
4047,Jamaica,1985,male,25-34 years,1,156000,0.64,Jamaica1985,0.650,2100223150,1032,Boomers
4048,Jamaica,1985,male,35-54 years,1,169000,0.59,Jamaica1985,0.650,2100223150,1032,Silent
...,...,...,...,...,...,...,...,...,...,...,...,...
8167,United States,2014,female,25-34 years,1347,21250636,6.34,United States2014,0.915,17427609000000,58531,Millenials
8168,United States,2014,female,15-24 years,990,21691057,4.56,United States2014,0.915,17427609000000,58531,Millenials
8169,United States,2014,female,75+ years,477,11616299,4.11,United States2014,0.915,17427609000000,58531,Silent
8170,United States,2014,male,5-14 years,277,21264881,1.30,United States2014,0.915,17427609000000,58531,Generation Z


In [33]:
# Week 2: Create X and y from the filtered df (1st and 3rd world countries)
X = df[["suicides_no"]]

# Create our target
y = df["sex"]

In [34]:
# Week 2: import train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(6273, 1)

# Easy Ensemble Classifier

In [35]:
# Week 2: Using Easy Ensemble Classifier for Comaprison
suicide_model = EasyEnsembleClassifier(n_estimators = 100, random_state=1)
suicide_model

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [36]:
# Fit the model
suicide_model = suicide_model.fit(X_train, y_train)
y_pred = suicide_model.predict(X_test)

In [37]:
# Check balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6050408482530854

In [38]:
# Print Confusion matrix
confusion_matrix(y_test, y_pred)

array([[779, 266],
       [560, 486]])

In [39]:
# Print classification report 
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.58      0.75      0.46      0.65      0.59      0.36      1045
       male       0.65      0.46      0.75      0.54      0.59      0.34      1046

avg / total       0.61      0.60      0.61      0.60      0.59      0.35      2091

