In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies 
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Week 1: Read in CSV File

In [3]:
# Source: https://www.kaggle.com/datasets/russellyates88/suicide-rates-overview-1985-to-2016?resource=download
# For our project, we agreed that a supervised learning model wold be the best first simply because
# we already have the data provided to us and our answer. In addition, we can look at comparisons 
# when it comes to suicide since we believe there are many factors we can look at in order to find a relationship.
columns = [
    "country","year","sex","age","suicides_no","population","suicides/100k_pop","country_year",
    "HDI_four_year","gdp_per_capita ($)","generation"
]

target = ["sex"]

In [4]:
# Load the data
file_path = ('master.csv')
df = pd.read_csv(file_path)

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df.reset_index(inplace=True, drop=True)

df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1995,male,25-34 years,13,232900,5.58,Albania1995,0.619,2424499009,835,Generation X
1,Albania,1995,male,55-74 years,9,178000,5.06,Albania1995,0.619,2424499009,835,Silent
2,Albania,1995,female,75+ years,2,40800,4.90,Albania1995,0.619,2424499009,835,G.I. Generation
3,Albania,1995,female,15-24 years,13,283500,4.59,Albania1995,0.619,2424499009,835,Generation X
4,Albania,1995,male,15-24 years,11,241200,4.56,Albania1995,0.619,2424499009,835,Generation X
...,...,...,...,...,...,...,...,...,...,...,...,...
8359,Uzbekistan,2014,female,35-54 years,107,3620833,2.96,Uzbekistan2014,0.675,63067077179,2309,Generation X
8360,Uzbekistan,2014,female,75+ years,9,348465,2.58,Uzbekistan2014,0.675,63067077179,2309,Silent
8361,Uzbekistan,2014,male,5-14 years,60,2762158,2.17,Uzbekistan2014,0.675,63067077179,2309,Generation Z
8362,Uzbekistan,2014,female,5-14 years,44,2631600,1.67,Uzbekistan2014,0.675,63067077179,2309,Generation Z


In [5]:
# Week 2: Preprocessing Data by first finding null values 
for column in df.columns:
    print(f"Columns {column} has {df[column].isnull().sum()} null values")

Columns country has 0 null values
Columns year has 0 null values
Columns sex has 0 null values
Columns age has 0 null values
Columns suicides_no has 0 null values
Columns population has 0 null values
Columns suicides/100k pop has 0 null values
Columns country-year has 0 null values
Columns HDI for year has 0 null values
Columns  gdp_for_year ($)  has 0 null values
Columns gdp_per_capita ($) has 0 null values
Columns generation has 0 null values


In [6]:
# Week 2: Preprocessing by find duplicate entries 
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 0


# Week 2 Changes: Filtering DataFrame to get Specific Countries

In [7]:
# Week 2: Selecting and filtering the DataFrame to create a new DataFrame 
# showing the a pair of first world countries (United States & United Kingdom)
first_world_df = df.loc[7932:8171]
first_world_df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
7932,United Kingdom,1985,male,75+ years,264,1202838,21.95,United Kingdom1985,0.753,489285164271,9231,G.I. Generation
7933,United Kingdom,1985,male,55-74 years,915,5170113,17.70,United Kingdom1985,0.753,489285164271,9231,G.I. Generation
7934,United Kingdom,1985,male,35-54 years,1208,6899879,17.51,United Kingdom1985,0.753,489285164271,9231,Silent
7935,United Kingdom,1985,male,25-34 years,620,3969689,15.62,United Kingdom1985,0.753,489285164271,9231,Boomers
7936,United Kingdom,1985,female,55-74 years,678,6002096,11.30,United Kingdom1985,0.753,489285164271,9231,G.I. Generation
...,...,...,...,...,...,...,...,...,...,...,...,...
8167,United States,2014,female,25-34 years,1347,21250636,6.34,United States2014,0.915,17427609000000,58531,Millenials
8168,United States,2014,female,15-24 years,990,21691057,4.56,United States2014,0.915,17427609000000,58531,Millenials
8169,United States,2014,female,75+ years,477,11616299,4.11,United States2014,0.915,17427609000000,58531,Silent
8170,United States,2014,male,5-14 years,277,21264881,1.30,United States2014,0.915,17427609000000,58531,Generation Z


# Split the Data into Training and Testing

In [8]:
# Week 2: Preprocessing by checking data types before starting
df.dtypes

country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
HDI for year          float64
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
dtype: object

In [9]:
# Listing columns that are a part of our selection
print(df.columns.tolist())

['country', 'year', 'sex', 'age', 'suicides_no', 'population', 'suicides/100k pop', 'country-year', 'HDI for year', ' gdp_for_year ($) ', 'gdp_per_capita ($)', 'generation']


In [10]:
# Week 2: Create X and y variables from the filtered df (first world countries)
# This comparison is made between two first world countries for better accuracy and consistency.
# Here we are checking for a correlation between suicide rates and gender.
X = first_world_df[["suicides_no"]]

# Create the target
y = first_world_df["sex"]

In [11]:
# Import train test split for Preprocess
# We train, test, and split the function before looking for relationships by first
# having our devices split the data into two sections (train and test). 
# Train: We train teh data so our device learns from the data itself.
# Test: Our device assess the performance in order to uncover trends.
# We do this for the 'X' and 'y' variables.
from sklearn.model_selection import train_test_split

# Having a random state of 1 ensures that the same rows are assigned sets of training and testing.
# If we had different numbers for random_state, then different rows would be distrubuted.
# When we use stratify, we divide the data equally.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(180, 1)

# Using BalancedRandomForestClassifier (First World Countries)

In [12]:
# Resample the training data with the BalancedRandomForestClassifier
# Using BalancedRandomForest, multiple trees can be used in order to provide 
# a stronger model with the help of weaker models 
from imblearn.ensemble import BalancedRandomForestClassifier
suicides_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)
suicides_model

BalancedRandomForestClassifier(random_state=1)

In [13]:
# Fit the model for the X and y training sets
suicides_model = suicides_model.fit(X_train, y_train)

In [14]:
# Here we create predictions and assemble the results into a Pandas DataFrame
# The 'predict' method is used to create predictions based on the X_test.
# Then the model is evaluated  on its performance by checking the accuracy score.
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7166666666666666

In [15]:
# The confusion_matrix here displayed the table of 
# true positives, false positives, true negatives & false negatives
confusion_matrix(y_test, y_pred)

array([[21,  9],
       [ 8, 22]])

In [16]:
# The classification report from sklearn's module's performs the task of getting metrics such as 
# precision, f1 score, and recall score.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.72      0.70      0.73      0.71      0.72      0.51        30
       male       0.71      0.73      0.70      0.72      0.72      0.52        30

avg / total       0.72      0.72      0.72      0.72      0.72      0.51        60



# Using SMOTEEN (First World Countries)

In [17]:
# Using SMOTEEN, we can perform both over and undersampling. 
# We can first oversample the minority class with SMOTE,
# and then we can use undersampling to clean the resulting data. This is done by
# dropping the two nearest neighbors of data points belonging to two different classes.
#(US & UK)
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
smote_eenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_eenn.fit_resample(X, y)

In [18]:
# Here we use LogisticRegression from the sklearn.library to represent the model
# The solver of 'lbfgs' is the default model. 
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [19]:
# Here we create predictions and assemble the results into a Pandas DataFrame
# The 'predict' method is used to create predictions based on the X_test.
# Then the model is evaluated  on its performance by checking the accuracy score.
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7833333333333333

In [20]:
# The confusion_matrix here displayed the table of 
# true positives, false positives, true negatives & false negatives
confusion_matrix(y_test, y_pred)

array([[27,  3],
       [10, 20]])

In [21]:
# The classification report from sklearn's module's performs the task of getting metrics such as 
# precision, f1 score, and recall score.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.73      0.90      0.67      0.81      0.77      0.61        30
       male       0.87      0.67      0.90      0.75      0.77      0.59        30

avg / total       0.80      0.78      0.78      0.78      0.77      0.60        60



# Using RandomOverSampler (First World Countries)

In [22]:
# Using RandomOverSampler (US & UK)
# With RandomOverSampler, the instances of the minority classes are randomly selected and 
# added to the training set until both classes are balanced 
from imblearn.over_sampling import RandomOverSampler
ros_suicides = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros_suicides.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'female': 90, 'male': 90})

In [23]:
# Use LogisticRegression and fit the model so the the performance can be evaluated 
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [24]:
# Predict the model using the X_test
# Calculated the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7333333333333334

In [25]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[28,  2],
       [14, 16]])

In [26]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.67      0.93      0.53      0.78      0.71      0.52        30
       male       0.89      0.53      0.93      0.67      0.71      0.48        30

avg / total       0.78      0.73      0.73      0.72      0.71      0.50        60



# First World Countries and Suicide Rates (Comparison between US and UK)

In [27]:
# Checking for a relationship betwwen rates and two first world countries (US & UK)
# Create features
X = first_world_df[["suicides_no"]]

# Create our target
y = first_world_df["country"]

In [28]:
# Import train, test, split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(180, 1)

# Using BalancedRandomForestClassifier

In [29]:
# Resample the training data with the BalancedRandomForestClassifier 
from imblearn.ensemble import BalancedRandomForestClassifier
suicides_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)
suicides_model

BalancedRandomForestClassifier(random_state=1)

In [30]:
# Fit the model for the X and y training sets
suicides_model = suicides_model.fit(X_train, y_train)

In [31]:
# Here we create predictions and assemble the results into a Pandas DataFrame
# The 'predict' method is used to create predictions based on the X_test.
# Then the model is evaluated  on its performance by checking the accuracy score.
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7166666666666666

In [32]:
# The confusion_matrix here displayed the table of 
# true positives, false positives, true negatives & false negatives
confusion_matrix(y_test, y_pred)

array([[22,  8],
       [ 9, 21]])

In [33]:
# The classification report from sklearn's module's performs the task of getting metrics such as 
# precision, f1 score, and recall score.
print(classification_report_imbalanced(y_test, y_pred))

                      pre       rec       spe        f1       geo       iba       sup

United Kingdom       0.71      0.73      0.70      0.72      0.72      0.52        30
 United States       0.72      0.70      0.73      0.71      0.72      0.51        30

   avg / total       0.72      0.72      0.72      0.72      0.72      0.51        60



# Using SMOTEEN (1st World Countries)

In [34]:
# Using SMOTEEN, we can perform both over and undersampling. 
# We can first oversample the minority class with SMOTE,
# and then we can use undersampling to clean the resulting data. This is done by
# dropping the two nearest neighbors of data points belonging to two different classes.
#(US & UK)
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
smote_eenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_eenn.fit_resample(X, y)

In [35]:
# Here we use LogisticRegression from the sklearn.library to represent the model
# The solver of 'lbfgs' is the default model. 
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [36]:
# Here we create predictions and assemble the results into a Pandas DataFrame
# The 'predict' method is used to create predictions based on the X_test.
# Then the model is evaluated  on its performance by checking the accuracy score.
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6833333333333333

In [37]:
# The confusion_matrix here displayed the table of 
# true positives, false positives, true negatives & false negatives
confusion_matrix(y_test, y_pred)

array([[25,  5],
       [14, 16]])

In [38]:
# The classification report from sklearn's module's performs the task of getting metrics such as 
# precision, f1 score, and recall score.
print(classification_report_imbalanced(y_test, y_pred))

                      pre       rec       spe        f1       geo       iba       sup

United Kingdom       0.64      0.83      0.53      0.72      0.67      0.46        30
 United States       0.76      0.53      0.83      0.63      0.67      0.43        30

   avg / total       0.70      0.68      0.68      0.68      0.67      0.44        60



# Using RandomOverSampler (1st World Countries)

In [39]:
# Using RandomOverSampler (US & UK)
# With RandomOverSampler, the instances of the minority classes are randomly selected and 
# added to the training set until both classes are balanced 
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
ros_suicides = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros_suicides.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'United Kingdom': 90, 'United States': 90})

In [40]:
# Use LogisticRegression and fit the model
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [41]:
# Predict the model using the X_test
# Calculated the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6833333333333333

In [42]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[24,  6],
       [13, 17]])

In [43]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                      pre       rec       spe        f1       geo       iba       sup

United Kingdom       0.65      0.80      0.57      0.72      0.67      0.46        30
 United States       0.74      0.57      0.80      0.64      0.67      0.44        30

   avg / total       0.69      0.68      0.68      0.68      0.67      0.45        60



# Comparison of Suicide Rates between 1st and 3rd world countries

In [44]:
# Week 2: Selecting and filtering the original DataFrame to create a new DataFrame 
# showing the a pair of third world countries (United States & Jamaica)
# This time we are looking for a trend between first and third world countries
third_world_df = df[(df['country'] =='United States') | (df['country'] == 'Jamaica')]
third_world_df

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
4044,Jamaica,1985,male,75+ years,1,29000,3.45,Jamaica1985,0.650,2100223150,1032,G.I. Generation
4045,Jamaica,1985,female,75+ years,1,38000,2.63,Jamaica1985,0.650,2100223150,1032,G.I. Generation
4046,Jamaica,1985,male,15-24 years,3,259000,1.16,Jamaica1985,0.650,2100223150,1032,Generation X
4047,Jamaica,1985,male,25-34 years,1,156000,0.64,Jamaica1985,0.650,2100223150,1032,Boomers
4048,Jamaica,1985,male,35-54 years,1,169000,0.59,Jamaica1985,0.650,2100223150,1032,Silent
...,...,...,...,...,...,...,...,...,...,...,...,...
8167,United States,2014,female,25-34 years,1347,21250636,6.34,United States2014,0.915,17427609000000,58531,Millenials
8168,United States,2014,female,15-24 years,990,21691057,4.56,United States2014,0.915,17427609000000,58531,Millenials
8169,United States,2014,female,75+ years,477,11616299,4.11,United States2014,0.915,17427609000000,58531,Silent
8170,United States,2014,male,5-14 years,277,21264881,1.30,United States2014,0.915,17427609000000,58531,Generation Z


In [45]:
# Week 2: Create X and y from the filtered df (US & Jamaica)
# We are checking for a similar comparison but with a first world and a third wor
X = third_world_df[["suicides_no"]]

# Create our target
y = third_world_df["sex"]

In [46]:
# Week 2: We import train, test, split
# random_state ad stritify are kepy the same to ensure equal rows are set to be trained and tested. 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(144, 1)

# Using RandomOverSampler (1st & 3rd World Countries)

In [47]:
# Using RandomOverSampler for updated DataFrame (US & Jamaica)
from imblearn.over_sampling import RandomOverSampler
ros_suicides = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros_suicides.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'male': 72, 'female': 72})

In [48]:
# Using the LogisticRegression method and fitting the data
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [49]:
# Use the predict function and calculated the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7291666666666666

In [50]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[22,  2],
       [11, 13]])

In [51]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.67      0.92      0.54      0.77      0.70      0.52        24
       male       0.87      0.54      0.92      0.67      0.70      0.48        24

avg / total       0.77      0.73      0.73      0.72      0.70      0.50        48



# Using Balanced RandomForestClassifier (1st and 3rd World Countries)

In [52]:
# Resample the training data with the BalancedRandomForestClassifier (US & Jamaica)
from imblearn.ensemble import BalancedRandomForestClassifier
suicides_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)
suicides_model

BalancedRandomForestClassifier(random_state=1)

In [53]:
# Fit the model
suicides_model = suicides_model.fit(X_train, y_train)

In [54]:
# We predict based on X_test and then calculate the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8541666666666667

In [55]:
# Displayed the confusion matrix
confusion_matrix(y_test, y_pred)

array([[21,  3],
       [ 4, 20]])

In [56]:
# Printed the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.84      0.88      0.83      0.86      0.85      0.73        24
       male       0.87      0.83      0.88      0.85      0.85      0.73        24

avg / total       0.85      0.85      0.85      0.85      0.85      0.73        48



# Using SMOTEEN (1st & 3rd World Countries)

In [57]:
# Using SMOTEEN for the new DataFrame (US & Jamaica)
smote_eenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_eenn.fit_resample(X, y)

In [58]:
# Like before, we use LogisticRegression and then fit the model
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [59]:
# We make predictions and then check the accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7291666666666666

In [60]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[22,  2],
       [11, 13]])

In [61]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.67      0.92      0.54      0.77      0.70      0.52        24
       male       0.87      0.54      0.92      0.67      0.70      0.48        24

avg / total       0.77      0.73      0.73      0.72      0.70      0.50        48



# Suicide Rates vs 1st and 3rd World Countries 

In [62]:
# Week 2: Create X and y from the filtered df (US & Jamaica)
# We are checking for a comparison but with a first world and a third world country
# What will be different this time, is that we will look at the suicides per 100k population
# because we want the machine to predict which country can be more prone to suicide within
# a "smaller" population.
X = third_world_df[["suicides/100k pop"]]

# Create our target
y = third_world_df["country"]

In [63]:
# Week 2: We import train, test, split
# random_state ad stritify are kepy the same to ensure equal rows are set to be trained and tested. 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(144, 1)

# Using RandomOversampler (US & Jamaica)

In [64]:
# Using RandomOverSampler for updated DataFrame (US & Jamaica)
from imblearn.over_sampling import RandomOverSampler
ros_suicides = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros_suicides.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'United States': 90, 'Jamaica': 90})

In [65]:
# Using the LogisticRegression method and fitting the data
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [66]:
# Use the predict function and calculated the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8277777777777777

In [67]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[16,  2],
       [ 7, 23]])

In [68]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                     pre       rec       spe        f1       geo       iba       sup

      Jamaica       0.70      0.89      0.77      0.78      0.83      0.69        18
United States       0.92      0.77      0.89      0.84      0.83      0.67        30

  avg / total       0.84      0.81      0.84      0.82      0.83      0.68        48



# Using BalancedRandomForestClassifier    (US & Jamaica)

In [69]:
# Resample the training data with the BalancedRandomForestClassifier (US & Jamaica)
from imblearn.ensemble import BalancedRandomForestClassifier
suicides_model = BalancedRandomForestClassifier(n_estimators = 100, random_state=1)
suicides_model

BalancedRandomForestClassifier(random_state=1)

In [70]:
# Fit the model
suicides_model = suicides_model.fit(X_train, y_train)

In [71]:
# We predict based on X_test and then calculate the balanced accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8444444444444444

In [72]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[16,  2],
       [ 6, 24]])

In [73]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                     pre       rec       spe        f1       geo       iba       sup

      Jamaica       0.73      0.89      0.80      0.80      0.84      0.72        18
United States       0.92      0.80      0.89      0.86      0.84      0.70        30

  avg / total       0.85      0.83      0.86      0.84      0.84      0.71        48



# Using SMOTEEN (US & Jamaica)

In [74]:
# Using SMOTEEN for the new DataFrame (US & Jamaica)
smote_eenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_eenn.fit_resample(X, y)

In [75]:
# Like before, we use LogisticRegression and then fit the model
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [76]:
# We make predictions and then check the accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8

In [77]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[15,  3],
       [ 7, 23]])

In [78]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                     pre       rec       spe        f1       geo       iba       sup

      Jamaica       0.68      0.83      0.77      0.75      0.80      0.64        18
United States       0.88      0.77      0.83      0.82      0.80      0.63        30

  avg / total       0.81      0.79      0.81      0.79      0.80      0.64        48



# "Bloopers" (Code with unusual accuracy)

In [79]:
# Despite the following code having low accuracy, 
# we believe it is important to understand 
# why some scores can be very low (or even very high) 
# so we can edit and make our code better.

# Code with a low accuracy

In [80]:
# Here we are creating a new comparison between a first and third world country 
# regarding suicide numbers
X = df[["suicides_no"]]

y = df["sex"]

In [81]:
# Import train, test, split for Preprocess like before
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(6273, 1)

In [82]:
# Using SMOTEEN for the new DataFrame (US & Jamaica)
smote_eenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_eenn.fit_resample(X, y)

In [83]:
# Like before, we use LogisticRegression and then fit the model
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [84]:
# We make predictions and then check the accuracy score
y_pred = suicides_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5921807386535172

In [85]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[880, 165],
       [688, 358]])

In [86]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

     female       0.56      0.84      0.34      0.67      0.54      0.30      1045
       male       0.68      0.34      0.84      0.46      0.54      0.27      1046

avg / total       0.62      0.59      0.59      0.56      0.54      0.29      2091



In [87]:
# Looking at the results above we can see that the score achieved is very low (almost 60%) and this is because the 
# model is making predictions that come from too many factors which can skew the predictions to be more incorrect.
# We believe this can be from other columns in the dataset making interfering with the predictions. 
# These other factors can make the model think an individual is one gender (for example, predicting someone  
# is female) and the model ends up being incorrect (when it is actually male). When looking at the confusion 
# matrix, we can see that the majority of incorrect predictions were False Positives (with a minority of False
# Negatives) which means the model incorrectly categorized an individual with the wrong gender.


# Code with high accuracy

In [88]:
# Comparison between US & Jamaica between suicide rates
X = third_world_df[["suicides_no"]]

# Create our target
y = third_world_df["country"]

In [89]:
# Import train, test, split like before
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(144, 1)

In [90]:
# In this Blooper we will use RandomOverSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
ros_suicides = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros_suicides.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'United States': 90, 'Jamaica': 90})

In [91]:
# Fit the model
suicides_model = LogisticRegression(solver='lbfgs', random_state=1)
suicides_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [92]:
# Predict based on X_test
y_pred = suicides_model.predict(X_test)

In [93]:
# Check balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

1.0

In [94]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[18,  0],
       [ 0, 30]])

In [95]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                     pre       rec       spe        f1       geo       iba       sup

      Jamaica       1.00      1.00      1.00      1.00      1.00      1.00        18
United States       1.00      1.00      1.00      1.00      1.00      1.00        30

  avg / total       1.00      1.00      1.00      1.00      1.00      1.00        48



In [96]:
# For this peice of code, we can see that we got an abnormally high score of 1.0 (100% accuracy). 
# This score may seem perfect but a score that high is the result of overfitting. Overfitting 
# happens when the model catching inaccurate values that are in the dataset. These factors reduce efficiency
# as well as the accuracy of the model. In this case the score was inflated upwards. This
# causes high variance in which the model is trying to learn everything it can This overfitted model treats 
# the extra "noise" in the dataset as values it can incorporate in the prediction.

# Benefits and Limitations of a Supervised Learning Model

In [97]:
# Benefits: When using a supervsed learning model, the process of using the algorithms was pretty strightforward. We
# can see how the machine works in order to get the data and asnwers we need when it comes to classifying relationships 
# found in the dataset. Also, a lot of the algorithms used had they same type of flow when it comes to typing out the code which 
# makes it simple to get things such as the accuracy score as well as an imblaanced classification report.



# Limitations: When working with thise code, there were some limitations to the model we used. One of the limitations
# is that we are unable to make any sort of predictions for future algorithms with the code itself. This was
# mainly because of the dataset itself in which it only goes up to 2016 (and we are currently in 2022). This limiation is also made very obvious when
# we have to try our best to make future predictions based on the data from six years ago. Another limitation is that with
# supervised learning, we need to use good examples that can provide a good enough relationship or else the accuracy
# we get can be very low. This is made especially difficult when the amount of data being used is very large and 
# there are a lot of unique identifiers that can lower the score on its own. 