In [None]:
# @title Import datasheet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the experiment dataset (the CSV is expected in the working directory)
# and preview its first rows.
df = pd.read_csv("suicide_experiment.csv")
df.head()

Unnamed: 0,year,sex,age,suicides/100k pop,HDI for year,gdp_per_capita ($),generation
0,1987,male,15-24 years,6.71,0.793666,796.0,Generation X
1,1987,male,35-54 years,5.19,0.793666,796.0,Silent
2,1987,female,15-24 years,4.83,0.793666,796.0,Generation X
3,1987,male,75+ years,4.59,0.793666,796.0,G.I. Generation
4,1987,male,25-34 years,3.28,0.793666,796.0,Boomers


In [None]:
# @title Discretize dataset 'suicides/100k pop' column into 'Suicide Risk Level (High / Low)' based on the median value
# Binarise the continuous rate into the classification target: rows strictly
# above the dataset-wide median become 'High'; everything else (including
# values equal to the median) becomes 'Low'.
median_suicides = df['suicides/100k pop'].median()

# Vectorised comparison over the whole column instead of a per-row Python
# lambda through .apply(); the resulting labels are the same.
df['Suicide Risk Level'] = np.where(df['suicides/100k pop'] > median_suicides, 'High', 'Low')

print(f"Median 'suicides/100k pop': {median_suicides:.2f}")
display(df[['suicides/100k pop', 'Suicide Risk Level']].head())
display(df['Suicide Risk Level'].value_counts())
df.head()

Median 'suicides/100k pop': 4.29


Unnamed: 0,suicides/100k pop,Suicide Risk Level
0,6.71,High
1,5.19,High
2,4.83,High
3,4.59,High
4,3.28,Low


Unnamed: 0_level_0,count
Suicide Risk Level,Unnamed: 1_level_1
High,15878
Low,15878


Unnamed: 0,year,sex,age,suicides/100k pop,HDI for year,gdp_per_capita ($),generation,Suicide Risk Level
0,1987,male,15-24 years,6.71,0.793666,796.0,Generation X,High
1,1987,male,35-54 years,5.19,0.793666,796.0,Silent,High
2,1987,female,15-24 years,4.83,0.793666,796.0,Generation X,High
3,1987,male,75+ years,4.59,0.793666,796.0,G.I. Generation,High
4,1987,male,25-34 years,3.28,0.793666,796.0,Boomers,Low


In [None]:

# @title  Drop suicide 100k/pop
# Remove the raw rate column that 'Suicide Risk Level' was derived from.
# `drop` returns a new frame, so `df` itself is left untouched.
df_drop = df.drop(columns=['suicides/100k pop'])
df_drop.head()

Unnamed: 0,year,sex,age,HDI for year,gdp_per_capita ($),generation,Suicide Risk Level
0,1987,male,15-24 years,0.793666,796.0,Generation X,High
1,1987,male,35-54 years,0.793666,796.0,Silent,High
2,1987,female,15-24 years,0.793666,796.0,Generation X,High
3,1987,male,75+ years,0.793666,796.0,G.I. Generation,High
4,1987,male,25-34 years,0.793666,796.0,Boomers,Low


In [None]:
# @title Drop Sex, Age, Generation, Year
# Drop the unwanted demographic/time columns so that only 'HDI for year',
# 'gdp_per_capita ($)' and the target 'Suicide Risk Level' remain.
cols_to_drop = ["year", "age", "sex", "generation"]
df_exp = df_drop.drop(cols_to_drop, axis=1)

df_exp.head()




Unnamed: 0,HDI for year,gdp_per_capita ($),Suicide Risk Level
0,0.793666,796.0,High
1,0.793666,796.0,High
2,0.793666,796.0,High
3,0.793666,796.0,High
4,0.793666,796.0,Low


In [None]:
# @title Experiment




*   Dataset split into 80-20 and 70-30 train-test
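*   Features are standardized with StandardScaler from sklearn.preprocessing (fitted on the training split only); the tree-based models are trained on the unscaled features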

* Decision Tree, Random Forest, Logistic Regression, Naive Bayes, KNN, SVM, XGBoost and MLPClassifier (ANN) algorithms are used.

* The experiment will be evaluated using Accuracy and F1-Score



In [None]:
# @title Split 80-20



In [None]:
# Separate the predictors from the label.
y = df_exp['Suicide Risk Level']  # Target
X = df_exp.drop(columns='Suicide Risk Level')  # Features

# Stratified 80/20 hold-out split; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Report the shape of every partition.
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split_part.shape}")

X_train shape: (25404, 2)
X_test shape: (6352, 2)
y_train shape: (25404,)
y_test shape: (6352,)


In [None]:
# Mount Google Drive when running inside Colab. The import is guarded so the
# notebook can still be run top-to-bottom in a plain Jupyter environment,
# where the `google.colab` package is not installed.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

In [None]:
# @title Scale 80-20 Train-Test
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to the test split, so test-set statistics never influence
# the scaling parameters.
s_scaler = StandardScaler()
X_train_scaled = s_scaler.fit_transform(X_train)
X_test_scaled = s_scaler.transform(X_test)

print("X_train scaled shape:", X_train_scaled.shape)
print("X_test scaled shape:", X_test_scaled.shape)

# Convert scaled arrays back to DataFrame. The original row index is carried
# over (instead of a fresh 0..n-1 index) so the scaled frames stay
# label-aligned with X_train / y_train and X_test / y_test.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)

print("\nFirst 5 rows of scaled X_train_df:")
display(X_train_scaled_df.head())

print("\nFirst 5 rows of scaled X_test_df:")
display(X_test_scaled_df.head())

X_train scaled shape: (25404, 2)
X_test scaled shape: (6352, 2)

First 5 rows of scaled X_train_df:


Unnamed: 0,HDI for year,gdp_per_capita ($)
0,0.001471,0.848167
1,0.001471,-0.493375
2,0.001471,-0.693735
3,0.001471,-0.590358
4,-0.911596,-0.36296



First 5 rows of scaled X_test_df:


Unnamed: 0,HDI for year,gdp_per_capita ($)
0,0.001471,-0.306418
1,0.001471,-0.796242
2,2.484277,2.507513
3,-2.289432,-0.81839
4,0.001471,-0.791587


In [None]:
# @title Train Model for 80-20



Algorithms used for classification:


* Decision Tree
* Random Forest
* Logistic Regression
* Naive Bayes
* KNN
* SVM
* XGBoost
* MLPClassifier (ANN)







In [None]:
# @title Train Model for 80-20

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Encode the string labels as integers; the encoder fitted on the training
# labels is reused to transform the test labels.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

print("Encoded classes:", le.classes_)

# ----------------------------------------------------------
# Tree-based models: fitted on the raw (unscaled) features
# ----------------------------------------------------------
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train_enc)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train_enc)
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss').fit(
    X_train, y_train_enc
)

print("Tree-based models trained (unscaled)")

# ----------------------------------------------------------------
# Scale-sensitive models: fitted on the standardized features
# ----------------------------------------------------------------
lr_model = LogisticRegression(
    random_state=42, solver='liblinear', max_iter=1000
).fit(X_train_scaled_df, y_train_enc)

nb_model = GaussianNB().fit(X_train_scaled_df, y_train_enc)

knn_model = KNeighborsClassifier().fit(X_train_scaled_df, y_train_enc)

# NOTE(review): probability=True is kept as-is; the 80-20 evaluation cell only
# calls .predict() - confirm whether probability estimates are needed.
svm_model = SVC(random_state=42, probability=True).fit(
    X_train_scaled_df, y_train_enc
)

ann_model = MLPClassifier(random_state=42, max_iter=500).fit(
    X_train_scaled_df, y_train_enc
)

print("Scale-sensitive models trained (scaled)")


Encoded classes: ['High' 'Low']
Tree-based models trained (unscaled)
Scale-sensitive models trained (scaled)


In [None]:
# @title Model Evaluation (Accuracy & F1-Score)

from sklearn.metrics import accuracy_score, f1_score

# Pair every fitted model with the test matrix in the representation it was
# trained on: raw features for the tree-based models, standardized features
# for the scale-sensitive ones.
models = {
    "Decision Tree": (dt_model, X_test),
    "Random Forest": (rf_model, X_test),
    "XGBoost": (xgb_model, X_test),
    "Logistic Regression": (lr_model, X_test_scaled_df),
    "Naive Bayes": (nb_model, X_test_scaled_df),
    "KNN": (knn_model, X_test_scaled_df),
    "SVM": (svm_model, X_test_scaled_df),
    "ANN (MLP)": (ann_model, X_test_scaled_df),
}

# LabelEncoder codes each class by its position in `le.classes_`
# (['High', 'Low'] -> High=0, Low=1). f1_score(average='binary') scores
# pos_label, which defaults to 1, i.e. the 'Low' class. Pass the code of
# 'High' explicitly so the F1-Score describes the high-risk class.
high_label = list(le.classes_).index("High")

results = []

for name, (model, X_eval) in models.items():
    y_pred = model.predict(X_eval)

    acc = accuracy_score(y_test_enc, y_pred)
    f1 = f1_score(y_test_enc, y_pred, average='binary', pos_label=high_label)

    results.append({
        "Model": name,
        "Accuracy": acc,
        "F1-Score": f1
    })

# One row per model, best F1-Score first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1-Score", ascending=False)

results_df



Unnamed: 0,Model,Accuracy,F1-Score
1,Random Forest,0.718986,0.699039
0,Decision Tree,0.721505,0.693255
5,KNN,0.702928,0.685448
2,XGBoost,0.65932,0.613019
3,Logistic Regression,0.536839,0.575591
7,ANN (MLP),0.584698,0.528086
6,SVM,0.578558,0.428846
4,Naive Bayes,0.545025,0.378762


In [None]:
# @title Split 70-30

# Second hold-out configuration: stratified 70/30 split of the same X / y,
# using the same seed as the 80-20 split.
X_train_70, X_test_30, y_train_70, y_test_30 = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Report the shape of every partition.
for split_name, split_part in (
    ("X_train_70", X_train_70),
    ("X_test_30", X_test_30),
    ("y_train_70", y_train_70),
    ("y_test_30", y_test_30),
):
    print(f"{split_name} shape:", split_part.shape)


X_train_70 shape: (22229, 2)
X_test_30 shape: (9527, 2)
y_train_70 shape: (22229,)
y_test_30 shape: (9527,)


In [None]:
# @title Scale 70-30 Train-Test

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the 70% training split only, then apply the same
# transformation to the 30% test split, so test-set statistics never
# influence the scaling parameters.
s_scaler_70 = StandardScaler()
X_train_70_scaled = s_scaler_70.fit_transform(X_train_70)
X_test_30_scaled = s_scaler_70.transform(X_test_30)

print("X_train_70 scaled shape:", X_train_70_scaled.shape)
print("X_test_30 scaled shape:", X_test_30_scaled.shape)

# Convert scaled arrays back to DataFrame. The original row index is carried
# over (instead of a fresh 0..n-1 index) so the scaled frames stay
# label-aligned with X_train_70 / y_train_70 and X_test_30 / y_test_30.
X_train_70_scaled_df = pd.DataFrame(
    X_train_70_scaled, columns=X_train_70.columns, index=X_train_70.index
)
X_test_30_scaled_df = pd.DataFrame(
    X_test_30_scaled, columns=X_test_30.columns, index=X_test_30.index
)

print("\nFirst 5 rows of scaled X_train_70_df:")
display(X_train_70_scaled_df.head())

print("\nFirst 5 rows of scaled X_test_30_df:")
display(X_test_30_scaled_df.head())



X_train_70 scaled shape: (22229, 2)
X_test_30 scaled shape: (9527, 2)

First 5 rows of scaled X_train_70_df:


Unnamed: 0,HDI for year,gdp_per_capita ($)
0,0.001101,-0.099835
1,0.001101,-0.868811
2,0.001101,-0.7741
3,-0.174313,0.649336
4,0.001101,-0.02893



First 5 rows of scaled X_test_30_df:


Unnamed: 0,HDI for year,gdp_per_capita ($)
0,2.069183,1.587417
1,0.001101,-0.441944
2,0.001101,1.501992
3,0.001101,1.582389
4,0.222203,-0.126647


In [None]:
# @title Train Model for 70-30

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Encode the string labels as integers; the encoder fitted on the 70% training
# labels is reused to transform the 30% test labels.
le_70 = LabelEncoder()
y_train_70_enc = le_70.fit_transform(y_train_70)
y_test_30_enc = le_70.transform(y_test_30)

print("Encoded classes:", le_70.classes_)

# ----------------------------------------------------------
# Tree-based models: fitted on the raw (unscaled) features
# ----------------------------------------------------------
dt_model_70 = DecisionTreeClassifier(random_state=42).fit(
    X_train_70, y_train_70_enc
)
rf_model_70 = RandomForestClassifier(random_state=42).fit(
    X_train_70, y_train_70_enc
)
xgb_model_70 = xgb.XGBClassifier(random_state=42, eval_metric='logloss').fit(
    X_train_70, y_train_70_enc
)

print("Tree-based models trained (70-30, unscaled)")

# ----------------------------------------------------------------
# Scale-sensitive models: fitted on the standardized features
# ----------------------------------------------------------------
lr_model_70 = LogisticRegression(
    random_state=42, solver='liblinear', max_iter=1000
).fit(X_train_70_scaled_df, y_train_70_enc)

nb_model_70 = GaussianNB().fit(X_train_70_scaled_df, y_train_70_enc)

knn_model_70 = KNeighborsClassifier().fit(X_train_70_scaled_df, y_train_70_enc)

# NOTE(review): probability=True is kept as-is; the 70-30 evaluation cell only
# calls .predict() - confirm whether probability estimates are needed.
svm_model_70 = SVC(random_state=42, probability=True).fit(
    X_train_70_scaled_df, y_train_70_enc
)

ann_model_70 = MLPClassifier(random_state=42, max_iter=500).fit(
    X_train_70_scaled_df, y_train_70_enc
)

print("Scale-sensitive models trained (70-30, scaled)")



Encoded classes: ['High' 'Low']
Tree-based models trained (70-30, unscaled)
Scale-sensitive models trained (70-30, scaled)


In [None]:
# @title Model Evaluation (Accuracy & F1-Score) - 70-30

from sklearn.metrics import accuracy_score, f1_score

# Pair every fitted 70-30 model with the test matrix in the representation it
# was trained on: raw features for the tree-based models, standardized
# features for the scale-sensitive ones.
models_70_30 = {
    "Decision Tree": (dt_model_70, X_test_30),
    "Random Forest": (rf_model_70, X_test_30),
    "XGBoost": (xgb_model_70, X_test_30),
    "Logistic Regression": (lr_model_70, X_test_30_scaled_df),
    "Naive Bayes": (nb_model_70, X_test_30_scaled_df),
    "KNN": (knn_model_70, X_test_30_scaled_df),
    "SVM": (svm_model_70, X_test_30_scaled_df),
    "ANN (MLP)": (ann_model_70, X_test_30_scaled_df),
}

# LabelEncoder codes each class by its position in `le_70.classes_`
# (['High', 'Low'] -> High=0, Low=1). f1_score(average="binary") scores
# pos_label, which defaults to 1, i.e. the 'Low' class. Pass the code of
# 'High' explicitly so the F1-Score describes the high-risk class.
high_label_70 = list(le_70.classes_).index("High")

results_70_30 = []

for name, (model, X_eval) in models_70_30.items():
    y_pred = model.predict(X_eval)

    acc = accuracy_score(y_test_30_enc, y_pred)
    f1 = f1_score(y_test_30_enc, y_pred, average="binary", pos_label=high_label_70)

    results_70_30.append({
        "Model": name,
        "Accuracy": acc,
        "F1-Score": f1
    })

# One row per model, best F1-Score first.
results_70_30_df = pd.DataFrame(results_70_30)
results_70_30_df = results_70_30_df.sort_values(by="F1-Score", ascending=False)

results_70_30_df


Unnamed: 0,Model,Accuracy,F1-Score
1,Random Forest,0.714706,0.695018
0,Decision Tree,0.714076,0.683624
5,KNN,0.700745,0.682411
2,XGBoost,0.650677,0.608747
3,Logistic Regression,0.53847,0.576764
7,ANN (MLP),0.584759,0.549943
6,SVM,0.573213,0.421292
4,Naive Bayes,0.542353,0.375358
