In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import joblib
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../Input/preprocessed.csv', usecols=['processed_text', 'class'])
print("Dataset shape:", df.shape)
print("Shape of Suicide dataset:", df[df['class'] == 'suicide'].shape)
print("Shape of Non Suicide dataset:", df[df['class'] == 'non-suicide'].shape)

Dataset shape: (232074, 2)
Shape of Suicide dataset: (116037, 2)
Shape of Non Suicide dataset: (116037, 2)


In [3]:
df.dropna(subset=['processed_text'], inplace=True)
print("After dropping null values from processed_text:")
print("Dataset shape:", df.shape)
print("Shape of Suicide dataset:", df[df['class'] == 'suicide'].shape)
print("Shape of Non Suicide dataset:", df[df['class'] == 'non-suicide'].shape)

df.dropna(subset=['class'], inplace=True)
print("\nAfter dropping null values from class:")
print("Dataset shape:", df.shape)
print("Shape of Suicide dataset:", df[df['class'] == 'suicide'].shape)
print("Shape of Non Suicide dataset:", df[df['class'] == 'non-suicide'].shape)

After dropping null values from processed_text:
Dataset shape: (232030, 2)
Shape of Suicide dataset: (116025, 2)
Shape of Non Suicide dataset: (116005, 2)

After dropping null values from class:
Dataset shape: (232030, 2)
Shape of Suicide dataset: (116025, 2)
Shape of Non Suicide dataset: (116005, 2)


In [4]:
X = df['processed_text']
y = df['class']

In [5]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [6]:
# joblib.dump(vectorizer, '../Models/vectorizer.pkl')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
outputs_dict = {}

In [81]:
# Logistic Regression
print("Starting Logistic Regression")
lr = LogisticRegression()
lr_param_grid = {
    'C': [0.01, 0.1, 1, 1],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': list(range(100, 1001, 100)),
}
grid_lr = GridSearchCV(lr, lr_param_grid, cv=5, verbose=3)
grid_lr.fit(X_train, y_train)
best_param_lr = grid_lr.best_params_
best_score_lr = grid_lr.best_score_
best_model_lr = grid_lr.best_estimator_
y_pred_lr = best_model_lr.predict(X_test)
outputs_dict['Logistic Regression'] = {
    'best_params': best_param_lr,
    'best_score': best_score_lr,
    'classification_report': classification_report(y_test, y_pred_lr, output_dict=True)
}
print("Completed Logistic Regression")
joblib.dump(best_model_lr, '../Models/logistic_regression_model.pkl')
print("Saved Logistic Regression model")

Starting Logistic Regression
Fitting 5 folds for each of 160 candidates, totalling 800 fits
[CV 1/5] END C=0.01, max_iter=100, penalty=l1, solver=liblinear;, score=0.867 total time=   2.1s
[CV 2/5] END C=0.01, max_iter=100, penalty=l1, solver=liblinear;, score=0.872 total time=   1.7s
[CV 3/5] END C=0.01, max_iter=100, penalty=l1, solver=liblinear;, score=0.871 total time=   1.3s
[CV 4/5] END C=0.01, max_iter=100, penalty=l1, solver=liblinear;, score=0.868 total time=   1.5s
[CV 5/5] END C=0.01, max_iter=100, penalty=l1, solver=liblinear;, score=0.868 total time=   1.3s
[CV 1/5] END C=0.01, max_iter=100, penalty=l1, solver=saga;, score=0.867 total time=   4.5s
[CV 2/5] END C=0.01, max_iter=100, penalty=l1, solver=saga;, score=0.871 total time=   4.5s
[CV 3/5] END C=0.01, max_iter=100, penalty=l1, solver=saga;, score=0.871 total time=   4.6s
[CV 4/5] END C=0.01, max_iter=100, penalty=l1, solver=saga;, score=0.868 total time=   4.5s
[CV 5/5] END C=0.01, max_iter=100, penalty=l1, solver=s

In [10]:
# Multinomial Naive Bayes
print("Starting Multinomial Naive Bayes")
mnb = MultinomialNB()
mnb_param_grid = {
    'alpha': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100],
}
grid_mnb = GridSearchCV(mnb, mnb_param_grid, cv=5, verbose=3)
grid_mnb.fit(X_train, y_train)
best_param_mnb = grid_mnb.best_params_
best_score_mnb = grid_mnb.best_score_
best_model_mnb = grid_mnb.best_estimator_
y_pred_mnb = best_model_mnb.predict(X_test)
outputs_dict['Multinomial Naive Bayes'] = {
    'best_params': best_param_mnb,
    'best_score': best_score_mnb,
    'classification_report': classification_report(y_test, y_pred_mnb, output_dict=True)
}
print("Completed Multinomial Naive Bayes")
joblib.dump(best_model_mnb, '../Models/multinomial_naive_bayes_model.pkl')
print("Saved Multinomial Naive Bayes model")

Starting Multinomial Naive Bayes
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ........................alpha=0.01;, score=0.902 total time=   0.4s
[CV 2/5] END ........................alpha=0.01;, score=0.902 total time=   0.5s
[CV 3/5] END ........................alpha=0.01;, score=0.900 total time=   0.4s
[CV 4/5] END ........................alpha=0.01;, score=0.900 total time=   0.4s
[CV 5/5] END ........................alpha=0.01;, score=0.903 total time=   0.4s
[CV 1/5] END ........................alpha=0.05;, score=0.904 total time=   0.4s
[CV 2/5] END ........................alpha=0.05;, score=0.903 total time=   0.4s
[CV 3/5] END ........................alpha=0.05;, score=0.901 total time=   0.4s
[CV 4/5] END ........................alpha=0.05;, score=0.901 total time=   0.4s
[CV 5/5] END ........................alpha=0.05;, score=0.905 total time=   0.4s
[CV 1/5] END .........................alpha=0.1;, score=0.903 total time=   0.5s
[CV 2/5] END ...

In [None]:
# Random Forest
print("Starting Random Forest")
rf = RandomForestClassifier()
rf_param_grid = {
    'n_estimators': [50],
    'max_depth': [None],
    'min_samples_split': [5],
    'min_samples_leaf': [1]
}
grid_rf = GridSearchCV(rf, rf_param_grid, cv=5, verbose=3, scoring='accuracy')
grid_rf.fit(X_train, y_train)
best_param_rf = grid_rf.best_params_
best_score_rf = grid_rf.best_score_
best_model_rf = grid_rf.best_estimator_
y_pred_rf = best_model_rf.predict(X_test)
outputs_dict['Random Forest'] = {
    'best_params': best_param_rf,
    'best_score': best_score_rf,
    'classification_report': classification_report(y_test, y_pred_rf, output_dict=True)
}
print("Completed Random Forest")
joblib.dump(best_model_rf, '../Models/random_forest_model.pkl')
print("Saved Random Forest model")

Starting Random Forest
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.899 total time=47.6min


In [25]:
# K-Nearest Neighbors
print("Starting K-Nearest Neighbors")
knn = KNeighborsClassifier()
knn_param_grid = {
    'n_neighbors': [1, 2, 3, 5, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['cosine', 'manhattan']
}
grid_knn = GridSearchCV(knn, knn_param_grid, cv=5, verbose=3)
grid_knn.fit(X_train, y_train)
best_param_knn = grid_knn.best_params_
best_score_knn = grid_knn.best_score_
best_model_knn = grid_knn.best_estimator_
y_pred_knn = best_model_knn.predict(X_test)
outputs_dict['K-Nearest Neighbors'] = {
    'best_params': best_param_knn,
    'best_score': best_score_knn,
    'classification_report': classification_report(y_test, y_pred_knn, output_dict=True)
}
print("Completed K-Nearest Neighbors")
joblib.dump(best_model_knn, '../Models/k_nearest_neighbors_model.pkl')
print("Saved K-Nearest Neighbors model")

Starting K-Nearest Neighbors
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END metric=cosine, n_neighbors=1, weights=uniform;, score=0.816 total time= 4.0min
[CV 2/5] END metric=cosine, n_neighbors=1, weights=uniform;, score=0.817 total time= 4.0min
[CV 3/5] END metric=cosine, n_neighbors=1, weights=uniform;, score=0.814 total time= 4.0min
[CV 4/5] END metric=cosine, n_neighbors=1, weights=uniform;, score=0.816 total time= 4.0min
[CV 5/5] END metric=cosine, n_neighbors=1, weights=uniform;, score=0.816 total time= 4.0min
[CV 1/5] END metric=cosine, n_neighbors=1, weights=distance;, score=0.816 total time= 4.0min
[CV 2/5] END metric=cosine, n_neighbors=1, weights=distance;, score=0.817 total time= 4.0min
[CV 3/5] END metric=cosine, n_neighbors=1, weights=distance;, score=0.814 total time= 4.0min
[CV 4/5] END metric=cosine, n_neighbors=1, weights=distance;, score=0.816 total time= 4.0min
[CV 5/5] END metric=cosine, n_neighbors=1, weights=distance;, score=0.816 tot

In [9]:
def print_metrics(y_test, y_pred, model_name):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label='suicide', average='binary')
    recall = recall_score(y_test, y_pred, pos_label='suicide', average='binary')
    f1 = f1_score(y_test, y_pred, pos_label='suicide', average='binary')
    print(f"\nMetrics for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

In [10]:
for model in [best_model_lr, best_model_mnb, best_model_knn, best_model_rf]:
    print_metrics(y_test, model.predict(X_test), model.__class__.__name__)


Metrics for LogisticRegression:
Accuracy: 0.9325
Precision: 0.9452
Recall: 0.9188
F1-Score: 0.9318

Metrics for MultinomialNB:
Accuracy: 0.9052
Precision: 0.8724
Recall: 0.9502
F1-Score: 0.9096

Metrics for KNeighborsClassifier:
Accuracy: 0.8754
Precision: 0.9165
Recall: 0.8272
F1-Score: 0.8696

Metrics for RandomForestClassifier:
Accuracy: 0.9030
Precision: 0.8965
Recall: 0.9123
F1-Score: 0.9043
