In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import warnings
# Suppress every warning for the rest of the kernel session to keep cell output short.
# NOTE(review): this blanket filter also hides any warnings emitted while the
# pickled models are loaded further down (e.g. library version mismatches) —
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Read only the two columns this notebook uses: the text column and its label.
df = pd.read_csv('../Input/preprocessed.csv', usecols=['processed_text', 'class'])

# Report the overall size, then the number of rows carrying each label.
is_suicide = df['class'] == 'suicide'
is_non_suicide = df['class'] == 'non-suicide'
print("Dataset shape:", df.shape)
print("Shape of Suicide dataset:", df[is_suicide].shape)
print("Shape of Non Suicide dataset:", df[is_non_suicide].shape)

Dataset shape: (232074, 2)
Shape of Suicide dataset: (116037, 2)
Shape of Non Suicide dataset: (116037, 2)


In [3]:
# Drop rows with a missing value, one column at a time, and print the overall
# and per-label row counts after each step so the effect of every drop is visible.
for column, heading in (
    ('processed_text', "\nAfter dropping null values from processed_text:"),
    ('class', "\nAfter dropping null values from class:"),
):
    df = df.dropna(subset=[column])
    print(heading)
    print("Dataset shape:", df.shape)
    print("Shape of Suicide dataset:", df[df['class'] == 'suicide'].shape)
    print("Shape of Non Suicide dataset:", df[df['class'] == 'non-suicide'].shape)


After dropping null values from processed_text:
Dataset shape: (232030, 2)
Shape of Suicide dataset: (116025, 2)
Shape of Non Suicide dataset: (116005, 2)

After dropping null values from class:
Dataset shape: (232030, 2)
Shape of Suicide dataset: (116025, 2)
Shape of Non Suicide dataset: (116005, 2)


In [4]:
# Separate the frame into the model input (text column) and the target (label column).
X, y = df['processed_text'], df['class']

In [5]:
# Build the TF-IDF features straight from the text column rather than from `X`.
# `X` is overwritten with the vectorized result, so reading the input from `X`
# made this cell fail when run a second time; reading from `df` keeps it
# idempotent and yields the same features on a fresh top-to-bottom run.
#
# NOTE(review): the vectorizer is fitted here, on every row and before the
# train/test split. The saved models evaluated later in this notebook only
# receive compatible features if they were trained with a vectorizer fitted
# the same way on the same data — confirm against the training code, or load
# the persisted training-time vectorizer instead of refitting.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): presumably this mirrors the split used when the saved models
# were trained, so that X_test is genuinely unseen data — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall and F1 of ``model`` on a test set.

    Precision, recall and F1 are computed with ``'suicide'`` as the positive
    label. Each score is printed to four decimal places, followed by a
    dashed separator line.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.

    Returns:
        None. The scores are only printed.
    """
    predictions = model.predict(X_test)

    # Compute every score up front so nothing is printed if a metric raises.
    positive = {'pos_label': 'suicide'}
    scores = (
        ("Accuracy", accuracy_score(y_test, predictions)),
        ("Precision", precision_score(y_test, predictions, **positive)),
        ("Recall", recall_score(y_test, predictions, **positive)),
        ("F1 Score", f1_score(y_test, predictions, **positive)),
    )

    for label, value in scores:
        print(f"{label}: {value:.4f}")
    print("-" * 30, "\n")

In [8]:
# Paths of the serialized models to evaluate, built from each model's file stem.
model_filenames = [
    f'../Models/{stem}_model.pkl'
    for stem in (
        'k_nearest_neighbors',
        'logistic_regression',
        'multinomial_naive_bayes',
        'random_forest',
    )
]

In [14]:
# Load each saved model in turn and print its scores on the held-out split.
for filename in model_filenames:
    # Pass the path itself so joblib opens and closes the file; the previous
    # `open(filename, 'rb')` handle was never closed.
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code —
    # only load model files from a trusted source.
    model = joblib.load(filename)
    print(f"Evaluating {filename} model")
    evaluate_model(model, X_test, y_test)

Evaluating ../Models/k_nearest_neighbors_model.pkl model
Accuracy: 0.8754
Precision: 0.9165
Recall: 0.8272
F1 Score: 0.8696
------------------------------ 

Evaluating ../Models/logistic_regression_model.pkl model
Accuracy: 0.9325
Precision: 0.9452
Recall: 0.9188
F1 Score: 0.9318
------------------------------ 

Evaluating ../Models/multinomial_naive_bayes_model.pkl model
Accuracy: 0.9052
Precision: 0.8724
Recall: 0.9502
F1 Score: 0.9096
------------------------------ 

Evaluating ../Models/random_forest_model.pkl model
Accuracy: 0.8915
Precision: 0.9008
Recall: 0.8811
F1 Score: 0.8908
------------------------------ 

