In [16]:
# Import required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score

In [17]:

# Read the processed training data
train_data = pd.read_csv('processed_data/train.csv', sep='\t')
val_data = pd.read_csv('processed_data/validation.csv', sep='\t')

# Split features and target
X_train = train_data['preprocessed_message']
X_train = X_train.fillna('')
y_train = train_data['label']

X_val = val_data['preprocessed_message']
X_val = X_val.fillna('')
y_val = val_data['label']

# Convert text to TF-IDF features
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)


In [23]:
nb_classifier = MultinomialNB()
svm_classifier = LinearSVC(random_state=42)
lr_classifier = LogisticRegression(random_state=42)

nb_classifier.fit(X_train_tfidf, y_train)
svm_classifier.fit(X_train_tfidf, y_train)
lr_classifier.fit(X_train_tfidf, y_train)

nb_pred = nb_classifier.predict(X_val_tfidf)
svm_pred = svm_classifier.predict(X_val_tfidf)
lr_pred = lr_classifier.predict(X_val_tfidf)

def specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

classifiers = ['Naive Bayes', 'SVM', 'Logistic Regression']
predictions = [nb_pred, svm_pred, lr_pred]

print("Model Performance Metrics:\n")
print("Classifier\t\t\tAccuracy\tF1 Score\tRecall\t\tPrecision\tSpecificity\tAUC")
print("-" * 120)

for clf, pred in zip(classifiers, predictions):
    acc = accuracy_score(y_val, pred)
    f1 = f1_score(y_val, pred)
    r = recall_score(y_val, pred)
    p = precision_score(y_val, pred)
    auc = roc_auc_score(y_val, pred)
    s = specificity(y_val, pred)
    print(f"{clf:<20}\t\t{acc:.4f}\t\t{f1:.4f}\t\t{r:.4f}\t\t{p:.4f}\t\t{s:.4f}\t\t{auc:.4f}")


Model Performance Metrics:

Classifier			Accuracy	F1 Score	Recall		Precision	Specificity	AUC
------------------------------------------------------------------------------------------------------------------------
Naive Bayes         		0.9596		0.8263		0.7039		1.0000		1.0000		0.8520
SVM                 		0.9785		0.9178		0.8816		0.9571		0.9938		0.9377
Logistic Regression 		0.9498		0.7863		0.6776		0.9364		0.9927		0.8352


In [25]:
# Read the processed training data
test_data = pd.read_csv('processed_data/test.csv', sep='\t')

# Split features and target
X_test = test_data['preprocessed_message']
X_test = X_test.fillna('')
y_test = test_data['label']

X_test_tfidf = tfidf.transform(X_test)

In [26]:
nb_pred = nb_classifier.predict(X_test_tfidf)
svm_pred = svm_classifier.predict(X_test_tfidf)
lr_pred = lr_classifier.predict(X_test_tfidf)

In [27]:
classifiers = ['Naive Bayes', 'SVM', 'Logistic Regression']
predictions = [nb_pred, svm_pred, lr_pred]

print("Model Performance Metrics:\n")
print("Classifier\t\t\tAccuracy\tF1 Score\tRecall\t\tPrecision\tSpecificity\tAUC")
print("-" * 120)

for clf, pred in zip(classifiers, predictions):
    acc = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    r = recall_score(y_test, pred)
    p = precision_score(y_test, pred)
    auc = roc_auc_score(y_test, pred)
    s = specificity(y_test, pred)
    print(f"{clf:<20}\t\t{acc:.4f}\t\t{f1:.4f}\t\t{r:.4f}\t\t{p:.4f}\t\t{s:.4f}\t\t{auc:.4f}")

Model Performance Metrics:

Classifier			Accuracy	F1 Score	Recall		Precision	Specificity	AUC
------------------------------------------------------------------------------------------------------------------------
Naive Bayes         		0.9659		0.8538		0.7450		1.0000		1.0000		0.8725
SVM                 		0.9848		0.9404		0.8993		0.9853		0.9979		0.9486
Logistic Regression 		0.9570		0.8110		0.6913		0.9810		0.9979		0.8446
