### Imports

In [18]:
import pandas as pd, warnings, numpy as np, joblib
from typing import Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

### Model Selection

- Uses **TF-IDF vectorization** for text feature extraction.  
- Implements multiple **ML models** (Naive Bayes, SVM, Logistic Regression, Neural Network, Random Forest).  
- Optimizes models using **GridSearchCV** for hyperparameter tuning.  
- Evaluates models with **accuracy, confusion matrix, ROC, and Precision-Recall curves**.  
- Checks for **overfitting/underfitting** using learning curves and score comparison.  
- Analyzes **data drift** between training and test sets.  
- Selects the **best model**, saves it, and ranks all models based on performance.

In [23]:
SEED = 8576

# Loading processed data
TRAIN_DATA = pd.read_csv("train.csv")
VALIDATION_DATA = pd.read_csv("validation.csv")
TEST_DATA = pd.read_csv("test.csv")
N_JOBS = 1
SKIP_FINE_TUNING = False

# Benchmarking multiple models
MODELS = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state = SEED),
    "Support Vector Machine": SVC(random_state = SEED),
}

In [24]:
MODEL = MultinomialNB | RandomForestClassifier | SVC

# Function to fit a model on train data
def fit_model(
    train_data: pd.DataFrame,
    vectorizer: TfidfVectorizer,
    model: MODEL,
) -> Tuple[MODEL, TfidfVectorizer]:
    X_train = vectorizer.fit_transform(train_data["text"])
    y_train = train_data["category"]

    model.fit(X_train, y_train)

    return model, vectorizer

# Function to score a model on given data
def score_model(
    model: MODEL,
    vectorizer: TfidfVectorizer,
    data: pd.DataFrame,
) -> Tuple[float, str | dict, np.ndarray]:
    X = vectorizer.transform(data["text"])
    y_true = data["category"]

    y_pred = model.predict(X)

    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, target_names = ["Ham", "Spam"])
    confusion_mat = confusion_matrix(y_true, y_pred)

    return accuracy, report, confusion_mat

# Function to validate the model using cross-validation
def cross_validate_model(
    model: MODEL,
    vectorizer: TfidfVectorizer,
    data: pd.DataFrame,
) -> float:
    X = vectorizer.transform(data["text"])
    y = data["category"]

    scores = cross_val_score(model, X, y, cv = 5, scoring = "accuracy")

    return scores.mean()

# Function to score and evaluate model
def score_and_evaluate_model_and_get_accuracy(
    model: MODEL,
    vectorizer: TfidfVectorizer,
    check_data: pd.DataFrame,
    check_data_type: str,
) -> float:
    check_accuracy, check_classification_report, check_confusion_mat = score_model(model, vectorizer, check_data)
    print(f"{check_data_type} Accuracy: {round(100 * check_accuracy, 2)} %")
    print(f"{check_data_type} Classification Report:\n{check_classification_report}")
    print(f"{check_data_type} Confusion Matrix:")
    return check_accuracy

In [25]:
best_model = None
best_model_vectorizer = None
best_accuracy = 0

In [26]:
model, vectorizer = fit_model(TRAIN_DATA, TfidfVectorizer(), MODELS["Multinomial Naive Bayes"])

In [27]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, TRAIN_DATA, "Train Data")

Train Data Accuracy: 97.08 %
Train Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.97      1.00      0.98      3388
        Spam       1.00      0.78      0.87       512

    accuracy                           0.97      3900
   macro avg       0.98      0.89      0.93      3900
weighted avg       0.97      0.97      0.97      3900

Train Data Confusion Matrix:


In [28]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, VALIDATION_DATA, "Validation Data")

Validation Data Accuracy: 94.98 %
Validation Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.94      1.00      0.97       712
        Spam       1.00      0.66      0.80       124

    accuracy                           0.95       836
   macro avg       0.97      0.83      0.88       836
weighted avg       0.95      0.95      0.95       836

Validation Data Confusion Matrix:


In [29]:
# Score on test
test_accuracy = score_and_evaluate_model_and_get_accuracy(model, vectorizer, TEST_DATA, "Test Data")

# Select the best model based on accuracy
if test_accuracy > best_accuracy:
    best_accuracy = test_accuracy
    best_model = model
    best_model_vectorizer = vectorizer

Test Data Accuracy: 95.33 %
Test Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.95      1.00      0.97       725
        Spam       1.00      0.65      0.79       111

    accuracy                           0.95       836
   macro avg       0.97      0.82      0.88       836
weighted avg       0.96      0.95      0.95       836

Test Data Confusion Matrix:


In [30]:
model, vectorizer = fit_model(TRAIN_DATA, TfidfVectorizer(), MODELS["Random Forest"])

In [31]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, TRAIN_DATA, "Train Data")

Train Data Accuracy: 100.0 %
Train Data Classification Report:
              precision    recall  f1-score   support

         Ham       1.00      1.00      1.00      3388
        Spam       1.00      1.00      1.00       512

    accuracy                           1.00      3900
   macro avg       1.00      1.00      1.00      3900
weighted avg       1.00      1.00      1.00      3900

Train Data Confusion Matrix:


In [32]:
_ = score_and_evaluate_model_and_get_accuracy(model, vectorizer, VALIDATION_DATA, "Validation Data")

Validation Data Accuracy: 97.61 %
Validation Data Classification Report:
              precision    recall  f1-score   support

         Ham       0.97      1.00      0.99       712
        Spam       1.00      0.84      0.91       124

    accuracy                           0.98       836
   macro avg       0.99      0.92      0.95       836
weighted avg       0.98      0.98      0.98       836

Validation Data Confusion Matrix:
