In [2]:
import os
import pandas as pd
import numpy as np
import re
import string
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def load_data(train_path, val_path, test_path):
    """
    Reads the train, validation, and test CSV files and encodes their labels.

    Each file is read with pandas and its 'Label' column is mapped from
    text to integers ('ham' -> 0, 'spam' -> 1). Any label value that is not
    in the mapping becomes NaN, as is usual for Series.map with a dict.

    Returns a (train_df, validation_df, test_df) tuple of dataframes.
    """
    label_to_int = {'ham': 0, 'spam': 1}

    # Read every file first so a missing file is reported before any mapping.
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)]

    for frame in frames:
        frame['Label'] = frame['Label'].map(label_to_int)

    return tuple(frames)

# Deletion table for every ASCII punctuation character. Using str.translate
# avoids interpolating string.punctuation into a regex character class, where
# the unescaped "\]" sequence is read as an escaped "]" and the backslash
# itself is therefore never removed.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """
    Cleans text data by lowercasing, removing punctuation, and numbers.

    Args:
        text: The string to clean.

    Returns:
        The lowercased string with every character of string.punctuation
        (including the backslash) and every run of digits removed.
        Whitespace is left untouched, so removing a token may leave
        adjacent spaces behind.
    """
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r"\d+", "", text)
    return text

def apply_preprocessing(df):
    """
    Cleans every entry of the dataframe's 'Message' column with preprocess_text.

    The column is overwritten on the dataframe that was passed in, and that
    same dataframe object is returned.
    """
    cleaned_messages = df['Message'].apply(preprocess_text)
    df['Message'] = cleaned_messages
    return df

def create_pipeline(model):
    """
    Builds a two-step Pipeline: TF-IDF vectorization followed by the given classifier.

    The vectorizer step is named 'tfidf' and is configured with English stop
    words; the classifier step is named 'clf', which is the prefix used when
    addressing its hyperparameters (e.g. 'clf__C').
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    steps = [
        ('tfidf', vectorizer),
        ('clf', model),
    ]
    return Pipeline(steps)

def train_models(X_train, y_train, X_val, y_val):
    """
    Fits each candidate classifier and picks the one with the best validation accuracy.

    Three pipelines (Naive Bayes, Logistic Regression, linear SVM) are fitted
    on the training data. Train and validation accuracy are printed for each,
    and the candidate with the highest validation accuracy is selected; on a
    tie the candidate trained first wins.

    Returns a (best_model_name, best_model) tuple, where best_model is the
    already-fitted pipeline.
    """
    candidates = {
        "Naive Bayes": create_pipeline(MultinomialNB()),
        "Logistic Regression": create_pipeline(LogisticRegression(max_iter=1000)),
        "SVM": create_pipeline(SVC(kernel='linear', probability=True)),
    }

    validation_accuracy = {}
    for name, pipeline in candidates.items():
        print(f"Training {name}...")
        pipeline.fit(X_train, y_train)

        # Score the fitted pipeline on both the data it saw and the held-out data.
        train_acc = accuracy_score(y_train, pipeline.predict(X_train))
        val_acc = accuracy_score(y_val, pipeline.predict(X_val))

        print(f"{name} - Train Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}")
        validation_accuracy[name] = val_acc

    best_model_name = max(validation_accuracy, key=lambda name: validation_accuracy[name])
    print(f"\nBest Model Selected: {best_model_name}")
    return best_model_name, candidates[best_model_name]

def fine_tune_hyperparameters(best_model_name, best_model, X_train, y_train):
    """
    Runs a 5-fold, accuracy-scored grid search over the selected model's hyperparameters.

    The grid is chosen by model name: a range of 'clf__C' values for
    "Logistic Regression" and "SVM", and 'clf__alpha' values for any other
    name (treated as Naive Bayes).

    Prints the best parameter combination and returns the best estimator
    found by the search.
    """
    grids_by_name = {
        "Logistic Regression": {"clf__C": [0.01, 0.1, 1, 10]},
        "SVM": {"clf__C": [0.01, 0.1, 1, 10, 100]},
    }
    # Any name without its own grid falls back to the Naive Bayes grid.
    naive_bayes_grid = {"clf__alpha": [0.01, 0.1, 1, 10]}
    param_grid = grids_by_name.get(best_model_name, naive_bayes_grid)

    search = GridSearchCV(best_model, param_grid, scoring='accuracy', cv=5)
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    print(f"\nBest hyperparameters: {search.best_params_}")
    return tuned_model

def evaluate_model(model, X_test, y_test):
    """
    Prints the model's accuracy and classification report on the test set.

    Predictions are made once with model.predict and reused for both
    metrics. Nothing is returned.
    """
    predictions = model.predict(X_test)

    # Compute both metrics before printing anything.
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    print(f"\nTest Accuracy: {accuracy:.4f}")
    print("\nTest Classification Report:\n", report)

def main(data_dir="/content/drive/MyDrive/AML_Assignments/Assignment1/"):
    """
    Main function to execute the training pipeline.

    Changes the working directory to data_dir, loads and preprocesses the
    three datasets, trains the candidate models, fine-tunes the best one,
    and prints its test-set evaluation.

    Args:
        data_dir: Directory containing "train.csv", "validation.csv" and
            "test.csv". Defaults to the original Google Drive location, so
            calling main() with no arguments behaves as before.
    """
    # Set working directory (process-wide; the relative CSV names below
    # are resolved against it)
    os.chdir(data_dir)

    # Load datasets
    train_df, validation_df, test_df = load_data("train.csv", "validation.csv", "test.csv")

    # Preprocess text
    train_df = apply_preprocessing(train_df)
    validation_df = apply_preprocessing(validation_df)
    test_df = apply_preprocessing(test_df)

    # Split into features and labels
    X_train, y_train = train_df['Message'], train_df['Label']
    X_val, y_val = validation_df['Message'], validation_df['Label']
    X_test, y_test = test_df['Message'], test_df['Label']

    # Train models and select the best
    best_model_name, best_model = train_models(X_train, y_train, X_val, y_val)

    # Fine-tune the best model
    best_model = fine_tune_hyperparameters(best_model_name, best_model, X_train, y_train)

    # Evaluate the final model on the test set
    evaluate_model(best_model, X_test, y_test)

# Run the training pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()


Training Naive Bayes...
Naive Bayes - Train Accuracy: 0.9791, Validation Accuracy: 0.9623
Training Logistic Regression...
Logistic Regression - Train Accuracy: 0.9670, Validation Accuracy: 0.9623
Training SVM...
SVM - Train Accuracy: 0.9948, Validation Accuracy: 0.9803

Best Model Selected: SVM

Best hyperparameters: {'clf__C': 100}

Test Accuracy: 0.9731

Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       483
           1       0.89      0.91      0.90        75

    accuracy                           0.97       558
   macro avg       0.94      0.95      0.94       558
weighted avg       0.97      0.97      0.97       558

