# SMS Spam Classification - Model Training
This notebook contains functions to train, evaluate, and tune models for SMS spam classification.

In [79]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

## 1. Loading, Preprocessing, Spliting and Storing Data

In [80]:
def load_data(file_path):
    """
    Load SMS spam collection data from file.

    Parameters:
    -----------
    file_path : str
        Path to the SMS spam collection file

    Returns:
    --------
    pd.DataFrame
        DataFrame with columns 'label' and 'message'
    """
    # Read tab-separated file
    df = pd.read_csv(file_path, sep='\t', names=['label', 'message'], encoding='utf-8')

    print(f"Data loaded successfully!")

    return df

In [81]:
def preprocess_data(df):
    """
    Preprocess the SMS data.

    Parameters:
    -----------
    df : pd.DataFrame
        Raw dataframe with 'label' and 'message' columns

    Returns:
    --------
    pd.DataFrame
        Preprocessed dataframe
    """
    df = df.copy()

    # Convert labels to binary (0 = ham, 1 = spam)
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})

    # Remove duplicates
    df = df.drop_duplicates(subset='message', keep='first')

    # Remove null values if any
    df = df.dropna()

    # Reset index
    df = df.reset_index(drop=True)

    print(f"Preprocessing complete!")
    return df

In [82]:
def split_data(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """
    Split data into train, validation, and test sets.

    Parameters:
    -----------
    df : pd.DataFrame
        Preprocessed dataframe
    train_size : float
        Proportion of data for training (default: 0.7)
    val_size : float
        Proportion of data for validation (default: 0.15)
    test_size : float
        Proportion of data for testing (default: 0.15)
    random_state : int
        Random seed for reproducibility

    Returns:
    --------
    tuple
        (train_df, val_df, test_df)
    """
    assert abs(train_size + val_size + test_size - 1.0) < 1e-6, "Sizes must sum to 1.0"

    # First split: separate test set
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['label']
    )

    # Second split: separate train and validation
    val_ratio = val_size / (train_size + val_size)
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_ratio,
        random_state=random_state,
        stratify=train_val_df['label']
    )

    print(f"Data split complete!")

    return train_df, val_df, test_df

In [83]:
def store_splits(train_df, val_df, test_df,
                 train_path='train.csv',
                 val_path='validation.csv',
                 test_path='test.csv'):
    """
    Store train, validation, and test splits to CSV files.

    Parameters:
    -----------
    train_df, val_df, test_df : pd.DataFrame
        DataFrames to store
    train_path, val_path, test_path : str
        File paths for storing the splits
    """
    train_df.to_csv(train_path, index=False)
    val_df.to_csv(val_path, index=False)
    test_df.to_csv(test_path, index=False)

    print(f"Splits saved successfully!")

## 2. Load Data Splits

In [84]:
def load_splits(train_path='train.csv', val_path='validation.csv', test_path='test.csv'):
    """
    Load train, validation, and test splits.

    Returns:
    --------
    tuple
        (train_df, val_df, test_df)
    """
    train_df = pd.read_csv(train_path)
    val_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)

    print(f"Data loaded successfully!")
    print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")

    return train_df, val_df, test_df

## 3. Feature Extraction

In [85]:
def create_features(train_df, val_df, test_df, max_features=3000):
    """
    Create TF-IDF features from text data.

    Parameters:
    -----------
    train_df, val_df, test_df : pd.DataFrame
        DataFrames with 'message' column
    max_features : int
        Maximum number of features for TF-IDF

    Returns:
    --------
    tuple
        (X_train, X_val, X_test, y_train, y_val, y_test, vectorizer)
    """
    vectorizer = TfidfVectorizer(max_features=max_features, lowercase=True, stop_words='english')

    # Fit on train and transform all sets
    X_train = vectorizer.fit_transform(train_df['message'])
    X_val = vectorizer.transform(val_df['message'])
    X_test = vectorizer.transform(test_df['message'])

    y_train = train_df['label'].values
    y_val = val_df['label'].values
    y_test = test_df['label'].values

    print(f"Features created!")
    print(f"Feature matrix shape: {X_train.shape}")
    print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

    return X_train, X_val, X_test, y_train, y_val, y_test, vectorizer

## 4. Fit Model

In [86]:
def fit_model(model, X_train, y_train):
    """
    Fit a model on training data.

    Parameters:
    -----------
    model : sklearn model
        Model to train
    X_train : array-like
        Training features
    y_train : array-like
        Training labels

    Returns:
    --------
    model
        Fitted model
    """
    print(f"Training {model.__class__.__name__}...")
    model.fit(X_train, y_train)
    print(f"Training complete!")
    return model

## 5. Score Model

In [87]:
def score_model(model, X, y):
    """
    Get predictions and basic accuracy score.

    Parameters:
    -----------
    model : sklearn model
        Trained model
    X : array-like
        Features
    y : array-like
        True labels

    Returns:
    --------
    tuple
        (predictions, accuracy)
    """
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    return y_pred, accuracy

## 6. Evaluate Model

In [88]:
def evaluate_model(y_true, y_pred, set_name=''):
    """
    Evaluate model predictions with detailed metrics.

    Parameters:
    -----------
    y_true : array-like
        True labels
    y_pred : array-like
        Predicted labels
    set_name : str
        Name of dataset (e.g., 'Train', 'Validation', 'Test')

    Returns:
    --------
    dict
        Dictionary containing all metrics
    """
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred)
    }

    print(f"\n{'='*50}")
    print(f"{set_name} Set Evaluation")
    print(f"{'='*50}")
    print(f"Accuracy:  {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f}")
    print(f"F1-Score:  {metrics['f1']:.4f}")
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print(f"\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=['ham', 'spam']))

    return metrics

## 7. Validate Model

In [89]:
def validate_model(model, X_train, y_train, X_val, y_val):
    """
    Fit model on train and evaluate on both train and validation.

    Parameters:
    -----------
    model : sklearn model
        Model to validate
    X_train, y_train : array-like
        Training data
    X_val, y_val : array-like
        Validation data

    Returns:
    --------
    tuple
        (train_metrics, val_metrics)
    """
    # Fit model
    model = fit_model(model, X_train, y_train)

    # Score on train
    y_train_pred, train_acc = score_model(model, X_train, y_train)
    print(f"\nTrain Accuracy: {train_acc:.4f}")

    # Score on validation
    y_val_pred, val_acc = score_model(model, X_val, y_val)
    print(f"Validation Accuracy: {val_acc:.4f}")

    # Evaluate
    train_metrics = evaluate_model(y_train, y_train_pred, 'Train')
    val_metrics = evaluate_model(y_val, y_val_pred, 'Validation')

    return train_metrics, val_metrics

## 8. Tune Hyperparameters

In [90]:
def tune_hyperparameters(model, param_grid, X_train, y_train, cv=3):
    """
    Perform grid search for hyperparameter tuning.

    Parameters:
    -----------
    model : sklearn model
        Base model to tune
    param_grid : dict
        Parameter grid for grid search
    X_train, y_train : array-like
        Training data
    cv : int
        Number of cross-validation folds

    Returns:
    --------
    model
        Best model from grid search
    """
    print(f"\nTuning hyperparameters for {model.__class__.__name__}...")
    print(f"Parameter grid: {param_grid}")

    grid_search = GridSearchCV(
        model, param_grid, cv=cv, scoring='f1',
        n_jobs=-1, verbose=1
    )

    grid_search.fit(X_train, y_train)

    print(f"\nBest parameters: {grid_search.best_params_}")
    print(f"Best cross-validation F1 score: {grid_search.best_score_:.4f}")

    return grid_search.best_estimator_

## 9. Train and Evaluate Benchmark Models

In [91]:
def train_benchmark_models(X_train, y_train, X_val, y_val, X_test, y_test):
    """
    Train three benchmark models and evaluate on test set.

    Models:
    1. Multinomial Naive Bayes
    2. Logistic Regression
    3. Random Forest

    Returns:
    --------
    dict
        Dictionary containing models and their test metrics
    """
    results = {}

    # Model 1: Multinomial Naive Bayes
    print("\n" + "#"*60)
    print("MODEL 1: MULTINOMIAL NAIVE BAYES")
    print("#"*60)

    nb_model = MultinomialNB()
    nb_train_metrics, nb_val_metrics = validate_model(nb_model, X_train, y_train, X_val, y_val)

    # Test evaluation
    y_test_pred, test_acc = score_model(nb_model, X_test, y_test)
    nb_test_metrics = evaluate_model(y_test, y_test_pred, 'Test')

    results['Naive Bayes'] = {
        'model': nb_model,
        'train_metrics': nb_train_metrics,
        'val_metrics': nb_val_metrics,
        'test_metrics': nb_test_metrics
    }

    # Model 2: Logistic Regression
    print("\n" + "#"*60)
    print("MODEL 2: LOGISTIC REGRESSION")
    print("#"*60)

    lr_model = LogisticRegression(max_iter=1000, random_state=42)
    lr_train_metrics, lr_val_metrics = validate_model(lr_model, X_train, y_train, X_val, y_val)

    # Test evaluation
    y_test_pred, test_acc = score_model(lr_model, X_test, y_test)
    lr_test_metrics = evaluate_model(y_test, y_test_pred, 'Test')

    results['Logistic Regression'] = {
        'model': lr_model,
        'train_metrics': lr_train_metrics,
        'val_metrics': lr_val_metrics,
        'test_metrics': lr_test_metrics
    }

    # Model 3: Random Forest
    print("\n" + "#"*60)
    print("MODEL 3: RANDOM FOREST")
    print("#"*60)

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf_train_metrics, rf_val_metrics = validate_model(rf_model, X_train, y_train, X_val, y_val)

    # Test evaluation
    y_test_pred, test_acc = score_model(rf_model, X_test, y_test)
    rf_test_metrics = evaluate_model(y_test, y_test_pred, 'Test')

    results['Random Forest'] = {
        'model': rf_model,
        'train_metrics': rf_train_metrics,
        'val_metrics': rf_val_metrics,
        'test_metrics': rf_test_metrics
    }

    return results

## 10. Select Best Model

In [92]:
def select_best_model(results):
    """
    Select the best model based on test F1 score.

    Parameters:
    -----------
    results : dict
        Dictionary containing model results

    Returns:
    --------
    tuple
        (best_model_name, best_model)
    """
    print("\n" + "="*60)
    print("MODEL COMPARISON (TEST SET)")
    print("="*60)

    comparison = []
    for name, result in results.items():
        metrics = result['test_metrics']
        comparison.append({
            'Model': name,
            'Accuracy': metrics['accuracy'],
            'Precision': metrics['precision'],
            'Recall': metrics['recall'],
            'F1-Score': metrics['f1']
        })

    comparison_df = pd.DataFrame(comparison)
    comparison_df = comparison_df.sort_values('F1-Score', ascending=False)
    print(comparison_df.to_string(index=False))

    best_model_name = comparison_df.iloc[0]['Model']
    best_model = results[best_model_name]['model']

    print(f"\n{'='*60}")
    print(f"BEST MODEL: {best_model_name}")
    print(f"Test F1-Score: {comparison_df.iloc[0]['F1-Score']:.4f}")
    print(f"{'='*60}")

    return best_model_name, best_model

## 11. Run Complete Training Pipeline

In [93]:
if __name__ == '__main__':

    # Load data
    df = load_data('SMSSpamCollection')

    # Preprocess data
    df_processed = preprocess_data(df)

    # Split data
    train_df, val_df, test_df = split_data(df_processed)

    # Store splits
    store_splits(train_df, val_df, test_df)

    # Load data splits
    train_df, val_df, test_df = load_splits()

    # Create features
    X_train, X_val, X_test, y_train, y_val, y_test, vectorizer = create_features(
        train_df, val_df, test_df
    )

    # Train benchmark models
    results = train_benchmark_models(X_train, y_train, X_val, y_val, X_test, y_test)

    # Select best model
    best_model_name, best_model = select_best_model(results)

    # Optional: Fine-tune best model
    print(f"\n{'='*60}")
    print("OPTIONAL: HYPERPARAMETER TUNING")
    print(f"{'='*60}")
    print("\nExample for Logistic Regression:")
    print("param_grid = {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}")
    print("tuned_model = tune_hyperparameters(LogisticRegression(max_iter=1000), param_grid, X_train, y_train)")

    print("\n" + "="*60)
    print("TRAINING COMPLETE!")
    print("="*60)

Data loaded successfully!
Preprocessing complete!
Data split complete!
Splits saved successfully!
Data loaded successfully!
Train: 3617, Validation: 776, Test: 776
Features created!
Feature matrix shape: (3617, 3000)
Vocabulary size: 3000

############################################################
MODEL 1: MULTINOMIAL NAIVE BAYES
############################################################
Training MultinomialNB...
Training complete!

Train Accuracy: 0.9865
Validation Accuracy: 0.9704

Train Set Evaluation
Accuracy:  0.9865
Precision: 1.0000
Recall:    0.8928
F1-Score:  0.9434

Confusion Matrix:
[[3160    0]
 [  49  408]]

Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      3160
        spam       1.00      0.89      0.94       457

    accuracy                           0.99      3617
   macro avg       0.99      0.95      0.97      3617
weighted avg       0.99      0.99      0.99      3617


Validation Set E