# SMS Spam Classification - Data Preparation
This notebook contains functions to load, preprocess, and split the SMS spam dataset.

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import string

## 1. Load Data

In [9]:
def load_data(file_path):
    """
    Load SMS spam collection data from file.

    Parameters:
    -----------
    file_path : str
        Path to the SMS spam collection file

    Returns:
    --------
    pd.DataFrame
        DataFrame with columns 'label' and 'message'
    """
    # Read tab-separated file
    df = pd.read_csv(file_path, sep='\t', names=['label', 'message'], encoding='utf-8')

    print(f"Data loaded successfully!")
    print(f"Total samples: {len(df)}")
    print(f"\nClass distribution:")
    print(df['label'].value_counts())
    print(f"\nSample data:")
    print(df.head())

    return df

## 2. Preprocess Data

In [10]:
def preprocess_data(df):
    """
    Preprocess the SMS data.

    Parameters:
    -----------
    df : pd.DataFrame
        Raw dataframe with 'label' and 'message' columns

    Returns:
    --------
    pd.DataFrame
        Preprocessed dataframe
    """
    df = df.copy()

    # Convert labels to binary (0 = ham, 1 = spam)
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})

    # Remove duplicates
    df = df.drop_duplicates(subset='message', keep='first')

    # Remove null values if any
    df = df.dropna()

    # Reset index
    df = df.reset_index(drop=True)

    print(f"Preprocessing complete!")
    print(f"Total samples after preprocessing: {len(df)}")
    print(f"\nClass distribution:")
    print(df['label'].value_counts())

    return df

## 3. Split Data

In [11]:
def split_data(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """
    Split data into train, validation, and test sets.

    Parameters:
    -----------
    df : pd.DataFrame
        Preprocessed dataframe
    train_size : float
        Proportion of data for training (default: 0.7)
    val_size : float
        Proportion of data for validation (default: 0.15)
    test_size : float
        Proportion of data for testing (default: 0.15)
    random_state : int
        Random seed for reproducibility

    Returns:
    --------
    tuple
        (train_df, val_df, test_df)
    """
    assert abs(train_size + val_size + test_size - 1.0) < 1e-6, "Sizes must sum to 1.0"

    # First split: separate test set
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['label']
    )

    # Second split: separate train and validation
    val_ratio = val_size / (train_size + val_size)
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_ratio,
        random_state=random_state,
        stratify=train_val_df['label']
    )

    print(f"Data split complete!")
    print(f"\nTrain set: {len(train_df)} samples")
    print(train_df['label'].value_counts())
    print(f"\nValidation set: {len(val_df)} samples")
    print(val_df['label'].value_counts())
    print(f"\nTest set: {len(test_df)} samples")
    print(test_df['label'].value_counts())

    return train_df, val_df, test_df

## 4. Store Splits

In [12]:
def store_splits(train_df, val_df, test_df,
                 train_path='train.csv',
                 val_path='validation.csv',
                 test_path='test.csv'):
    """
    Store train, validation, and test splits to CSV files.

    Parameters:
    -----------
    train_df, val_df, test_df : pd.DataFrame
        DataFrames to store
    train_path, val_path, test_path : str
        File paths for storing the splits
    """
    train_df.to_csv(train_path, index=False)
    val_df.to_csv(val_path, index=False)
    test_df.to_csv(test_path, index=False)

    print(f"Splits saved successfully!")
    print(f"  - Train: {train_path}")
    print(f"  - Validation: {val_path}")
    print(f"  - Test: {test_path}")

## 5. Run Complete Pipeline

In [13]:
if __name__ == '__main__':
    # Load data
    df = load_data('SMSSpamCollection')

    # Preprocess data
    df_processed = preprocess_data(df)

    # Split data
    train_df, val_df, test_df = split_data(df_processed)

    # Store splits
    store_splits(train_df, val_df, test_df)

    print("\n" + "="*50)
    print("Data preparation complete! Ready for training.")
    print("="*50)

Data loaded successfully!
Total samples: 5572

Class distribution:
label
ham     4825
spam     747
Name: count, dtype: int64

Sample data:
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Preprocessing complete!
Total samples after preprocessing: 5169

Class distribution:
label
0    4516
1     653
Name: count, dtype: int64
Data split complete!

Train set: 3617 samples
label
0    3160
1     457
Name: count, dtype: int64

Validation set: 776 samples
label
0    678
1     98
Name: count, dtype: int64

Test set: 776 samples
label
0    678
1     98
Name: count, dtype: int64
Splits saved successfully!
  - Train: train.csv
  - Validation: validation.csv
  - Test: test.csv

Data preparation comp