In [1]:
# Required Libraries (same as Colab)
import pandas as pd
import numpy as np
import string
import re
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Set up stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
def load_from_path(path: str) -> pd.DataFrame:
    """Loads SMS spam dataset from the given file path."""
    # Read with header=None to prevent first row being treated as header
    data = pd.read_csv(path, sep=',', header=None, names=['label', 'message'], encoding='latin-1')
    
    # Remove the first row if it contains column names
    if data.iloc[0, 0] == 'Label' and data.iloc[0, 1] == 'Message':
        data = data.iloc[1:].reset_index(drop=True)
    
    return data

# Define dataset path
path_sms_data = r"C:\Users\samar\OneDrive\Documents\AML 3\raw_data.csv"
data = load_from_path(path_sms_data)

# Display dataset info
print(f"Dataset loaded with {data.shape[0]} rows and {data.shape[1]} columns.")
print(data.head())

Dataset loaded with 5572 rows and 2 columns.
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
def encode_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Encodes labels: 'ham' -> 0, 'spam' -> 1."""
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
    return df

data = encode_labels(data)
print(f"Label encoding completed. Spam Count: {sum(data['label'])}, Ham Count: {len(data) - sum(data['label'])}")

Label encoding completed. Spam Count: 747, Ham Count: 4825


In [6]:
def preprocess_text(text: str) -> str:
    """Cleans text by lowercasing, removing punctuation, numbers, and stopwords, then lemmatizing."""
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    # Tokenization (Using split instead of nltk.word_tokenize)
    tokens = text.split()  # No punkt required!

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization (Using WordNetLemmatizer)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

def preprocess_data(df: pd.DataFrame, text_col: str = 'message') -> pd.DataFrame:
    """Applies text preprocessing to the message column."""
    df[text_col] = df[text_col].apply(preprocess_text)
    return df

# Apply preprocessing
data_preprocessed = preprocess_data(data)

# Display sample output
print("\nPreprocessing completed. Sample output:")
print(data_preprocessed.head())


Preprocessing completed. Sample output:
   label                                            message
0      0  go jurong point crazy available bugis n great ...
1      0                            ok lar joking wif u oni
2      1  free entry wkly comp win fa cup final tkts st ...
3      0                u dun say early hor u c already say
4      0           nah dont think go usf life around though


In [None]:
def split_and_save_data(
    data: pd.DataFrame,
    output_path: str = './data_splits',  # Changed to local relative path
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    label_col: str = "label",
    random_state: int = 42
) -> None:
    """Splits data into train, validation, and test sets and saves them as CSV files."""
    os.makedirs(output_path, exist_ok=True)  # Create directory if not exists

    # Separate features (X) and labels (y)
    y = data[label_col]
    X = data.drop(columns=[label_col])

    # Split Train & Temporary (Validation + Test)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=(1 - train_ratio), stratify=y, random_state=random_state
    )

    # Further split Validation & Test
    test_ratio = 1 - train_ratio - val_ratio
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=(test_ratio / (val_ratio + test_ratio)),
        stratify=y_temp, random_state=random_state
    )

    # Merge Labels Back
    train_data = pd.concat([y_train, X_train], axis=1)
    val_data = pd.concat([y_val, X_val], axis=1)
    test_data = pd.concat([y_test, X_test], axis=1)

    # Save Splits
    train_data.to_csv(os.path.join(output_path, 'train.csv'), index=False)
    val_data.to_csv(os.path.join(output_path, 'validation.csv'), index=False)
    test_data.to_csv(os.path.join(output_path, 'test.csv'), index=False)

    # Print Summary
    print("\nData Splitting Completed:")
    print(f"  Train: {len(train_data)} rows -> {os.path.join(output_path, 'train.csv')}")
    print(f"  Validation: {len(val_data)} rows -> {os.path.join(output_path, 'validation.csv')}")
    print(f"  Test: {len(test_data)} rows -> {os.path.join(output_path, 'test.csv')}")

# Run Data Splitting
split_and_save_data(data_preprocessed)

# Verify files were created
print("\nFiles created:")
print(os.listdir('./data_splits'))


Data Splitting Completed:
  Train: 4457 rows -> ./data_splits\train.csv
  Validation: 557 rows -> ./data_splits\validation.csv
  Test: 558 rows -> ./data_splits\test.csv

Files created:
['test.csv', 'train.csv', 'validation.csv']


: 