In [1]:
# Required Libraries
import pandas as pd
import numpy as np
import string
import re
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Set up stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

### Step 1: Load the Dataset
def load_from_path(path: str) -> pd.DataFrame:
    """Loads SMS spam dataset from the given file path."""
    data = pd.read_csv(path, sep='\t', names=['label', 'message'], encoding='latin-1')
    return data


In [3]:
# Define dataset path (
path_sms_data = '/content/SMSSpamCollection'
data = load_from_path(path_sms_data)

In [4]:
# Display dataset shape and first few rows
print(f"Dataset loaded with {data.shape[0]} rows and {data.shape[1]} columns.")
print(data.head())

Dataset loaded with 5572 rows and 2 columns.
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
### Step 2: Label Encoding
def encode_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Encodes labels: 'ham' -> 0, 'spam' -> 1."""
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
    return df

data = encode_labels(data)
print(f"Label encoding completed. Spam Count: {sum(data['label'])}, Ham Count: {len(data) - sum(data['label'])}")

Label encoding completed. Spam Count: 747, Ham Count: 4825


In [7]:
### Step 3: Preprocessing Function (Without NLTK Tokenizer)
def preprocess_text(text: str) -> str:
    """Cleans text by lowercasing, removing punctuation, numbers, and stopwords, then lemmatizing."""
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    # Tokenization (Using split instead of nltk.word_tokenize)
    tokens = text.split()  # No punkt required!

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization (Using WordNetLemmatizer)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

### Step 4: Apply Preprocessing
def preprocess_data(df: pd.DataFrame, text_col: str = 'message') -> pd.DataFrame:
    """Applies text preprocessing to the message column."""
    df[text_col] = df[text_col].apply(preprocess_text)
    return df

# Apply preprocessing
data_preprocessed = preprocess_data(data)

# Display sample output
print("\nPreprocessing completed. Sample output:")
print(data_preprocessed.head())




Preprocessing completed. Sample output:
   label                                            message
0      0  go jurong point crazy available bugis n great ...
1      0                            ok lar joking wif u oni
2      1  free entry wkly comp win fa cup final tkts st ...
3      0                u dun say early hor u c already say
4      0           nah dont think go usf life around though


In [15]:
### Step 5: Splitting the Data and Saving Locally
def split_and_save_data(
    data: pd.DataFrame,
    output_path: str = 'data_splits',
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    label_col: str = "label",
    random_state: int = 42
) -> None:
    """Splits data into train, validation, and test sets and saves them as CSV files."""

    os.makedirs(output_path, exist_ok=True)  # Create directory if not exists

    # Separate features (X) and labels (y)
    y = data[label_col]
    X = data.drop(columns=[label_col])

    # Split Train & Temporary (Validation + Test)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=(1 - train_ratio), stratify=y, random_state=random_state
    )

    # Further split Validation & Test
    test_ratio = 1 - train_ratio - val_ratio
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=(test_ratio / (val_ratio + test_ratio)),
        stratify=y_temp, random_state=random_state
    )

    # Merge Labels Back
    train_data = pd.concat([y_train, X_train], axis=1)
    val_data = pd.concat([y_val, X_val], axis=1)
    test_data = pd.concat([y_test, X_test], axis=1)

    # Save Splits
    train_data.to_csv(os.path.join(output_path, 'train.csv'), index=False)
    val_data.to_csv(os.path.join(output_path, 'validation.csv'), index=False)
    test_data.to_csv(os.path.join(output_path, 'test.csv'), index=False)

    # Print Summary
    print("\nData Splitting Completed:")
    print(f"  Train: {len(train_data)} rows -> {os.path.join(output_path, 'train.csv')}")
    print(f"  Validation: {len(val_data)} rows -> {os.path.join(output_path, 'validation.csv')}")
    print(f"  Test: {len(test_data)} rows -> {os.path.join(output_path, 'test.csv')}")

# Define output directory for saving files
output_path = '/content/data_splits'  # Change this to your desired output directory

# Run Data Splitting
split_and_save_data(data_preprocessed, output_path=output_path)

print("\nProcessing complete. The dataset has been cleaned, split, and saved.")



Data Splitting Completed:
  Train: 4457 rows -> /content/data_splits/train.csv
  Validation: 557 rows -> /content/data_splits/validation.csv
  Test: 558 rows -> /content/data_splits/test.csv

Processing complete. The dataset has been cleaned, split, and saved.


In [16]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import os

# Define your Google Drive folder path
drive_folder = "/content/drive/My Drive/AppliedMachineLearning/Assignment_1"

# Create the directory structure
os.makedirs(drive_folder, exist_ok=True)

print(f"Assignment folder created at: {drive_folder}")


Assignment folder created at: /content/drive/My Drive/AppliedMachineLearning/Assignment_1


In [17]:
import os

# Ensure 'data_splits' folder exists
os.makedirs("data_splits", exist_ok=True)

# Save the train, validation, and test datasets again
data_preprocessed.to_csv("data_splits/train.csv", index=False)
split_and_save_data(data_preprocessed, output_path="data_splits")

# Verify that files exist
print("✅ All CSV files have been saved successfully in 'data_splits'.")
!ls -lh data_splits/



Data Splitting Completed:
  Train: 4457 rows -> data_splits/train.csv
  Validation: 557 rows -> data_splits/validation.csv
  Test: 558 rows -> data_splits/test.csv
✅ All CSV files have been saved successfully in 'data_splits'.
total 300K
-rw-r--r-- 1 root root  30K Jan 30 21:38 test.csv
-rw-r--r-- 1 root root 235K Jan 30 21:38 train.csv
-rw-r--r-- 1 root root  31K Jan 30 21:38 validation.csv
