In [1]:
# Mount Google Drive
# The dataset and every output of this notebook are read from / written to
# paths under /content/drive, so this cell has to run before any cell that
# touches those paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install required packages
!pip install nltk scikit-learn scipy pandas numpy tqdm




In [3]:
# Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.sparse import csr_matrix, save_npz
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import joblib
import os


In [4]:
# Fetch the NLTK stop-word corpus (no progress chatter) and create the two
# shared text-processing helpers used by the preprocessing function below.
nltk.download('stopwords', quiet=True)
stemmer = PorterStemmer()
# Kept as a set so membership tests during tokenisation are constant time.
stop_words = set(stopwords.words('english'))


In [5]:

# 2. Set up paths
# Everything lives under one project folder on the mounted Drive.
BASE_PATH = '/content/drive/MyDrive/SpamOrHam'

# Input: the raw CSV of labelled emails.
DATA_PATH = f'{BASE_PATH}/data/raw/spam_Emails_data.csv'

# Outputs: trained artefacts and processed data.
MODEL_PATH = f'{BASE_PATH}/models'
PROCESSED_PATH = f'{BASE_PATH}/data/processed'


In [6]:

# Make sure both output folders exist; exist_ok=True makes this cell safe to
# re-run when the folders are already there.
for output_dir in (PROCESSED_PATH, MODEL_PATH):
    os.makedirs(output_dir, exist_ok=True)


In [7]:

# 3. Define preprocessing functions
def _tokenize(text, stem_cache):
    """
    Turn one email body into a list of stemmed tokens.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, the result is split on whitespace,
    stop words are dropped and the remaining words are stemmed.

    Parameters
    ----------
    text : str
        Raw email text.
    stem_cache : dict
        Maps a raw word to its stem. It is filled in as new words are seen,
        so each distinct word goes through the stemmer only once.

    Returns
    -------
    list of str
        Stemmed tokens in their original order, duplicates included.
    """
    # NOTE: punctuation is stripped *before* the stop-word check, so a
    # contraction such as "don't" is compared as "dont" and may no longer
    # match a stop-word entry. Kept as is so the features stay unchanged.
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = []
    for word in text.split():
        if word in stop_words:
            continue
        stem = stem_cache.get(word)
        if stem is None:
            stem = stemmer.stem(word)
            stem_cache[word] = stem
        tokens.append(stem)
    return tokens


def preprocess_data(data):
    """
    Preprocess the dataset with only label and text columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Column 0 holds the label and column 1 the email text. A label equal
        to 'spam' (case-insensitive) becomes 1; anything else becomes 0.

    Returns
    -------
    X : scipy.sparse.csr_matrix
        Term-count matrix of shape (number of emails, vocabulary size).
        Each occurrence of a word contributes 1, and csr_matrix sums the
        repeated (row, column) entries into a count.
    y : numpy.ndarray
        0/1 label per email.
    vocab : list of str
        Sorted list of stemmed words; position i is column i of X.
    """
    # Convert labels
    y = np.where(data.iloc[:, 0].str.lower() == 'spam', 1, 0)

    # Get email text (missing values become empty strings)
    texts = data.iloc[:, 1].fillna('').values
    print(f"Total emails: {len(texts)}")

    # Shared by both passes so that no word is stemmed twice.
    stem_cache = {}

    # Build vocabulary
    print("Building vocabulary...")
    all_words = set()
    for text in tqdm(texts, desc="Building vocab"):
        # Non-string cells (e.g. numbers in a mixed column) are skipped.
        if not isinstance(text, str):
            continue
        all_words.update(_tokenize(text, stem_cache))

    vocab = sorted(all_words)
    print(f"Vocabulary size: {len(vocab)} words")

    # Word -> column lookup. Searching the sorted list for every token
    # (`word in vocab` / `vocab.index(word)`) is a linear scan each time;
    # the dict gives the same column index in constant time.
    word_to_col = {word: col for col, word in enumerate(vocab)}

    # Create feature matrix
    print("\nConverting emails to feature vectors...")
    rows, cols, data_values = [], [], []

    for i, text in tqdm(enumerate(texts), total=len(texts)):
        if not isinstance(text, str):
            continue
        for word in _tokenize(text, stem_cache):
            col = word_to_col.get(word)
            if col is not None:
                rows.append(i)
                cols.append(col)
                data_values.append(1)

    X = csr_matrix((data_values, (rows, cols)),
                   shape=(len(texts), len(vocab)))
    return X, y, vocab


In [None]:

# 4. Load and preprocess data
print("Loading dataset...")

# Only the first two columns are read: preprocess_data treats column 0 as the
# label and column 1 as the email text.
data = pd.read_csv(
    DATA_PATH,
    usecols=[0, 1],
    encoding='latin-1',
)

# Tokenise the emails and build the sparse feature matrix, labels and vocabulary.
X, y, vocab = preprocess_data(data)


Loading dataset...
Total emails: 193852
Building vocabulary...


Building vocab:   2%|▏         | 3331/193852 [00:12<19:00, 167.03it/s]

In [None]:

# 5. Train model
print("\nTraining model...")

# Hold out 20% of the emails for evaluation; the fixed seed makes the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Linear SVM fitted on the sparse term-count features.
model = LinearSVC(random_state=42)
model.fit(X_train, y_train)


In [None]:

# 6. Evaluate model
print("\nModel evaluation:")

# Score on the data the model was fitted on ...
train_score = model.score(X_train, y_train)
print(f"Training accuracy: {train_score:.4f}")

# ... and on the held-out split, which the model has not seen.
test_score = model.score(X_test, y_test)
print(f"Testing accuracy: {test_score:.4f}")


In [None]:

# 7. Save model and vocabulary
print("\nSaving model and vocabulary...")

# The vocabulary is stored next to the classifier: its word order defines the
# columns of the feature matrix the model was trained on.
model_file = f'{MODEL_PATH}/spam_classifier.pkl'
vocab_file = f'{MODEL_PATH}/vocabulary.pkl'

joblib.dump(model, model_file)
joblib.dump(vocab, vocab_file)

print("Training complete! Model and vocabulary saved successfully.")