In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [4]:
# English stop-word set used by preprocess_text to filter tokens.
# stopwords.words() raises LookupError when the corpus has not been installed,
# so fetch it on demand and retry instead of failing on a fresh environment.
try:
    NLTK_STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    NLTK_STOPWORDS = set(stopwords.words('english'))

# Shared lemmatizer instance used by preprocess_text.
NLTK_LEMMATIZER = WordNetLemmatizer()
# Probe once so that a missing WordNet corpus is detected (and installed) here
# rather than in the middle of the first preprocess_text call.
try:
    NLTK_LEMMATIZER.lemmatize('probe')
except LookupError:
    nltk.download('wordnet', quiet=True)
    # NOTE(review): some NLTK releases also need 'omw-1.4' to load WordNet;
    # downloading it is harmless where it is not required.
    nltk.download('omw-1.4', quiet=True)

def preprocess_text(text: str) -> str:
    """Normalise a raw message into a space-separated string of lemmas.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is removed, and the remainder is split on whitespace. Tokens
    that are in ``NLTK_STOPWORDS`` or are a single character long are
    discarded; the surviving tokens are passed through
    ``NLTK_LEMMATIZER.lemmatize`` and joined with single spaces.

    Args:
        text: The raw message. Any non-``str`` value is accepted.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Guard: anything that is not a string yields an empty document.
    if not isinstance(text, str):
        return ""

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    lemmas = []
    for token in letters_only.split():
        # Skip one-character fragments and stop words.
        if len(token) <= 1 or token in NLTK_STOPWORDS:
            continue
        lemmas.append(NLTK_LEMMATIZER.lemmatize(token))

    return ' '.join(lemmas)

In [5]:
# Train and evaluate a TF-IDF + logistic-regression spam classifier on
# spam_data.csv, printing accuracy and precision on a held-out test split.
if __name__ == "__main__":
    print("Starting Spam Detection Training...")

    # NOTE(review): latin-1 presumably because the CSV is not valid UTF-8 — confirm.
    df = pd.read_csv("spam_data.csv", encoding='latin-1')

    # Keep only the two columns used below; the explicit copy lets the next
    # line assign a column without touching (or warning about) the loaded frame.
    df = df[['sms', 'label']].copy()

    # Coerce labels to numbers *before* dropping missing values: labels that
    # cannot be parsed become NaN and are removed together with the other
    # incomplete rows, instead of making the integer cast below raise.
    df['label'] = pd.to_numeric(df['label'], errors='coerce')
    df = df.dropna(subset=['sms', 'label'])

    # Discard rows whose message is not a string.
    df = df[df['sms'].map(lambda value: isinstance(value, str))]

    # assign() returns a new frame, so no column is written onto a filtered slice.
    df = df.assign(
        label=df['label'].astype(int),
        processed_sms=df['sms'].apply(preprocess_text),
    )

    X = df['processed_sms']
    y = df['label']

    # Stratify on the label so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

    pipeline = Pipeline([
        ('tfidf_vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 1))),
        ('logistic_classifier', LogisticRegression(random_state=42, solver='liblinear', C=1.0, penalty='l2'))
    ])

    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)

    print("\nModel Performance:")
    print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")
    print(f"Precision: {precision_score(y_test, predictions):.4f}")

Starting Spam Detection Training...

Model Performance:
Accuracy: 0.9670
Precision: 0.9930
