In [4]:

from google.colab import files
uploaded = files.upload()

Saving spam_ham_dataset.csv to spam_ham_dataset (1).csv


In [34]:
import pandas as pd
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load dataset
df = pd.read_csv("spam_ham_dataset.csv")

# Fix line breaks and fill NaNs in 'text' column
df['text'] = df['text'].fillna('').apply(lambda x: x.replace('\r\n', ' ').strip())

# Label mapping: ham → 0, spam → 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Drop rows where label mapping failed (to avoid NaNs)
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

# Preprocessing function with spaCy + NER features
def preprocess_with_ner(text):
    doc = nlp(text)

    # Cleaned tokens: lemmatized, no stopwords/punct
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct and token.lemma_ not in string.punctuation
    ]
    cleaned_text = " ".join(tokens).strip()

    # Named entity labels
    entity_labels = [ent.label_ for ent in doc.ents]

    # Binary flags for key entities
    has_money = int("MONEY" in entity_labels)
    has_org = int("ORG" in entity_labels)
    has_date = int("DATE" in entity_labels)

    return pd.Series([cleaned_text, has_money, has_org, has_date])

# Apply preprocessing function
df[['clean_text', 'has_money', 'has_org', 'has_date']] = df['text'].apply(preprocess_with_ner)

# Drop rows where clean_text is empty after preprocessing
df = df[df['clean_text'].str.strip() != '']

# Preview the result
print(df[['text', 'clean_text', 'has_money', 'has_org', 'has_date']].head(3))


                                                text  \
0  Subject: enron methanol ; meter # : 988291 thi...   
1  Subject: hpl nom for january 9 , 2001 ( see at...   
2  Subject: neon retreat ho ho ho , we ' re aroun...   

                                          clean_text  has_money  has_org  \
0  subject enron methanol meter 988291 follow not...          0        0   
1  subject hpl nom january 9 2001 attach file hpl...          0        1   
2  subject neon retreat ho ho ho wonderful time y...          0        0   

   has_date  
0         1  
1         1  
2         1  


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack
import numpy as np

# TF-IDF vectorization
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['clean_text'])

# Combine with Named Entity features
ner_features = df[['has_money', 'has_org', 'has_date']].values
X_combined = hstack([X_tfidf, ner_features])

# Labels
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9603864734299516
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       742
           1       0.92      0.94      0.93       293

    accuracy                           0.96      1035
   macro avg       0.95      0.95      0.95      1035
weighted avg       0.96      0.96      0.96      1035



In [38]:
def predict_spam(text):
    # Preprocess text using the same function
    cleaned_text, has_money, has_org, has_date = preprocess_with_ner(text)

    # Vectorize cleaned text
    vec = tfidf.transform([cleaned_text])

    # Stack with NER features
    ner_vector = np.array([has_money, has_org, has_date]).reshape(1, -1)
    full_input = hstack([vec, ner_vector])

    # Predict and return label
    prediction = model.predict(full_input)[0]
    return "Spam" if prediction == 1 else "Ham"


In [40]:
print(predict_spam("You've won a $1000 Amazon gift card! Claim now."))
print(predict_spam("Reminder: Your class is scheduled for Monday 10 AM."))
print(predict_spam("Congratulations! You've won a $500 gift card. Click to claim now."))



Spam
Ham
Spam
