In [3]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import VotingClassifier

In [4]:
# Download stopwords if not already downloaded
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yatha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
# Load dataset
df = pd.read_csv('C:/Users/Yatha/OneDrive/Documents/Sem VI/AOML/spam.csv', encoding='latin-1')
df = df.iloc[:, :2]
df.columns = ['label', 'text']
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [8]:
# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = " ".join([word for word in text.split() if word not in stopwords.words('english')])
    return text

df['cleaned_text'] = df['text'].apply(clean_text)

In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
X_train_clean, X_test_clean, _, _ = train_test_split(df['cleaned_text'], df['label'], test_size=0.2, random_state=42)

In [10]:
# Feature extraction (BoW)
vectorizer_bow = CountVectorizer()
X_train_bow = vectorizer_bow.fit_transform(X_train)
X_test_bow = vectorizer_bow.transform(X_test)

X_train_bow_clean = vectorizer_bow.fit_transform(X_train_clean)
X_test_bow_clean = vectorizer_bow.transform(X_test_clean)

In [11]:
# Feature extraction (TF-IDF)
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

X_train_tfidf_clean = vectorizer_tfidf.fit_transform(X_train_clean)
X_test_tfidf_clean = vectorizer_tfidf.transform(X_test_clean)

In [12]:
# Function to train models
def train_models(X_train, X_test, y_train, y_test):
    models = {
        'Naive Bayes': MultinomialNB(),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    }
    
    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results[name] = acc
        print(f"{name} Accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred))
    return results

In [13]:
# Train models without cleaning
print("Results for BoW without Cleaning:")
train_models(X_train_bow, X_test_bow, y_train, y_test)

print("Results for TF-IDF without Cleaning:")
train_models(X_train_tfidf, X_test_tfidf, y_train, y_test)

Results for BoW without Cleaning:
Naive Bayes Accuracy: 0.9839
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Random Forest Accuracy: 0.9758
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.82      0.90       150

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115



Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9776
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.87      0.91       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Results for TF-IDF without Cleaning:
Naive Bayes Accuracy: 0.9623
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Random Forest Accuracy: 0.9749
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       0.99      0.82      0.90       150

    accuracy                           0.97      1115
   mac

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9767
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



{'Naive Bayes': 0.9623318385650225,
 'Random Forest': 0.9748878923766816,
 'XGBoost': 0.9766816143497757}

In [14]:
# Train models with cleaning
print("Results for BoW with Cleaning:")
train_models(X_train_bow_clean, X_test_bow_clean, y_train, y_test)

print("Results for TF-IDF with Cleaning:")
train_models(X_train_tfidf_clean, X_test_tfidf_clean, y_train, y_test)

Results for BoW with Cleaning:
Naive Bayes Accuracy: 0.9803
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Random Forest Accuracy: 0.9749
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115

XGBoost Accuracy: 0.9713
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.96      0.82      0.88       150

    accuracy                           0.97      1115
   macro avg

Parameters: { "use_label_encoder" } are not used.



Naive Bayes Accuracy: 0.9677
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Random Forest Accuracy: 0.9731
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.80      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9632
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.94      0.78      0.85       150

    accuracy                           0.96      1115
   macro avg       0.95      0.89      0.91      1115
weighted avg       0.96      0.96      0.96      1115



{'Naive Bayes': 0.967713004484305,
 'Random Forest': 0.9730941704035875,
 'XGBoost': 0.9632286995515695}

In [15]:
# Ensemble model
ensemble = VotingClassifier(estimators=[
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
], voting='hard')


In [16]:
ensemble.fit(X_train_tfidf_clean, y_train)
y_pred_ensemble = ensemble.predict(X_test_tfidf_clean)
print("Ensemble Model Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print(classification_report(y_test, y_pred_ensemble))

Parameters: { "use_label_encoder" } are not used.



Ensemble Model Accuracy: 0.9713004484304932
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [17]:
# display predictions
print("Predictions:")
df_pred = pd.DataFrame({'text': X_test, 'label': y_test, 'predicted_label': y_pred_ensemble})
print(df_pred)

Predictions:
                                                   text  label  \
3245  Funny fact Nobody teaches volcanoes 2 erupt, t...      0   
944   I sent my scores to sophas and i had to do sec...      0   
1044  We know someone who you know that fancies you....      1   
2484  Only if you promise your getting out as SOON a...      0   
812   Congratulations ur awarded either å£500 of CD ...      1   
...                                                 ...    ...   
4264   &lt;DECIMAL&gt; m but its not a common car he...      0   
2439  Rightio. 11.48 it is then. Well arent we all u...      0   
5556  Yes i have. So that's why u texted. Pshew...mi...      0   
4205                             Get the door, I'm here      0   
4293  Kit Strip - you have been billed 150p. Netcoll...      1   

      predicted_label  
3245                0  
944                 0  
1044                0  
2484                0  
812                 1  
...               ...  
4264                0  
24