In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ankita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
df = pd.read_csv("spam.csv", encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
df['label'] = df['label'].map({'ham':0, 'spam':1})

Text Cleaning (lowercase, remove punctuation)
Remove stopwords

In [5]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['cleaned'] = df['message'].apply(clean_text)

Convert text using CountVectorizer and TF-IDF

In [6]:
cv = CountVectorizer()
X = cv.fit_transform(df['cleaned'])
y = df['label']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Train Models
1.Naive Bayes

In [8]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred_nb = nb.predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.9757847533632287
[[951  14]
 [ 13 137]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.91      0.91      0.91       150

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



Logistic Regression

In [9]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.9775784753363229


In [12]:

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))

Accuracy: 0.9775784753363229

Confusion Matrix:
 [[965   0]
 [ 25 125]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



Support Vector Machine

In [10]:
svm = SVC()
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Accuracy: 0.9730941704035875


In [13]:
svm = SVC()
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))

Accuracy: 0.9730941704035875

Confusion Matrix:
 [[964   1]
 [ 29 121]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.99      0.81      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [11]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['cleaned'])

Compare CountVectorizer vs TF-IDF Results

In [19]:
# Train models with CountVectorizer (already have  from earlier)
cv = CountVectorizer()
X_cv = cv.fit_transform(df['cleaned'])
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_cv, df['label'], test_size=0.2, random_state=42)

# Logistic Regression with CountVectorizer
lr_cv = LogisticRegression(max_iter=1000)
lr_cv.fit(X_train_cv, y_train_cv)
y_pred_lr_cv = lr_cv.predict(X_test_cv)
lr_cv_acc = accuracy_score(y_test_cv, y_pred_lr_cv)

# SVM with CountVectorizer
svm_cv = SVC()
svm_cv.fit(X_train_cv, y_train_cv)
y_pred_svm_cv = svm_cv.predict(X_test_cv)
svm_cv_acc = accuracy_score(y_test_cv, y_pred_svm_cv)

# Train models with TF-IDF
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['cleaned'])
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, df['label'], test_size=0.2, random_state=42)

# Logistic Regression with TF-IDF
lr_tfidf = LogisticRegression(max_iter=1000)
lr_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)
lr_tfidf_acc = accuracy_score(y_test_tfidf, y_pred_lr_tfidf)

# SVM with TF-IDF
svm_tfidf = SVC()
svm_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_pred_svm_tfidf = svm_tfidf.predict(X_test_tfidf)
svm_tfidf_acc = accuracy_score(y_test_tfidf, y_pred_svm_tfidf)

# Create comparison DataFrame
comparison = pd.DataFrame({
    "Model": ["Logistic Regression (CV)", "Logistic Regression (TF-IDF)",
              "SVM (CV)", "SVM (TF-IDF)"],
    "Accuracy": [lr_cv_acc, lr_tfidf_acc, svm_cv_acc, svm_tfidf_acc]
})

print("\nAccuracy Comparison Table:")
print(comparison)
print("\n")

# Find best performer
best_model = comparison.loc[comparison['Accuracy'].idxmax()]
print(f"Best Model: {best_model['Model']} with Accuracy: {best_model['Accuracy']:.4f}")


Accuracy Comparison Table:
                          Model  Accuracy
0      Logistic Regression (CV)  0.977578
1  Logistic Regression (TF-IDF)  0.942601
2                      SVM (CV)  0.973094
3                  SVM (TF-IDF)  0.967713


Best Model: Logistic Regression (CV) with Accuracy: 0.9776


Top Important Words for Each Class

In [18]:
# Get feature names from CountVectorizer
feature_names_cv = cv.get_feature_names_out()

# Get top 10 important words for CountVectorizer + Logistic Regression
print("\nTop 10 words for SPAM:")
spam_coef_cv = lr_cv.coef_[0]
top_spam_idx_cv = np.argsort(spam_coef_cv)[-10:][::-1]
spam_words_cv = [(feature_names_cv[i], spam_coef_cv[i]) for i in top_spam_idx_cv]
for word, coef in spam_words_cv:
    print(f"  {word}: {coef:.4f}")

print("\nTop 10 words for HAM:")
top_ham_idx_cv = np.argsort(spam_coef_cv)[:10]
ham_words_cv = [(feature_names_cv[i], spam_coef_cv[i]) for i in top_ham_idx_cv]
for word, coef in ham_words_cv:
    print(f"  {word}: {coef:.4f}")

# Get top 10 important words for TF-IDF + Logistic Regression
feature_names_tfidf = tfidf.get_feature_names_out()

print("\n--- TF-IDF + Logistic Regression ---")
print("\nTop 10 words for SPAM:")
spam_coef_tfidf = lr_tfidf.coef_[0]
top_spam_idx_tfidf = np.argsort(spam_coef_tfidf)[-10:][::-1]
spam_words_tfidf = [(feature_names_tfidf[i], spam_coef_tfidf[i]) for i in top_spam_idx_tfidf]
for word, coef in spam_words_tfidf:
    print(f"  {word}: {coef:.4f}")

print("\nTop 10 words for HAM:")
top_ham_idx_tfidf = np.argsort(spam_coef_tfidf)[:10]
ham_words_tfidf = [(feature_names_tfidf[i], spam_coef_tfidf[i]) for i in top_ham_idx_tfidf]
for word, coef in ham_words_tfidf:
    print(f"  {word}: {coef:.4f}")

# Create a visualization comparison DataFrame for top words
print("\n" + "="*60)
print("Summary Comparison of Top Words")
print("="*60)

spam_words_cv_str = ", ".join([word for word, _ in spam_words_cv[:5]])
ham_words_cv_str = ", ".join([word for word, _ in ham_words_cv[:5]])
spam_words_tfidf_str = ", ".join([word for word, _ in spam_words_tfidf[:5]])
ham_words_tfidf_str = ", ".join([word for word, _ in ham_words_tfidf[:5]])

comparison_words = pd.DataFrame({
    "Vectorizer": ["CountVectorizer", "TF-IDF"],
    "Top SPAM Words": [spam_words_cv_str, spam_words_tfidf_str],
    "Top HAM Words": [ham_words_cv_str, ham_words_tfidf_str]
})

print("\n", comparison_words.to_string(index=False))


Top 10 words for SPAM:
  txt: 2.0216
  claim: 1.8766
  new: 1.8254
  mobile: 1.7933
  call: 1.6310
  stop: 1.6266
  ringtone: 1.5908
  reply: 1.5753
  text: 1.5377
  content: 1.4981

Top 10 words for HAM:
  ltgt: -1.2900
  ill: -1.2403
  sir: -1.1124
  later: -0.8519
  way: -0.7645
  da: -0.7407
  ok: -0.7378
  going: -0.7115
  thats: -0.6992
  tell: -0.6917

--- TF-IDF + Logistic Regression ---

Top 10 words for SPAM:
  txt: 4.5242
  claim: 3.8050
  stop: 3.7315
  free: 3.7203
  mobile: 3.6721
  call: 3.2567
  reply: 3.0471
  prize: 2.8420
  text: 2.8410
  service: 2.5914

Top 10 words for HAM:
  ltgt: -2.0101
  im: -1.9468
  ill: -1.8391
  ok: -1.7940
  sir: -1.4992
  come: -1.4440
  later: -1.3841
  da: -1.3138
  going: -1.3102
  got: -1.2706

Summary Comparison of Top Words

      Vectorizer                 Top SPAM Words              Top HAM Words
CountVectorizer  txt, claim, new, mobile, call ltgt, ill, sir, later, way
         TF-IDF txt, claim, stop, free, mobile     ltgt, im,