In [1]:
import pandas as pd
import numpy as np
import string
import nltk

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', names=['label', 'message'])

df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
Data cleaning:

In [3]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['cleaned_message'] = df['message'].apply(clean_text)
df[['message', 'cleaned_message']].head()


Unnamed: 0,message,cleaned_message
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_message'],
    df['label'],
    test_size=0.2,
    random_state=42
)


In [None]:
converting text using count vectorizer

In [5]:
count_vec = CountVectorizer()
X_train_count = count_vec.fit_transform(X_train)
X_test_count = count_vec.transform(X_test)

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
nb_pred = nb.predict(X_test_count)

print("Naive Bayes + CountVectorizer")
print("Accuracy:", accuracy_score(y_test, nb_pred))
print(confusion_matrix(y_test, nb_pred))
print(classification_report(y_test, nb_pred))


Naive Bayes + CountVectorizer
Accuracy: 0.9856502242152466
[[962   4]
 [ 12 137]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.92      0.94       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [6]:
# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_count, y_train)
lr_pred = lr.predict(X_test_count)

print("Logistic Regression + CountVectorizer")
print("Accuracy:", accuracy_score(y_test, lr_pred))
print(confusion_matrix(y_test, lr_pred))
print(classification_report(y_test, lr_pred))


Logistic Regression + CountVectorizer
Accuracy: 0.9847533632286996
[[966   0]
 [ 17 132]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.97      1115
weighted avg       0.99      0.98      0.98      1115



In [7]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
Bonus comparison:

In [8]:
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)
pred_nb_tfidf = nb_tfidf.predict(X_test_tfidf)

print("Naive Bayes + TF-IDF")
print("Accuracy:", accuracy_score(y_test, pred_nb_tfidf))


Naive Bayes + TF-IDF
Accuracy: 0.9713004484304932


In [9]:
lr_tfidf = LogisticRegression(max_iter=1000)
lr_tfidf.fit(X_train_tfidf, y_train)
pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)

print("Logistic Regression + TF-IDF")
print("Accuracy:", accuracy_score(y_test, pred_lr_tfidf))


Logistic Regression + TF-IDF
Accuracy: 0.9605381165919282


In [None]:
Bonus point:

In [10]:
features = tfidf.get_feature_names_out()
coeffs = lr_tfidf.coef_[0]

spam_words = sorted(zip(coeffs, features), reverse=True)[:10]
ham_words = sorted(zip(coeffs, features))[:10]

print("Top Spam Words:")
for _, word in spam_words:
    print(word)

print("\nTop Ham Words:")
for _, word in ham_words:
    print(word)


Top Spam Words:
txt
stop
claim
free
mobile
call
reply
text
win
service

Top Ham Words:
im
ltgt
ok
ill
later
come
sorry
sir
home
da
