1. Basic Text Classification Project

In [1]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lalit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
df = pd.read_csv("spam.csv", encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'text']

df['label'] = df['label'].map({'ham':0, 'spam':1})
df.head()


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].apply(clean_text)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42
)


In [6]:
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)


In [7]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [8]:
nb = MultinomialNB()
nb.fit(X_train_cv, y_train)
pred_nb = nb.predict(X_test_cv)


In [9]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_cv, y_train)
pred_lr = lr.predict(X_test_cv)


In [11]:
svm = SVC()
svm.fit(X_train_cv, y_train)
pred_svm = svm.predict(X_test_cv)


In [12]:
print("Accuracy:", accuracy_score(y_test, pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_lr))
print("Classification Report:\n", classification_report(y_test, pred_lr))


Accuracy: 0.9775784753363229
Confusion Matrix:
 [[965   0]
 [ 25 125]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
#Top Imp words
feature_names = cv.get_feature_names_out()
coefficients = lr.coef_[0]
top_spam = np.argsort(coefficients)[-10:]
top_ham = np.argsort(coefficients)[:10]
print("Top Spam Words:")
print([feature_names[i] for i in top_spam])
print("Top Ham Words:")
print([feature_names[i] for i in top_ham])


Top Spam Words:
['content', 'text', 'reply', 'ringtone', 'stop', 'call', 'mobile', 'new', 'claim', 'txt']
Top Ham Words:
['ltgt', 'ill', 'sir', 'later', 'way', 'ok', 'da', 'going', 'thats', 'tell']
