In [14]:
import pandas as pd
import re
import string
import nltk
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Read CSV file into DataFrame
df = pd.read_csv("./spam.csv")

df.dropna(inplace=True)

df['text'] = df['text'].str.lower()

df['text'] = df['text'].apply(lambda x: re.sub('\d+', '', x))

df['text'] = df['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

df['text'] = df['text'].apply(lambda x: x.strip())

df['text'] = df['text'].apply(lambda x: re.sub(r'http\S+', '', x))

stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

stemmer = SnowballStemmer('english')
df['text'] = df['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

df.head()

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,label,text
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkts st ...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah dont think goe usf live around though


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.4, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)

Confusion Matrix:
 [[1930    0]
 [  90  209]]
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1930
        spam       1.00      0.70      0.82       299

    accuracy                           0.96      2229
   macro avg       0.98      0.85      0.90      2229
weighted avg       0.96      0.96      0.96      2229



In [22]:
import joblib
joblib.dump(clf, 'spam_classifier.pkl')

['spam_classifier.pkl']