In [10]:
import os

import pandas as pd

# The dataset location is machine-specific. Allow it to be overridden through the
# SPAM_CSV_PATH environment variable so the notebook can run on other machines;
# the original local path is kept as the fallback so existing runs are unaffected.
DATA_PATH = os.environ.get("SPAM_CSV_PATH", r"C:\Users\HP\Downloads\spam.csv")

# The file is read as latin-1 instead of pandas' default utf-8.
df = pd.read_csv(DATA_PATH, encoding="latin-1")
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [11]:
# Keep only the two useful columns and give them descriptive names.
# Renaming by name (instead of assigning df.columns positionally) cannot silently
# mislabel columns, and the explicit .copy() makes df an independent frame so that
# adding columns to it later does not trigger pandas' SettingWithCopyWarning.
df = (
    df[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .copy()
)
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# A notebook cell only renders its last expression, so a bare `df.shape` on the
# first line would be evaluated and discarded. Print it explicitly.
print("Shape:", df.shape)

# Class balance of the label column (rendered as the cell output).
df['label'].value_counts()


label
ham     4825
spam     747
Name: count, dtype: int64

In [13]:
# Numeric target column: ham -> 0, spam -> 1.
df['label_num'] = df['label'].map({'ham':0, 'spam':1})
df.head()


Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [14]:
# Preview the first 50 rows (a larger sample than the default head() preview).
df.head(50)

Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [17]:
import re
from nltk.stem.porter import PorterStemmer

# Single shared Porter stemmer instance; clean_text calls ps.stem on each token.
ps = PorterStemmer()

# Small hand-picked stop-word list; clean_text drops any token found in this set.
stop_words = {
    # articles and conjunctions
    'a', 'an', 'the', 'and', 'or', 'but', 'if',
    # forms of "to be"
    'is', 'am', 'are', 'was', 'were', 'be', 'been',
    # prepositions
    'to', 'for', 'in', 'on', 'at', 'of', 'with', 'from', 'as', 'by',
    # pronouns and determiners
    'this', 'that', 'it', 'you', 'we', 'they', 'he', 'she',
    # negation
    'not',
}

def clean_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps: lowercase the text, replace every non-letter character with a space,
    split on whitespace, drop tokens present in ``stop_words``, and stem the
    remaining tokens with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text. Non-string values (e.g. ``None`` or the float NaN that
        pandas produces for empty CSV cells) are treated as an empty message.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing remains.
    """
    # Missing cells arrive as NaN/None and have no .lower(); treat them as empty
    # instead of raising AttributeError in the middle of a DataFrame.apply.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Digits, punctuation and any non-ASCII characters all become separators.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = text.split()
    # Stop words are filtered on the unstemmed token, then survivors are stemmed.
    tokens = [ps.stem(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)




In [18]:
# Run every message through clean_text and store the result next to the raw text.
cleaned_messages = df['text'].map(clean_text)
df['cleaned_text'] = cleaned_messages
df.head()



Unnamed: 0,label,text,label_num,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only ...",0,go until jurong point crazi avail onli bugi n ...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,0,u dun say so earli hor u c alreadi then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah i don t think goe usf live around here though


In [19]:
from sklearn.model_selection import train_test_split

# Features are the cleaned messages; the target is the numeric label column.
X, y = df['cleaned_text'], df['label_num']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sizes of the two partitions, shown as the cell output.
(X_train.shape, X_test.shape)


((4457,), (1115,))

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer and learn its vocabulary from the training split only
# (X_test is not seen here; it is only transformed in a later cell).
tfidf = TfidfVectorizer(max_features=3000)   # limit vocabulary size
tfidf.fit(X_train)


0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [21]:
# Convert both splits to TF-IDF features using the vectorizer fitted on X_train.
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [22]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyperparameters, trained on the TF-IDF
# features of the training split and the numeric labels (0 = ham, 1 = spam).
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [23]:
# Predicted labels for the held-out test features; compared against y_test below.
y_pred = model.predict(X_test_tfidf)


In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compute each metric once, then print them in the same order and format.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


Accuracy: 0.9704035874439462

Confusion Matrix:
 [[965   0]
 [ 33 117]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.78      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [25]:
import pickle

# Persist the trained classifier and the fitted vectorizer. Both are needed at
# inference time. Context managers guarantee the files are flushed and closed
# even if pickling raises; the bare open(...) calls left the handles dangling.
with open("spam_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("spam_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)


In [26]:
# Reload the persisted artifacts. Context managers close the file handles.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("spam_model.pkl", "rb") as model_file:
    model_loaded = pickle.load(model_file)

with open("spam_vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer_loaded = pickle.load(vectorizer_file)

sample = ["Congratulations! You won a free prize!"]

# The vectorizer was fitted on the output of clean_text (stemmed, stop words
# removed), so new messages must go through the same preprocessing; otherwise raw
# words such as "congratulations" never match the stemmed vocabulary.
sample_cleaned = [clean_text(message) for message in sample]
sample_vec = vectorizer_loaded.transform(sample_cleaned)
model_loaded.predict(sample_vec)


array([1], dtype=int64)