In [63]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [64]:
# Data cleaning: load the raw CSV, discard the three 'Unnamed' columns,
# and give the two remaining columns descriptive names.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'target', 'v2': 'text'})
)

In [66]:
# Preview the first rows of the cleaned frame (columns: target, text).
data.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [67]:
# EDA: count how many messages fall into each target class.
data['target'].value_counts()

target
ham     4825
spam     747
Name: count, dtype: int64

In [68]:
# Replace the string labels in 'target' with integer codes.
# LabelEncoder numbers classes in sorted order, so 'ham' -> 0 and 'spam' -> 1.
encoder = LabelEncoder().fit(data['target'])
data['target'] = encoder.transform(data['target'])

In [69]:
# Re-check the frame: 'target' should now hold integer codes instead of strings.
data.head(5)

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [70]:
# Pull out the model inputs and labels with explicit dtypes:
# x is the message text as str, y is the encoded target as int.
x, y = data['text'].astype(str), data['target'].astype(int)

In [71]:
# Bag-of-words vectorizer, constructed with scikit-learn's default settings.
cv = CountVectorizer()

In [72]:
# Learn the vocabulary from x and build the document-term count matrix.
x_transformed = cv.fit_transform(x)

In [73]:
# Shape of the raw text Series (one entry per message).
# NOTE(review): x_transformed.shape may have been intended here, to show the
# size of the document-term matrix - confirm.
x.shape

(5572,)

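Bag-of-words example. Take three short documents:

1. The Cat
2. The Dog
3. The Bird

Each distinct word becomes a column, and each row records how many times that word occurs in the document:

| Document | The | Cat | Dog | Bird |
|----------|-----|-----|-----|------|
| 1        | 1   | 1   | 0   | 0    |
| 2        | 1   | 0   | 1   | 0    |
| 3        | 1   | 0   | 0   | 1    |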

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [75]:
# TF-IDF vectorizer, constructed with scikit-learn's default settings.
tfidf_vectorizer = TfidfVectorizer()

In [76]:
# Fit TF-IDF on every message in x and transform them into the feature matrix.
# NOTE(review): the fit happens on the full corpus before the train/test split,
# so the IDF statistics include test messages - consider fitting on the
# training split only.
x_tfidf = tfidf_vectorizer.fit_transform(x)

In [77]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

In [78]:
# Train a multinomial Naive Bayes classifier on the training split.
model = MultinomialNB()
model.fit(X=x_train, y=y_train)

In [79]:
# Mean accuracy on the held-out test split, printed as a percentage.
accuracy = model.score(x_test, y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 96.23%


In [80]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [81]:
# Persist the trained classifier to disk.
with open("spam.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [82]:
# Persist the fitted TF-IDF vectorizer - the one whose output the model was
# trained on. Saving the CountVectorizer (`cv`) instead would feed raw term
# counts at prediction time to a model that was fit on TF-IDF weights.
with open("vectorizer.pkl", "wb") as file:
    pickle.dump(tfidf_vectorizer, file)

In [83]:
# Reload the classifier from disk to check that the saved artifact is usable.
# pickle.load can execute arbitrary code, so only load files you trust.
with open("spam.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [84]:
# Reload the saved vectorizer from disk.
# pickle.load can execute arbitrary code, so only load files you trust.
with open("vectorizer.pkl", "rb") as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

In [85]:
# Smoke test: classify one sample message with the reloaded artifacts.
msg = '07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorr'
# Use a dedicated name for the batch rather than `data`, so the DataFrame
# loaded at the top of the notebook is not overwritten by a list and the
# earlier cells can still be re-run.
messages = [msg]
vect = loaded_vectorizer.transform(messages)
prediction = loaded_model.predict(vect)
# The printed value is the encoded label (0 = ham, 1 = spam).
print(f"Predicted: {prediction[0]}")

Predicted: 1
