#   Spam email detection 

In [8]:
import pandas as pd
import numpy as np

In [9]:
# Read the labelled messages (Category / Message columns) from a CSV in the
# working directory and display the frame.
df = pd.read_csv("spam.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [10]:
# Class balance check: how many messages carry each label.
df["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

## Implement train/test split

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["Message"],
    df["Category"],
    test_size=0.2,
    random_state=1,
)

In [12]:
# Number of training messages (the 80% left after test_size=0.2).
X_train.shape

(4457,)

In [13]:
# Number of held-out test messages (the 20% reserved by test_size=0.2).
X_test.shape

(1115,)

In [14]:
# Peek at the raw training messages as a NumPy object array.
X_train.to_numpy()

array(["Hi , where are you? We're at  and they're not keen to go out i kind of am but feel i shouldn't so can we go out tomo, don't mind do you?",
       'If you r @ home then come down within 5 min',
       "When're you guys getting back? G said you were thinking about not staying for mcr",
       ...,
       'CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C YA 2MORO! WHO NEEDS BLOKES',
       'Text & meet someone sexy today. U can find a date or even flirt its up to U. Join 4 just 10p. REPLY with NAME & AGE eg Sam 25. 18 -msg recd@thirtyeight pence',
       'K k:) sms chat with me.'], dtype=object)

## Create bag of words representation using CountVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder with default settings; it is fitted on the training
# messages in the next step.
v = CountVectorizer()

In [16]:
# Learn the vocabulary from the training messages only and encode them as a
# document-term count matrix.
train_messages = X_train.to_numpy()
X_train_cv = v.fit_transform(train_messages)

# Dense view for inspection only; the model is trained on X_train_cv itself.
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# (number of training messages, number of distinct tokens in the vocabulary).
X_train_cv.shape

(4457, 7711)

In [18]:
# Tokens learned by the vectorizer; one entry per column of X_train_cv.
v.get_feature_names_out()

array(['00', '000', '008704050406', ..., 'zyada', 'èn', '〨ud'],
      dtype=object)

## Train the naive bayes model

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyperparameters, trained on the
# token-count matrix and the ham/spam labels.
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [20]:
# Encode the test messages with the vocabulary already fitted on the training
# set (transform, not fit_transform, so nothing is learned from the test split).
X_test_cv = v.transform(X_test)
# Dense view for inspection only.
X_test_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
# Accuracy of the classifier on the held-out test matrix.
model.score(X_test_cv, y_test)

0.989237668161435

In [22]:
from sklearn.metrics import classification_report

# Predict labels for the held-out messages and summarise per-class
# precision / recall / F1, which is more informative than accuracy alone
# given the ham/spam imbalance.
y_pred = model.predict(X_test_cv)

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       968
        spam       0.98      0.94      0.96       147

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



## Train the model using sklearn pipeline

In [23]:
from sklearn.pipeline import Pipeline

# Chain vectorisation and classification so raw text can be passed straight
# to fit/predict without a separate transform step.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(pipeline_steps)

In [24]:
# Fit both pipeline steps on the raw training text; the pipeline applies the
# vectorizer internally before training the classifier.
clf.fit(X_train, y_train)

In [25]:
# Evaluate the pipeline on the raw held-out messages; no manual transform is
# needed because the pipeline vectorises its input itself.
y_pred = clf.predict(X_test)

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       968
        spam       0.98      0.94      0.96       147

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



# Save the model as a pickle file and test the reloaded model

In [30]:
import pickle

# Persist the fitted classifier together with the fitted vectorizer: the model
# was trained on the vectorizer's vocabulary, so both are needed at inference.
# The context managers close (and therefore flush) each file deterministically
# instead of leaving the handles open for the garbage collector.
with open('model2.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('vector.pkl', 'wb') as vector_file:
    pickle.dump(v, vector_file)

In [31]:
# Reload both artefacts from disk to verify the round trip.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by a trusted source (here, the cell above).
# The context managers make sure the file handles are closed after reading.
with open('model2.pkl', 'rb') as model_file:
    modeltest = pickle.load(model_file)
with open('vector.pkl', 'rb') as vector_file:
    vectortest = pickle.load(vector_file)


In [34]:
# Smoke-test the reloaded artefacts on a single message.
# NOTE(review): this text matches the start of a row in spam.csv, so it may
# have been part of the training split; use an unseen message for a real check.
sample_messages = ['Free entry in 2 a wkly comp to win FA Cup fina']
vector = vectortest.transform(sample_messages)
resultspam = modeltest.predict(vector)[0]
resultspam

'spam'