# Email Spam Detection

In [45]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [7]:
# Load the labelled messages from spam.csv (columns: Category, Message)
# and preview the last five rows.
df = pd.read_csv(filepath_or_buffer='spam.csv')
df.tail(n=5)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [8]:
# Encode the label column as 1 for spam and 0 for everything else.
# The column is overwritten in place, so the check also accepts the already
# encoded value 1: a plain `x == 'spam'` test would map every label to 0 if
# this cell were executed a second time. With `isin` the cell is idempotent.
df['Category'] = df['Category'].isin(['spam', 1]).astype('int64')
df

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [26]:
# Features are the raw message text; the target is the encoded Category column.
mdf = df.drop('Category', axis='columns')

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every score computed from it) reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    mdf.Message,
    df.Category,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Inspect the training messages as a plain NumPy array of strings.
X_train.to_numpy()

array(['Babe ? I lost you ... Will you try rebooting ?',
       'Message from . I am at Truro Hospital on ext. You can phone me here. as I have a phone by my side',
       "I'm meeting Darren...", ..., 'Anything lar...',
       'PRIVATE! Your 2003 Account Statement for shows 800 un-redeemed S. I. M. points. Call 08718738002 Identifier Code: 48922 Expires 21/11/04',
       'Ic. There are a lotta childporn cars then.'], dtype=object)

In [31]:
# Learn the vocabulary from the training messages and turn them into a
# document-term count matrix in a single pass.
cv = CountVectorizer()
count_vector = cv.fit_transform(raw_documents=X_train.values)

# Dense view of the counts, shown for inspection only.
count_vector.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [46]:
# Fit multinomial Naive Bayes on the word counts.
# The sparse count matrix is passed directly: MultinomialNB accepts scipy sparse
# input, so converting with .toarray() would only allocate a large dense
# (n_messages x vocabulary) array without changing the fitted model.
model = MultinomialNB()
model.fit(count_vector, Y_train)

In [36]:
# Two hand-written messages used as a quick sanity check of the classifier.
emails = [
    'Hey Mehul , Lets watch Movie',
    'Flat 50% Discount , Buy Now, Dont Miss Reward!',
]

# Reuse the already-fitted vectorizer (transform only, no refit) so the columns
# line up with the vocabulary the model was trained on.
count_vetor_ip = cv.transform(emails)
count_vetor_ip.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [47]:
# Predict labels for the sample emails (1 = spam, 0 = ham). The sparse matrix is
# passed as-is; MultinomialNB handles sparse input, so no dense copy is needed.
model.predict(count_vetor_ip)

array([0, 1], dtype=int64)

In [48]:
# Mean accuracy on the held-out messages. The test text is vectorized with the
# vocabulary learned from the training set and scored as a sparse matrix, which
# avoids materialising a dense (n_test x vocabulary) array.
model.score(cv.transform(X_test.values), Y_test)

0.9883408071748879

# Pipeline

In [49]:
from sklearn.pipeline import Pipeline

In [50]:
# Show the raw training messages (a pandas Series of text) that are passed
# unvectorized to the pipeline below.
X_train

3929       Babe ? I lost you ... Will you try rebooting ?
1559    Message from . I am at Truro Hospital on ext. ...
2406                                I'm meeting Darren...
3209    Oops my phone died and I didn't even know. Yea...
5471                                                  Yup
                              ...                        
3277    Lol your right. What diet? Everyday I cheat an...
4125    Hey sexy buns ! Have I told you ? I adore you,...
2462                                      Anything lar...
4658    PRIVATE! Your 2003 Account Statement for shows...
5546           Ic. There are a lotta childporn cars then.
Name: Message, Length: 4457, dtype: object

In [51]:
# Chain vectorization and classification into one estimator so raw text can be
# passed straight to fit / predict / score without a manual transform step.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])
pipeline.fit(X_train, Y_train)

In [52]:
# Predicted labels (1 = spam, 0 = ham) for the raw held-out messages.
pipeline.predict(X=X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [53]:
# Mean accuracy of the pipeline on the held-out messages.
pipeline.score(X=X_test, y=Y_test)

0.9883408071748879

# Exercise

In [54]:
from sklearn.datasets import load_wine

In [56]:
# Load scikit-learn's bundled wine dataset as a Bunch (data, target, ...)
# and display it.
wines = load_wine(return_X_y=False)
wines

{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

# Exercise