# Email Spam /Not Spam

# Import Libraries

In [5]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [49]:
data=pd.read_csv(r"spam.csv")

In [51]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data preprocessing

In [54]:
# Download nltk.download
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Varsha
[nltk_data]     Arun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
stop_words=set(stopwords.words("english"))

# Tokenization and cleaning

In [59]:
data['Message']=data['Message'].apply(lambda x: ' '.join(word.lower() for word in word_tokenize(x) if word.isalpha())) # Apply will take each row and lambda function. X in input
#.join will join throught as space

In [60]:
data.head()

Unnamed: 0,Category,Message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i do think he goes to usf he lives around ...


In [67]:
# stop word removal:
data['Message']=data['Message'].apply(lambda x:' '.join(word for word in x.split() if word not in stop_words))

In [69]:
data.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts may...
3,ham,u dun say early hor u c already say
4,ham,nah think goes usf lives around though


# Feature Extraction

In [176]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [178]:
vectorizer=TfidfVectorizer()

In [180]:
vectorizer.fit(data['Message'])

In [182]:
X=vectorizer.transform(data['Message']) # fit will learn data and will transfrom what you have learned. Tfidf will convert string into numerical form

In [132]:
X.get_shape()

(5572, 7198)

In [134]:
# split data in train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,data['Category'], test_size=0.2, random_state=42)

# Build And TRain Model

In [137]:
from sklearn.naive_bayes import MultinomialNB

In [139]:
model=MultinomialNB()
model.fit(X_train,y_train)

In [141]:
y_pred=model.predict(X_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

# REsult`

In [204]:
from sklearn.metrics import accuracy_score, classification_report
print('Accuracy: ', accuracy_score(y_test,y_pred))

Accuracy:  0.9713004484304932


In [146]:
print('Classification report: \n\n', classification_report(y_test,y_pred))

Classification report: 

               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



# Example

In [254]:
m1="Lottery winner winner you should must invest"
#m1="Your Sunday morning trip with Uber"
m1="Shruti is a good girl"

In [256]:
m1=' '.join(word.lower() for word in word_tokenize(m1) if word.isalpha())

In [258]:
m1=' '.join(word for word in m1.split() if word not in stop_words)

In [260]:
m1

'shruti good girl'

In [262]:
m2=vectorizer.transform([m1])
m2

<1x7198 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [264]:
model.predict(m2)[0]

'ham'