In [45]:
#For Data Set Preparation we need this numpy and pandas
import numpy as np
import pandas as pd

In [46]:
#Load The Dataset
data = pd.read_csv("Spam Email Detection - spam.csv")
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will �_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [47]:
data.shape

(5572, 5)

In [48]:
#Checking Missing Value
data.isnull().isnull().sum()

v1            0
v2            0
Unnamed: 2    0
Unnamed: 3    0
Unnamed: 4    0
dtype: int64

In [49]:
#To extract V1 & V2
df_data = data[["v1","v2"]]

In [50]:
df_data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [51]:
#Target Variable to classify Email Spam or Not
x = df_data['v2']
y = df_data['v1']

In [52]:
x,y

(0       Go until jurong point, crazy.. Available only ...
 1                           Ok lar... Joking wif u oni...
 2       Free entry in 2 a wkly comp to win FA Cup fina...
 3       U dun say so early hor... U c already then say...
 4       Nah I don't think he goes to usf, he lives aro...
                               ...                        
 5567    This is the 2nd time we have tried 2 contact u...
 5568                Will �_ b going to esplanade fr home?
 5569    Pity, * was in mood for that. So...any other s...
 5570    The guy did some bitching but I acted like i'd...
 5571                           Rofl. Its true to its name
 Name: v2, Length: 5572, dtype: object,
 0        ham
 1        ham
 2       spam
 3        ham
 4        ham
         ... 
 5567    spam
 5568     ham
 5569     ham
 5570     ham
 5571     ham
 Name: v1, Length: 5572, dtype: object)

## Feature Extraction from Text

In [53]:
from sklearn.feature_extraction.text import CountVectorizer    
#CountVectorizer is a powerful tool from Scikit-learn library that speeds up this feature extraction process from text.

In [54]:
values = x #we save the input variable into a new variable,values. 
cv = CountVectorizer()
X = cv.fit_transform(values)#The fit_transform ensures that the CountVectorizer completely fits our input dataset and no data point is left out. 
#Therefore, all the raw text will be converted into vectors of numeric values.

In [55]:
X.toarray() #will convert the numeric values into an array of numbers.

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Model Building

In [56]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [58]:
classifier = SVC(kernel='rbf',random_state=0)

In [59]:
classifier.fit(X_train,y_train)      #Fitting The Model

SVC(random_state=0)

In [60]:
#Model Evaluation
classifier.predict(X_test)

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [61]:
comment = ["Ok lar...Joking wif u oni..."]
vect = cv.transform(comment).toarray()

In [62]:
classifier.predict(vect)

array(['ham'], dtype=object)

In [63]:
comment1 = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to eceive entry question(std txt rate)T&C's apply 08452810075over18's"]
vect = cv.transform(comment1).toarray()
classifier.predict(vect)

array(['spam'], dtype=object)

In [64]:
#Accuracy
print("Accuracy of Model",classifier.score(X_test,y_test)*100,"%")

Accuracy of Model 97.48743718592965 %
