# Import Relevent Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer


# Import Dataset

In [2]:
raw_data=pd.read_csv("D:\\SLIIT  FOC\\Y3\\Y3 S2\\MLOM\\Dataset\\spam.csv")

# Read and Checking information of the dataset

In [3]:
print(raw_data)

     Label                                          EmailText
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham             Will Ã_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [4]:
# replace the null values with a null string
data = raw_data.where((pd.notnull(raw_data)),'')

In [5]:
# printing the first 5 rows of the dataset
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# checking the number of rows and columns in the dataset
data.shape

(5572, 2)

In [7]:
#checking the information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Label      5572 non-null   object
 1   EmailText  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


# Split the dataset into independent and dependent

In [9]:
X=data.iloc[:,-1].values
print(X)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 ... 'Pity, * was in mood for that. So...any other suggestions?'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
 'Rofl. Its true to its name']


In [10]:
Y=data.iloc[:,:-1].values
print(Y)

[['ham']
 ['ham']
 ['spam']
 ...
 ['ham']
 ['ham']
 ['ham']]


# Convert text variable using CountVectorizer class

In [11]:
cvec=CountVectorizer()

In [12]:
cvec.fit(X)

CountVectorizer()

In [13]:
print("Word List: ", cvec.vocabulary_)



In [14]:
X = cvec.transform(X)

In [15]:
print("Encoded Dataset is:")
print(X.toarray())

Encoded Dataset is:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Split dataset into training and testing

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state=0)

# Applying SVM Algorithm

In [17]:
model = SVC(kernel = 'rbf', random_state = 0)
model.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


SVC(random_state=0)

# Accuracy

In [20]:
# prediction on training data
acc_train = model.score(X_train,Y_train)
print('Accuracy on training data : ', acc_train)

Accuracy on training data :  0.9950639443571909


In [21]:
#prediction on testing data
acc_test = model.score(X_test,Y_test)
print('Accuracy on testing data : ', acc_test)

Accuracy on testing data :  0.9757847533632287


# Using the trained model, predict whether the emails are spam or ham

In [31]:
#Using the trained model, predict whether the following five emails are spam or ham

X1 = ("Hey, you have won a car !!!!. Conrgratzz")
X2 = ("Dear applicant, Your CV has been recieved. Best regards")
X3 = ("You have received $1000000 to your account")
X4 = ("Join with our whatsapp group")
X5 = ("Kindly check the previous email. Kind Regards")

In [32]:
print(model.predict(cvec.transform([X1])))
print(model.predict(cvec.transform([X2])))
print(model.predict(cvec.transform([X3])))
print(model.predict(cvec.transform([X4])))
print(model.predict(cvec.transform([X5])))

['ham']
['ham']
['ham']
['ham']
['ham']
