<a href="https://colab.research.google.com/github/SupunGurusinghe/ML-Series/blob/main/support_vector_machine_spam_email_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Import Important Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import svm

### **Load our Dataset**

In [2]:
from google.colab import files
uploaded = files.upload()

Saving spam.csv to spam.csv


In [3]:
data = pd.read_csv('spam.csv')
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### **Checking the information of the dataset**

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Label      5572 non-null   object
 1   EmailText  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### **Duplicates handling**

In [5]:
print(data.duplicated().sum())

403


In [8]:
data = data.drop_duplicates(keep='last')
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...


### **Checking the information of the dataset**

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Label      5169 non-null   object
 1   EmailText  5169 non-null   object
dtypes: object(2)
memory usage: 121.1+ KB


### **Checking Class counts**

In [10]:
data['Label'].value_counts()

ham     4516
spam     653
Name: Label, dtype: int64

### **Splitting our data into X and y**

In [11]:
X = data['EmailText'].values
y = data['Label'].values

### **Splitting our data into training and testing.**

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0)

### **Converting text into integer using CountVectorizer()**

In [13]:
# Converting String to Integer
cv = CountVectorizer() 
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [14]:
print(X_train)

  (0, 7243)	1
  (0, 6826)	1
  (0, 3118)	1
  (0, 6722)	1
  (0, 2362)	1
  (1, 3837)	1
  (1, 3081)	1
  (1, 1208)	1
  (1, 3434)	1
  (2, 3483)	1
  (2, 7382)	1
  (2, 2058)	1
  (2, 4862)	1
  (2, 7570)	1
  (2, 4770)	1
  (3, 4844)	1
  (3, 6717)	1
  (4, 5498)	1
  (4, 6750)	1
  (4, 841)	1
  (4, 2903)	1
  (4, 6677)	1
  (4, 5683)	1
  (4, 7129)	1
  (5, 7265)	1
  :	:
  (4133, 1006)	1
  (4133, 7279)	1
  (4133, 6856)	1
  (4133, 2016)	1
  (4133, 6210)	1
  (4133, 7566)	1
  (4134, 3483)	1
  (4134, 3703)	2
  (4134, 996)	1
  (4134, 7075)	1
  (4134, 3325)	1
  (4134, 7412)	1
  (4134, 4058)	1
  (4134, 960)	1
  (4134, 1180)	1
  (4134, 5983)	1
  (4134, 6205)	1
  (4134, 2819)	1
  (4134, 1473)	1
  (4134, 5440)	1
  (4134, 1105)	1
  (4134, 3222)	1
  (4134, 2114)	1
  (4134, 997)	1
  (4134, 6159)	1


In [15]:
print(X_test)

  (0, 972)	1
  (0, 1060)	1
  (0, 1572)	1
  (0, 2519)	1
  (0, 4810)	1
  (0, 6224)	1
  (0, 6262)	1
  (0, 6704)	2
  (0, 7555)	1
  (0, 7570)	1
  (1, 935)	1
  (1, 2463)	1
  (1, 2740)	1
  (1, 3053)	1
  (1, 3130)	1
  (1, 3156)	1
  (1, 3715)	1
  (1, 4031)	1
  (1, 4352)	1
  (1, 4757)	1
  (1, 4844)	1
  (1, 4868)	1
  (1, 5466)	1
  (1, 6486)	1
  (1, 6746)	1
  :	:
  (1033, 299)	1
  (1033, 364)	1
  (1033, 571)	1
  (1033, 1280)	1
  (1033, 1458)	1
  (1033, 1603)	2
  (1033, 1720)	1
  (1033, 2040)	1
  (1033, 2072)	1
  (1033, 2735)	1
  (1033, 3703)	1
  (1033, 3837)	1
  (1033, 4203)	1
  (1033, 4426)	1
  (1033, 4638)	1
  (1033, 4770)	1
  (1033, 4778)	1
  (1033, 4862)	1
  (1033, 5076)	1
  (1033, 5191)	1
  (1033, 5488)	1
  (1033, 5519)	1
  (1033, 6036)	1
  (1033, 6826)	1
  (1033, 7576)	1


### **Applying SVM algorithm**

In [16]:
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

SVC(random_state=0)

### **Predicted values**

In [17]:
pred = classifier.predict(X_test)
pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'spam'], dtype=object)

### **Accuracy**

In [18]:
print(classifier.score(X_test,y_test))

0.9777562862669246


### **Confusion matrix**

In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(pred, y_test)

array([[908,  22],
       [  1, 103]])

### **Test for given outputs**

#### **Individual Items**

In [20]:
pred_list = ['Hey, you have won a car !!!!. Conrgratzz',
             'Dear applicant, Your CV has been recieved. Best regards',
             'You have received $1000000 to your account',
             'Join with our whatsapp group',
             'Kindly check the previous email. Kind Regards']
pred_list = np.array(pred_list)
classifier.predict(cv.transform(pred_list))

array(['ham', 'ham', 'ham', 'ham', 'ham'], dtype=object)

#### **Function to display any result**

In [21]:
def predict_spam(cv, classifier, pred_list):
  df = pd.DataFrame(columns=['Label', 'ItemText']) 
  pred_list_arr = np.array(pred_list)
  result_arr = classifier.predict(cv.transform(pred_list_arr))
  for x in range(0, len(pred_list)):
    result_dict = {}
    result_dict['Label'] = result_arr[x]
    result_dict['ItemText'] = pred_list[x]
    df = df.append(result_dict, ignore_index=True)
  return df

In [22]:
pred_list = ['Hey, you have won a car !!!!. Conrgratzz',
             'Dear applicant, Your CV has been recieved. Best regards',
             'You have received $1000000 to your account',
             'Join with our whatsapp group',
             'Kindly check the previous email. Kind Regards']
predict_spam(cv, classifier, pred_list)

Unnamed: 0,Label,ItemText
0,ham,"Hey, you have won a car !!!!. Conrgratzz"
1,ham,"Dear applicant, Your CV has been recieved. Bes..."
2,ham,You have received $1000000 to your account
3,ham,Join with our whatsapp group
4,ham,Kindly check the previous email. Kind Regards
