In [1]:
#importing neccessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [2]:
#loading the dataset
df=pd.read_csv("spam.csv",encoding = "ISO-8859-1")

In [3]:
df.shape

(5572, 5)

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
#Considering only the necessary columns
df=df[['v1','v2']]
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [6]:
#checking for null values
df.isna().any()

v1    False
v2    False
dtype: bool

In [7]:
df.columns=['Category','Message']

In [8]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
#labelling spam mail as 0 and ham mail as 1
df.loc[df['Category']=='spam','Category']=0
df.loc[df['Category']=='ham','Category']=1
df


Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...
7,1,As per your request 'Melle Melle (Oru Minnamin...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...


In [14]:
#Splitting dataset into training and testing data
x_train,x_test,y_train,y_test=train_test_split(df['Message'],df['Category'],test_size=0.2,stratify=df['Category'],random_state=1)

In [15]:
x_train.shape

(4457,)

In [16]:
x_test.shape

(1115,)

In [28]:
#Feature Extraction
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
x_train_features=feature_extraction.fit_transform(x_train)
x_test_features=feature_extraction.transform(x_test)

In [29]:
#Set y_train , y_test values as integers
y_train=y_train.astype('int')
y_test=y_test.astype('int')

In [30]:
#Train model using SVM
model=LinearSVC()
model.fit(x_train_features,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [31]:
#Predicting on testing data
y_predicted=model.predict(x_test_features)

In [35]:
#Evaluating accuracy on testing data
accuracy_svm=accuracy_score(y_test,y_predicted)

In [37]:
accuracy_svm

0.9838565022421525

#### The model gives 98% accuracy using SVM

### Comparing accuracy using other classification algorithms

In [38]:
#using random forest
from sklearn.ensemble import RandomForestClassifier
model_rf=RandomForestClassifier()
model_rf.fit(x_train_features,y_train)
y_predicted=model_rf.predict(x_test_features)
accuracy_rf=accuracy_score(y_test,y_predicted)
accuracy_rf



0.9811659192825112

In [40]:
#Predicting whether a given mail is spam mail or ham mail
mail=[]
mail.append(input("Enter tha mail"))
mail_features=feature_extraction.transform(mail)
result=model.predict(mail_features)
if result==0:
    print('SPAM MAIL')
else:
    print('HAM MAIL')

Enter tha mailHad your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030
SPAM MAIL
