In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
# Load the labelled message dataset from the working directory.
SPAM_CSV_PATH = 'spam.csv'
spamdata = pd.read_csv(SPAM_CSV_PATH)

In [3]:
# Preview the first rows of the raw frame to see the column layout.
spamdata.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Replace missing (null/NaN) values with an empty string

In [5]:
# Swap every missing cell for an empty string. `mask` returns a new frame,
# so the raw `spamdata` frame is not modified by this step.
missing_cells = spamdata.isna()
data = spamdata.mask(missing_cells, other='')

In [6]:
# Preview the frame after null replacement (missing cells now hold '').
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
# (rows, columns) before the unused columns are dropped.
data.shape

(5572, 5)

In [8]:
# Discard the three 'Unnamed' columns; only the label (v1) and text (v2)
# columns are used from here on.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = data.drop(columns=unused_columns)

In [9]:
# (rows, columns) after the drop; only v1 and v2 should remain.
data.shape

(5572, 2)

In [10]:
# Preview the trimmed frame (label column v1, text column v2).
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Count messages per label to see the ham/spam class balance.
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [12]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [13]:
# Encode the text labels numerically: spam -> 0, ham -> 1.
is_spam = data['v1'] == 'spam'
is_ham = data['v1'] == 'ham'
data.loc[is_spam, 'v1'] = 0
data.loc[is_ham, 'v1'] = 1

In [14]:
# Preview after relabelling: v1 now holds 0 (spam) / 1 (ham).
data.head()

Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Separate the frame into model input and target:
#   x - message text (column v2)
#   y - numeric label (column v1)
x, y = data['v2'], data['v1']

In [16]:
# Sanity check: number of text samples.
x.shape

(5572,)

In [17]:
# Sanity check: number of labels (should match x).
y.shape

(5572,)

In [18]:
# Train/test split
# NOTE(review): train_test_split is already imported in the first cell, so this
# re-import is redundant (harmless, but imports are best kept in one place).
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): the classes are imbalanced (4825 ham vs 747 spam); consider
# passing stratify=y so both splits keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=1,
)

In [20]:
# Feature extraction
# Transform the text data into TF-IDF feature vectors that can be used as model input (TfidfVectorizer).
# The text is also converted to lower case.
# min_df sets the minimum number of documents a term must appear in to be kept in the vocabulary.

In [21]:
# Build TF-IDF features from the message text.
#   min_df=1             - keep every term that occurs in at least one document
#   stop_words='english' - drop common English stop words
#   lowercase=True       - lower-case the text before tokenizing. This must be
#                          the boolean True: the string 'True' only worked by
#                          being truthy and is rejected by scikit-learn
#                          releases that validate parameter types.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vocabulary/IDF weights on the training text only, then reuse the
# fitted vectorizer for the test text so both share the same feature columns.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# The label column has object dtype after the 0/1 relabelling; cast both label
# sets to integers before handing them to the classifier.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [22]:
# Display the summary of the sparse TF-IDF matrix for the training set.
x_train_features

<4457x7468 sparse matrix of type '<class 'numpy.float64'>'
	with 34692 stored elements in Compressed Sparse Row format>

In [28]:
# Display the summary of the sparse TF-IDF matrix for the test set.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom before sharing.
x_test_features

<1115x7468 sparse matrix of type '<class 'numpy.float64'>'
	with 7636 stored elements in Compressed Sparse Row format>

In [24]:
# Inspect the integer-cast training labels.
y_train

1642    1
2899    1
480     1
3485    1
157     1
       ..
905     1
5192    1
3980    1
235     1
5157    1
Name: v1, Length: 4457, dtype: int32

In [25]:
# Inspect the integer-cast test labels.
y_test

1078    1
4028    1
958     1
4642    1
4674    0
       ..
324     1
1163    1
86      1
4214    1
90      1
Name: v1, Length: 1115, dtype: int32

In [26]:
# Train model: a linear support vector classifier, constructed with
# scikit-learn's default settings (no arguments are passed).
model=LinearSVC()

In [27]:
# Fit the classifier on the TF-IDF training features and integer labels.
model.fit(X=x_train_features, y=y_train)

LinearSVC()

In [34]:
# Evaluate on the training data: predict labels for the samples the model
# was fitted on.
y_pred_train = model.predict(X=x_train_features)

In [35]:
# Inspect the predicted labels for the training set.
y_pred_train

array([1, 1, 1, ..., 1, 1, 1])

In [36]:
# Accuracy score
# NOTE(review): accuracy_score is already imported in the first cell, so this
# re-import is redundant (harmless, but imports are best kept in one place).
from sklearn.metrics import accuracy_score

In [37]:
# Fraction of training samples whose predicted label matches the true label.
training_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
training_accuracy

0.9997756338344178

In [38]:
# Evaluate on the held-out test data: predict labels for messages the model
# was not fitted on.
y_pred_test = model.predict(X=x_test_features)

In [39]:
# Fraction of test samples whose predicted label matches the true label.
testing_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
testing_accuracy

0.9928251121076234

In [41]:
# Prediction on a new, single message.
input_mail = ['Eh u remember how 2 spell his name... Yes i did. He v naughty make until i v wet.']

# Turn the text into a feature vector with the already-fitted vectorizer so
# the columns line up with what the model was trained on.
input_mail_features = feature_extraction.transform(input_mail)

# Predict the label for the single message.
pred = model.predict(input_mail_features)

# Label 1 is ham; any other label is reported as spam.
print("ham mail" if pred[0] == 1 else "spam mail")

ham mail
