Spam Email Detection

Step 1: Importing all the libraries needed to train the model

In [123]:
# Import all the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # it is used to extract features from text
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

Step 2: Data preprocessing starts here.

In [124]:
# Load the raw dataset from spam.csv into a pandas DataFrame.
raw_data = pd.read_csv('spam.csv')

# Remove the three unnamed columns explicitly by name. The previous version
# overwrote them with NaN and then dropped every all-NaN column, which hid the
# intent and would also have removed any unrelated column that happened to be
# entirely empty. errors='ignore' keeps the cell working if the CSV does not
# contain these columns.
raw_data = raw_data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)

# Display the frame; pandas truncates long frames to the first and last rows,
# so no hard-coded row count is needed.
raw_data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [125]:
# Encode the label column 'v1' numerically: 'spam' becomes 0 and 'ham' becomes 1.
# The replacement is done in place on raw_data, one tag at a time.
label_codes = {'spam': 0, 'ham': 1}
for tag, code in label_codes.items():
    rows_with_tag = raw_data['v1'] == tag
    raw_data.loc[rows_with_tag, 'v1'] = code

print("Labeling is Done")
raw_data.head(5572)


Labeling is Done


Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will �_ b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [126]:
# Split the frame into the message text (X, column 'v2') and its label (Y, column 'v1').
X, Y = raw_data['v2'], raw_data['v1']

# Show both series, with a dotted divider line between them.
divider = '...........................................................\n'
print(X)

print(divider, Y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will �_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object
...........................................................
 0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: object


Step 3: Train Test Split

In [127]:
# Partition the texts and labels into a training set and a test set using
# train_test_split (imported in the first cell).
# train_size=0.8 / test_size=0.2 -> 80% of the rows are used for training and
# 20% are held out for testing; random_state=3 fixes the shuffle so the split
# is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=3,
)

Step 4: Feature Extraction begins from here

In [128]:
# Transform the text data into TF-IDF feature vectors that can be used as
# input to the SVM model.
#
# lowercase=True converts all text to lower case before tokenizing. It must be
# the boolean True, not the string 'True': the string only worked by being
# truthy, and scikit-learn releases that validate constructor parameters
# reject it when the vectorizer is fitted.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels are stored with dtype object after the in-place relabelling;
# convert Y_train and Y_test to integers for the classifier.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


Training the model ------> Support Vector Machine

In [129]:
# Create a linear support vector classifier with default settings and fit it
# on the TF-IDF training features and their integer labels.
model = LinearSVC()
model.fit(X=X_train_features, y=Y_train)

LinearSVC()

Evaluation of the Model

In [130]:
# Predict on the training data and measure the accuracy.
# Note: the accuracy on training data is not that important, because the model
# has already seen these samples while being fitted.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

print("The accuracy on training data is: ", accuracy_on_training_data)

The accuracy on training data is:  0.9995512676688355


In [131]:
# Predict on the held-out test data and measure the accuracy.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

print("The accuracy on test data is: ", accuracy_on_test_data)

The accuracy on test data is:  0.9856502242152466


Testing the trained model on new emails which were not in raw data


In [135]:
# New e-mail text(s) to classify. Add more strings to the list to classify
# several messages at once.
input_email = ['Hi there,Congratulations on winning a GoodLuck jade plant. This is the first part of your Indoor Garden!']

# Convert the text to feature vectors with the already-fitted vectorizer,
# then predict one label per input message.
input_email_features = feature_extraction.transform(input_email)
prediction = model.predict(input_email_features)

# model.predict returns an array with one label per message. Report each one
# individually: `if prediction == 0` takes the truth value of the whole array
# and raises ValueError as soon as more than one message is passed in.
# Label 0 means spam, anything else means not spam.
for predicted_label in prediction:
    if predicted_label == 0:
        print("Spam")
    else:
        print("Not Spam")

Not Spam
