In [15]:
# Importing important libraries
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import wordcloud
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pushk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pushk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pushk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pushk\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [16]:
# Importing the dataset
url = 'https://raw.githubusercontent.com/priyalagarwal27/E-mail-spam-detection/main/messages.csv'
df = pd.read_csv(url, encoding = 'latin-1')
df.head()

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


In [17]:
df.dtypes

subject    object
message    object
label       int64
dtype: object

In [18]:
# Count of labels
print('Count of label : ', df['label'].value_counts())

Count of label :  label
0    2412
1     481
Name: count, dtype: int64


In [19]:
# Let us check the proportion of labels
print("Not a Spam Email Ratio i.e. 0 label:",round(len(df[df['label']==0])/len(df['label']),2)*100,"%")
print("Spam Email Ratio that is 1 label:",round(len(df[df['label']==1])/len(df['label']), 2)*100,"%")

Not a Spam Email Ratio i.e. 0 label: 83.0 %
Spam Email Ratio that is 1 label: 17.0 %


In [20]:
# Adding length column to our dataset
df['length'] = df.message.str.len()
df.head()

Unnamed: 0,subject,message,label,length
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0,2856
1,,"lang classification grimes , joseph e . and ba...",0,1800
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0,1435
3,risk,a colleague and i are researching the differin...,0,324
4,request book information,earlier this morning i was on the phone with a...,0,1046


In [21]:
# Removing stopwords
stop_words = set(stopwords.words('english') + ['u','ü','ur','4','2','im','dont','doin','ure'])
df['message'] = df['message'].apply(lambda x : ' '.join(term for term in x.split() if term not in stop_words))
df.head()

Unnamed: 0,subject,message,label,length
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0,2856
1,,"lang classification grimes , joseph e . barbar...",0,1800
2,query : letter frequencies for text identifica...,posting inquiry sergei atamas ( satamas @ umab...,0,1435
3,risk,colleague researching differing degrees risk p...,0,324
4,request book information,earlier morning phone friend mine living south...,0,1046


In [22]:
# Adding clean_length column to our dataset
df['clean_length'] = df.message.str.len()
df.head()

Unnamed: 0,subject,message,label,length,clean_length
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0,2856,2483
1,,"lang classification grimes , joseph e . barbar...",0,1800,1569
2,query : letter frequencies for text identifica...,posting inquiry sergei atamas ( satamas @ umab...,0,1435,1144
3,risk,colleague researching differing degrees risk p...,0,324,220
4,request book information,earlier morning phone friend mine living south...,0,1046,719


In [23]:
# Getting proportion of redundant words removed
df['prop_removed'] = ((df['length'] - df['clean_length']) / df['length']) * 100
df.head()

Unnamed: 0,subject,message,label,length,clean_length,prop_removed
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0,2856,2483,13.060224
1,,"lang classification grimes , joseph e . barbar...",0,1800,1569,12.833333
2,query : letter frequencies for text identifica...,posting inquiry sergei atamas ( satamas @ umab...,0,1435,1144,20.278746
3,risk,colleague researching differing degrees risk p...,0,324,220,32.098765
4,request book information,earlier morning phone friend mine living south...,0,1046,719,31.26195


In [24]:
df['prop_removed'].describe()

count    2893.000000
mean       19.435359
std         7.403646
min         0.399848
25%        14.372163
50%        18.936109
75%        24.521739
max        50.000000
Name: prop_removed, dtype: float64

In [25]:
# About 20% (mean) of the words are getting removed.

In [26]:
# Getting inormation about the length of removed part
print('Original Dataset : ', df.length.sum())
print('Cleaned Dataset : ', df.clean_length.sum())
print('Total Words Removed : ', (df.length.sum()) - df.clean_length.sum())

Original Dataset :  9344743
Cleaned Dataset :  7540590
Total Words Removed :  1804153


In [27]:
# Approximately, 20% of the redundant data is removed in the cleaning process.

In [28]:
# NLP Modelling
tf_vec = TfidfVectorizer()
SVM = SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')
features = tf_vec.fit_transform(df['message'])
X = features
y = df['label']

In [29]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=109)

In [30]:
#Train the model using the training sets
SVM.fit(X_train, y_train)

In [31]:
#Predict the response for test dataset
y_pred = SVM.predict(X_test)

In [32]:
# Getting final score
print('Final Score => ', accuracy_score(y_test, y_pred))

Final Score =>  0.9896373056994818
