In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack

In [238]:
datapath="F:/college/bca_college/sixth sem/project/code/backend/email_dataset.csv"
df=pd.read_csv(datapath)
df.head()

Unnamed: 0,From,Subject,Body,SenderDomain,HasURL,NumURLs,NumCC,NumBCC,Label
0,startnow2002@hotmail.com,[ILUG] STOP THE MLM INSANITY,Greetings!\n\nYou are receiving this letter be...,hotmail.com,1,1,0,0,spam
1,sales@outsrc-em.com,New Product Announcement,NEW PRODUCT ANNOUNCEMENT\n\nFrom: OUTSOURCE EN...,outsrc-em.com,1,1,0,0,spam
2,ormlh@imail.ru,FW:,\n<HTML>\n<BODY bgColor=#C0C0C0>\n\n<FONT face...,imail.ru,0,0,0,0,spam
3,douglassmith2004@yahoo.co.uk,[SA] URGENT HELP..............,"\nDEAR SIR,\nURGENT AND CONFIDENTIAL:\n\nRe:Tr...",yahoo.co.uk,0,0,0,0,spam
4,dockut2@hotmail.com,Hello !,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",hotmail.com,1,1,0,0,spam


In [239]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4166 entries, 0 to 4165
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   From          4161 non-null   object
 1   Subject       4153 non-null   object
 2   Body          4124 non-null   object
 3   SenderDomain  4161 non-null   object
 4   HasURL        4166 non-null   int64 
 5   NumURLs       4166 non-null   int64 
 6   NumCC         4166 non-null   int64 
 7   NumBCC        4166 non-null   int64 
 8   Label         4166 non-null   object
dtypes: int64(4), object(5)
memory usage: 293.1+ KB


In [240]:
df.describe()

Unnamed: 0,HasURL,NumURLs,NumCC,NumBCC
count,4166.0,4166.0,4166.0,4166.0
mean,0.81085,7.321411,1.058089,0.0
std,0.391675,53.75388,5.981724,0.0
min,0.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,0.0
50%,1.0,2.0,0.0,0.0
75%,1.0,3.0,0.0,0.0
max,1.0,3133.0,97.0,0.0


In [241]:
df=df.drop(columns=["NumBCC"])

In [242]:
print(df.duplicated().sum())
df=df.drop_duplicates()
df.duplicated().sum()

200


np.int64(0)

In [243]:
df.isna().sum()

From             5
Subject         12
Body            39
SenderDomain     5
HasURL           0
NumURLs          0
NumCC            0
Label            0
dtype: int64

In [244]:
df=df.dropna(subset=["From"])

In [245]:
df.isna().sum()

From             0
Subject         11
Body            39
SenderDomain     0
HasURL           0
NumURLs          0
NumCC            0
Label            0
dtype: int64

In [246]:
df["Subject"]=np.where(df["Subject"].isna(),"",df["Subject"])
df["Body"]=np.where(df["Body"].isna(),"",df["Body"])

In [247]:
df.isna().sum()

From            0
Subject         0
Body            0
SenderDomain    0
HasURL          0
NumURLs         0
NumCC           0
Label           0
dtype: int64

In [248]:
x=df.drop('Label',axis=1)
y=np.array([1 if lbl=="spam" else 0 for lbl in df["Label"]])

In [249]:
vec_from=TfidfVectorizer()
vec_domain=TfidfVectorizer()
vec_sub=TfidfVectorizer()
vec_body=TfidfVectorizer(max_features=5000)

In [250]:
x_from=vec_from.fit_transform(x["From"])
x_sub=vec_sub.fit_transform(x["Subject"])
x_domain=vec_domain.fit_transform(x["SenderDomain"])
x_body=vec_body.fit_transform(x["Body"])

In [251]:
numeric_features = x[['NumURLs', 'NumCC']].values
has_url=x[["HasURL"]].values
scaler=StandardScaler()
x_numeric = scaler.fit_transform(numeric_features)
x = hstack([x_from, x_sub, x_body, x_domain, x_numeric,has_url])


In [252]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((3168, 14569), (793, 14569), (3168,), (793,))

In [253]:
lr_clf=LogisticRegression(max_iter=148)
lr_clf.fit(x_train,y_train)
y_pred=lr_clf.predict(x_test)
print('Logistic Regression')
print('\tPrecision:',precision_score(y_test,y_pred))
print('\tRecall:',recall_score(y_test,y_pred))
print('\tF1:',f1_score(y_test,y_pred))

Logistic Regression
	Precision: 0.978494623655914
	Recall: 0.9680851063829787
	F1: 0.9732620320855615


In [254]:
dt_clf=DecisionTreeClassifier()
dt_clf.fit(x_train,y_train)
y_pred_dt=dt_clf.predict(x_test)
print('Decision Tree')
print('\tPrecision:',precision_score(y_test,y_pred_dt))
print('\tRecall:',recall_score(y_test,y_pred_dt))
print('\tF1:',f1_score(y_test,y_pred_dt))

Decision Tree
	Precision: 0.9169675090252708
	Recall: 0.900709219858156
	F1: 0.9087656529516994


In [255]:
rf_clf=RandomForestClassifier(n_estimators=140)
rf_clf.fit(x_train,y_train)
y_pred_rf=rf_clf.predict(x_test)
print('Random Forest')
print('\tPrecision:',precision_score(y_test,y_pred_rf))
print('\tRecall:',recall_score(y_test,y_pred_rf))
print('\tF1:',f1_score(y_test,y_pred_rf))

Random Forest
	Precision: 0.9710144927536232
	Recall: 0.950354609929078
	F1: 0.9605734767025089


In [None]:
'''import joblib

joblib.dump(lr_clf,'email_model.pkl')
joblib.dump(vec_sub,'sub_vectorizer.pkl')
joblib.dump(vec_domain,'domain_vectorizer.pkl')
joblib.dump(vec_body,'body_vectorizer.pkl')
joblib.dump(vec_from,'from_vectorizer.pkl')
joblib.dump(scaler,'email_scaler.pkl')'''

['email_scaler.pkl']