In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the first CSV and preview its leading rows.
# NOTE: the following cell rebinds both `datapath` and `dataset`, so this
# frame only serves as a quick look at the file.
datapath = "F:/college/bca_college/sixth sem/project/code/mail_data.csv"
dataset = pd.read_csv(datapath)
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Load the e-mail corpus and expose its `label` / `text` columns under the
# `Category` / `Message` names used by the rest of the notebook.
datapath = "F:/college/bca_college/sixth sem/project/code/data_files/spam_Emails_data.csv"
dataset = (
    pd.read_csv(datapath)
    .assign(
        Category=lambda frame: frame["label"],
        Message=lambda frame: frame["text"],
    )
    .drop(columns=["label", "text"])
)
dataset.head()

Unnamed: 0,Category,Message
0,Spam,viiiiiiagraaaa\nonly for the ones that want to...
1,Ham,got ice thought look az original message ice o...
2,Spam,yo ur wom an ne eds an escapenumber in ch ma n...
3,Spam,start increasing your odds of success & live s...
4,Ham,author jra date escapenumber escapenumber esca...


In [4]:
# Report how many rows are exact duplicates of an earlier row, then drop them.
duplicates = dataset.duplicated().sum()
print(duplicates)
dataset = dataset.drop_duplicates()

# Re-count after the drop as a sanity check.
dataset.duplicated().sum()

0


np.int64(0)

In [5]:
# Per-column count of missing values BEFORE cleaning. It is printed explicitly
# because a notebook only renders a cell's final expression; a bare expression
# in the middle of the cell is evaluated and then discarded.
print(dataset.isna().sum())

# Remove every row that has a missing value in any column.
dataset = dataset.dropna()

# Per-column count AFTER cleaning (rendered as the cell output).
dataset.isna().sum()

Category    0
Message     0
dtype: int64

In [6]:
# Show the frame's (rows, columns) shape, then its per-column dtype and
# non-null summary.
print(dataset.shape)
dataset.info()

(193850, 2)
<class 'pandas.core.frame.DataFrame'>
Index: 193850 entries, 0 to 193851
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Category  193850 non-null  object
 1   Message   193850 non-null  object
dtypes: object(2)
memory usage: 4.4+ MB


In [7]:
# Model input: the raw message text (vectorised in a later cell).
x = dataset['Message']

# Binary target: 1 when the category is "ham" (case-insensitive), 0 for any
# other category value.
y = np.array([int(label.lower() == "ham") for label in dataset['Category']])

In [8]:
# 80/20 train/test split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four partitions, in the order they were unpacked.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((155080,), (38770,), (155080,), (38770,))

In [9]:
# Keep the first seven raw test messages and their labels before x_test is
# replaced by its TF-IDF matrix, so they can be inspected by hand later.
x_test_sample = x_test[:7]
y_test_sample = y_test[:7]

x_test_sample

86909     ets intranet consolidation project updates ( w...
184999    article written tamim ansary afghani american ...
4830      kevin attached electricity price discovery res...
1224      cnat calm possible sir original message maggi ...
25840     there are now two house hearnigs tomorrow not ...
6540      elizabeth ,\nhas contract expired ? unable to ...
99725     as a business you have been preapproved to rec...
Name: Message, dtype: object

In [10]:
# TF-IDF over lower-cased text with English stop words removed; the
# vocabulary is capped at 5000 terms.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
    max_features=5000,
)

# Fit the vocabulary on the training split only and reuse it for the test
# split. NOTE: x_train / x_test are rebound here from raw text to TF-IDF
# matrices, so re-run the split cell before re-running this one.
x_train = feature_extraction.fit_transform(x_train)
x_test = feature_extraction.transform(x_test)

# Make sure the label arrays are integer typed.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [11]:
# Display the vectorised feature matrices next to their label arrays.
(x_train, y_train, x_test, y_test)

(<Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 10273791 stored elements and shape (155080, 5000)>,
 array([1, 1, 1, ..., 1, 0, 1], shape=(155080,)),
 <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 2531004 stored elements and shape (38770, 5000)>,
 array([1, 1, 1, ..., 0, 0, 1], shape=(38770,)))

In [12]:
# Logistic regression baseline; the solver is allowed up to 148 iterations.
classifier = LogisticRegression(max_iter=148)

# fit() returns the estimator, which the notebook renders as the cell output.
classifier.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,148


In [13]:
# Predict labels for the whole vectorised test split and show them.
y_pred = classifier.predict(x_test)

y_pred

array([1, 1, 1, ..., 0, 0, 1], shape=(38770,))

In [None]:
# Evaluate the logistic-regression predictions on the test split.
# The scores below use scikit-learn's default positive class (label 1), which
# the label-encoding cell assigns to "ham".
print('Logistic Regression')
print('\tPrecision:', precision_score(y_test, y_pred))
print('\tRecall:', recall_score(y_test, y_pred))
print('\tF1:', f1_score(y_test, y_pred))
print('\tAccuracy:', accuracy_score(y_test, y_pred))

# Rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)

# fmt='d' prints the cell counts as plain integers; seaborn's default
# annotation format ('.2g') would show five-digit counts as e.g. 1.8e+04.
sns.heatmap(cm, cmap='coolwarm', annot=True, fmt='d')
plt.xlabel('Predictions')
plt.ylabel('Ground Truth')
plt.title('confusion matrix')

Logistic Regression
	Precision: 0.975303142786439
	Recall: 0.969926662400945
	F1: 0.9726074724840827
	Accuracy: 0.9713696156822286


array([[17954,   499],
       [  611, 19706]])

In [None]:
# Train a decision tree on the TF-IDF features and evaluate it on the test
# split. random_state is fixed (same seed as the train/test split) so repeated
# runs build the same tree and report the same scores.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(x_train, y_train)
y_pred_dt = dt_clf.predict(x_test)

# The scores below use scikit-learn's default positive class (label 1), which
# the label-encoding cell assigns to "ham".
print('Decision Tree')
print('\tPrecision:', precision_score(y_test, y_pred_dt))
print('\tRecall:', recall_score(y_test, y_pred_dt))
print('\tF1:', f1_score(y_test, y_pred_dt))
print('\tAccuracy:', accuracy_score(y_test, y_pred_dt))

# Rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred_dt)

# fmt='d' prints the cell counts as plain integers; seaborn's default
# annotation format ('.2g') would show five-digit counts as e.g. 1.8e+04.
sns.heatmap(cm, cmap='coolwarm', annot=True, fmt='d')
plt.xlabel('Predictions')
plt.ylabel('Ground Truth')
plt.title('confusion matrix')

Decision Tree
	Precision: 0.9681350954478708
	Recall: 0.9735197125559876
	F1: 0.9708199376641226
	Accuracy: 0.969331957699252


array([[17802,   651],
       [  538, 19779]])

In [None]:
# Train a 20-tree random forest on the TF-IDF features and evaluate it on the
# test split. random_state is fixed (same seed as the train/test split) so the
# bootstrap samples and feature sub-sampling, and therefore the reported
# scores, are repeatable across runs.
rf_clf = RandomForestClassifier(n_estimators=20, random_state=42)
rf_clf.fit(x_train, y_train)
y_pred_rf = rf_clf.predict(x_test)

# The scores below use scikit-learn's default positive class (label 1), which
# the label-encoding cell assigns to "ham".
print('Random Forest')
print('\tPrecision:', precision_score(y_test, y_pred_rf))
print('\tRecall:', recall_score(y_test, y_pred_rf))
print('\tF1:', f1_score(y_test, y_pred_rf))
print('\tAccuracy:', accuracy_score(y_test, y_pred_rf))

# Rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred_rf)

# fmt='d' prints the cell counts as plain integers; seaborn's default
# annotation format ('.2g') would show five-digit counts as e.g. 1.8e+04.
sns.heatmap(cm, cmap='coolwarm', annot=True, fmt='d')
plt.xlabel('Predictions')
plt.ylabel('Ground Truth')
plt.title('confusion matrix')

Random Forest
	Precision: 0.9831182215242675
	Recall: 0.9860215583009303
	F1: 0.9845677495453875
	Accuracy: 0.9838019086922879


array([[18109,   344],
       [  284, 20033]])

In [17]:
# Vectorise the held-back raw messages with the already-fitted TF-IDF model.
x_test_converted = feature_extraction.transform(x_test_sample)
y_test_converted = y_test_sample.astype('int')

# Classify the sample with the random forest.
y_test_pred = rf_clf.predict(x_test_converted)

# Side-by-side table of true and predicted labels for the sample.
comparison = {
    'y_test': y_test_sample,
    'y_pred': y_test_pred,
}
pd.DataFrame(comparison).head(20)

Unnamed: 0,y_test,y_pred
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,0,0


In [18]:
# Model persistence is currently disabled: the joblib calls below sit inside a
# string literal, so nothing is imported or written to disk and the cell only
# echoes the text as its output. Remove the surrounding triple quotes to save
# the fitted random forest and TF-IDF vectoriser.
"""import joblib

joblib.dump(rf_clf,'rf_model.pkl')
joblib.dump(feature_extraction,'tfidf_vectorizer.pkl')"""

"import joblib\n\njoblib.dump(rf_clf,'rf_model.pkl')\njoblib.dump(feature_extraction,'tfidf_vectorizer.pkl')"