In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

#### Data Collection and Labelling

In [5]:
# Load the combined e-mail dataset into a DataFrame.
# NOTE(review): this absolute path presumably only resolves inside a Kaggle
# kernel with the dataset attached — adjust it when running elsewhere.
df = pd.read_csv("/kaggle/input/email-spam-classification-dataset/combined_data.csv")

In [6]:
df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [7]:
# Derive a readable 'category' column from the numeric label (1 -> 'spam',
# anything else -> 'ham') and drop the original column. np.where does the
# mapping in one vectorised pass instead of a Python-level loop over every row.
# The guard keeps the cell re-runnable: once 'label' has been removed, running
# the cell again is a no-op instead of raising KeyError.
if 'label' in df.columns:
    df = df.assign(
        category=np.where(df['label'] == 1, 'spam', 'ham')
    ).drop(columns=['label'])

df.head()

Unnamed: 0,text,category
0,ounce feather bowl hummingbird opec moment ala...,spam
1,wulvob get your medircations online qnb ikud v...,spam
2,computer connection from cnn com wednesday es...,ham
3,university degree obtain a prosperous future m...,spam
4,thanks for all your answers guys i know i shou...,ham


In [9]:
df.shape

(83448, 2)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      83448 non-null  object
 1   category  83448 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [12]:
# Work on a copy in which every missing value is replaced by an empty string.
mail_df = df.fillna('')

In [13]:
mail_df.head()

Unnamed: 0,text,category
0,ounce feather bowl hummingbird opec moment ala...,spam
1,wulvob get your medircations online qnb ikud v...,spam
2,computer connection from cnn com wednesday es...,ham
3,university degree obtain a prosperous future m...,spam
4,thanks for all your answers guys i know i shou...,ham


Label Encoding<br>
spam = 0<br>
ham = 1<br>
(Note: this is the reverse of the raw `label` column, where 1 meant spam.)

In [14]:
# Encode the category names in place: spam -> 0, ham -> 1.
is_spam = mail_df['category'] == 'spam'
is_ham = mail_df['category'] == 'ham'
mail_df.loc[is_spam, 'category'] = 0
mail_df.loc[is_ham, 'category'] = 1

mail_df.head()

Unnamed: 0,text,category
0,ounce feather bowl hummingbird opec moment ala...,0
1,wulvob get your medircations online qnb ikud v...,0
2,computer connection from cnn com wednesday es...,1
3,university degree obtain a prosperous future m...,0
4,thanks for all your answers guys i know i shou...,1


In [15]:
# Model inputs are the message texts; targets are the encoded categories.
messages, labels = mail_df['text'], mail_df['category']

#### Training Data and Feature Extraction

In [17]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

In [20]:
# TF-IDF vectorizer: lowercases the text and drops scikit-learn's built-in
# English stop words; min_df=1 keeps any term that occurs in at least one
# document.
f_extractor = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [21]:
# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages so nothing is learned from them.
x_train_feature = f_extractor.fit_transform(x_train)
x_test_feature = f_extractor.transform(x_test)

In [23]:
# Cast the targets to integers before fitting.
# NOTE(review): the 'category' column was object dtype before the 0/1 values
# were written into it, so it is presumably still object here — confirm.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [24]:
# Print the sparse TF-IDF training matrix (shape plus a sample of its entries).
print(x_train_feature)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6003379 stored elements and shape (66758, 275615)>
  Coords	Values
  (0, 80370)	0.04308073993602593
  (0, 109629)	0.0599984260100465
  (0, 152327)	0.16106675209782118
  (0, 251026)	0.0379486364163786
  (0, 111900)	0.0278103876422694
  (0, 169859)	0.06403640034404481
  (0, 102656)	0.10744769357930192
  (0, 63086)	0.09393985736727985
  (0, 66037)	0.08746678158544523
  (0, 201148)	0.07526380635616738
  (0, 226072)	0.08427799690865131
  (0, 155632)	0.12892015403832685
  (0, 224811)	0.05511589774709196
  (0, 224821)	0.04237771049651244
  (0, 16955)	0.06831275580350303
  (0, 253947)	0.054664984750073586
  (0, 105353)	0.07104833863773659
  (0, 105882)	0.08416530838036419
  (0, 151407)	0.06278484001541294
  (0, 128951)	0.11413168926981787
  (0, 205609)	0.05261826823489937
  (0, 65971)	0.10886728523850649
  (0, 14850)	0.09464479893428995
  (0, 168169)	0.09389687790913874
  (0, 230866)	0.08649208447295934
  :	:
  (66757, 162738)	0.079

#### Model Training & Evaluation

In [25]:
# Logistic regression classifier created with no arguments, i.e. the library defaults.
model = LogisticRegression()

In [26]:
# Fit the classifier on the TF-IDF training features and the integer labels.
model.fit(x_train_feature, y_train)

In [27]:
# Predict labels for the held-out test features.
y_pred = model.predict(x_test_feature)

In [29]:
# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels
# first. The value is the same either way for accuracy, but the correct order
# matters as soon as an asymmetric metric is substituted.
print('Accuracy Score : ', accuracy_score(y_test, y_pred))

Accuracy Score :  0.9846015578190533


#### Predictive System

In [33]:
def check_if_mail_is_spam(input_data):
    """Classify e-mail text and print a verdict for each message.

    Relies on the notebook-level ``f_extractor`` (used to transform the text)
    and ``model`` (used to predict), so both must already be defined.

    Parameters
    ----------
    input_data : str or iterable of str
        One message, or a collection of messages. A bare string is treated as
        a single message rather than being handed to the vectorizer directly.

    Returns
    -------
    None
        Prints "Email is Ham" when the predicted label is 1 and
        "Email is Spam" otherwise, one line per message in input order.
        Prints nothing for empty input.
    """
    # Normalise to a list so a bare string counts as exactly one document.
    if isinstance(input_data, str):
        documents = [input_data]
    else:
        documents = list(input_data)

    # Nothing to classify: return quietly instead of failing downstream.
    if not documents:
        return

    input_data_feature = f_extractor.transform(documents)

    # Report every prediction, not just the first one.
    for prediction in model.predict(input_data_feature):
        if prediction == 1:
            print("Email is Ham")
        else:
            print("Email is Spam")

In [34]:
# Sanity check with a short conversational message.
input_data_ham = ['Hi, How are you doing?']
check_if_mail_is_spam(input_data_ham)

Email is Ham


In [35]:
# Sanity check with a "free" / lottery-style promotional message.
input_data_spam = ['Free, Free, Free, You have won a lottery']
check_if_mail_is_spam(input_data_spam)

Email is Spam


In [36]:
# Nonsense input.
# NOTE(review): these tokens are presumably absent from the fitted vocabulary,
# so the verdict likely reflects the model's bias rather than the text — confirm.
input_data = ['aisoadhlkada JSLAIDHNALD adhahndfa']
check_if_mail_is_spam(input_data)

Email is Spam


In [37]:
# Mixed input: a conversational greeting followed by a lottery mention.
input_data = ['Hi, how are you? are you winning the lottery']
check_if_mail_is_spam(input_data)

Email is Spam


In [39]:
# Conversational greeting with a trailing "Please".
input_data = ['Hi, how are you? Please']
check_if_mail_is_spam(input_data)

Email is Ham
