<a href="https://colab.research.google.com/github/XeroArk/spam_detect_ML/blob/main/spam_ham_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Pre-processing

In [2]:
# Load the spam/ham e-mail dataset (downloaded from Kaggle) into a DataFrame.
dataset_path = '/content/spam_ham_dataset.csv'
raw_mail_data = pd.read_csv(dataset_path)

In [3]:
# Handle missing values: keep every non-null cell and put an empty string
# wherever the raw data holds a null.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [4]:
# Keep only the label and text columns. This discards the CSV's first column
# (presumably a saved row index -- TODO confirm) and `label_num`, exactly as the
# previous positional/in-place drops did, but selecting by name makes the cell
# safe to re-run: dropping `columns[0]` in place removed a further column (and
# dropping `label_num` raised KeyError) on a second execution.
# `.copy()` gives an independent frame so later in-place edits do not warn.
mail_data = mail_data.loc[:, ['label', 'text']].copy()
print(mail_data)

     label                                               text
0      ham  Subject: enron methanol ; meter # : 988291\r\n...
1      ham  Subject: hpl nom for january 9 , 2001\r\n( see...
2      ham  Subject: neon retreat\r\nho ho ho , we ' re ar...
3     spam  Subject: photoshop , windows , office . cheap ...
4      ham  Subject: re : indian springs\r\nthis deal is t...
...    ...                                                ...
5166   ham  Subject: put the 10 on the ft\r\nthe transport...
5167   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168   ham  Subject: calpine daily gas nomination\r\n>\r\n...
5169   ham  Subject: industrial worksheets for august 2000...
5170  spam  Subject: important online banking alert\r\ndea...

[5171 rows x 2 columns]


Encoding labels

In [7]:
# Encode the labels numerically: spam -> 0, ham -> 1.
label_encoding = {'spam': 0, 'ham': 1}
mail_data.replace({'label': label_encoding}, inplace=True)

In [8]:
# Split the frame into model inputs (message text) and targets (encoded label).
X, Y = mail_data['text'], mail_data['label']

In [9]:
# Preview the message texts that will be used as model input.
print(X)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [10]:
# Preview the encoded labels (0 = spam, 1 = ham).
print(Y)

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: int64


Splitting the data into training and test sets

In [11]:
# Hold out 10% of the messages for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=2,
)

In [12]:
# Report the sizes of the full dataset, the training split and the test split.
for texts in (X, X_train, X_test):
    print(texts.shape)

(5171,)
(4653,)
(518,)


In [13]:
# Convert the raw text into TF-IDF feature vectors that can be used as input to
# the logistic regression.
# `lowercase` must be a real boolean: the string 'True' only worked because it
# is truthy, and newer scikit-learn releases reject it during parameter
# validation.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vectorizer on the training texts only, then apply that same fitted
# transformation to the test texts.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert the Y_train and Y_test values to integers.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [14]:
# Preview the raw training texts selected by the split.
print(X_train)

4113    Subject: meter 5961 , dunagan , j . a . # 1\r\...
1894    Subject: tenaska gas management agreement\r\nd...
688     Subject: online shopping . get it quickly and ...
887     Subject: follow up 1 / 2 day off - site\r\nple...
537     Subject: 52 - quick loan application\r\nhey\r\...
                              ...                        
3335    Subject: \r\nto _ cc _ default _ handler\r\nsu...
1099    Subject: s 709101 - 04 / 03 / 01\r\ndaren , bp...
2514    Subject: viagra _ cialis _ levitra _ ambien _ ...
3606    Subject: panenergy marketing march 2000 produc...
2575    Subject: important information about united he...
Name: text, Length: 4653, dtype: object


In [15]:
# Print the TF-IDF training matrix; each output line reads
# (row, column) -> weight.
print(X_train_features)

  (0, 11364)	0.2459282173456063
  (0, 42384)	0.09830280433722623
  (0, 24356)	0.13258909300609165
  (0, 26462)	0.10908338498243615
  (0, 27294)	0.11762971248040689
  (0, 0)	0.1249558272522672
  (0, 417)	0.12783571264713556
  (0, 19674)	0.15137219676415942
  (0, 14587)	0.2647358968122648
  (0, 3336)	0.27378895377877893
  (0, 31305)	0.13843758412831808
  (0, 1138)	0.24200858502434347
  (0, 15800)	0.3596352879431819
  (0, 24360)	0.23056043333587414
  (0, 14482)	0.11228300884723683
  (0, 16690)	0.3771052569185167
  (0, 2783)	0.3596352879431819
  (0, 29377)	0.3746818984730705
  (0, 41166)	0.043086244971477235
  (1, 19154)	0.281374161286773
  (1, 24506)	0.24469349680022326
  (1, 30103)	0.1307657299834488
  (1, 34921)	0.32393810628690406
  (1, 23251)	0.19396044422238184
  (1, 36498)	0.21543563660130377
  :	:
  (4652, 32102)	0.042482202428627686
  (4652, 4856)	0.03104585706634563
  (4652, 24096)	0.02916373947396463
  (4652, 7160)	0.03323636115480906
  (4652, 38961)	0.052553837670421766
  (4652

Logistic Regression

In [16]:
# Logistic regression classifier created with scikit-learn's default settings.
model = LogisticRegression()

In [18]:
# Train the logistic regression model on the TF-IDF features of the training
# split and their integer labels.
model.fit(X_train_features, Y_train)

LogisticRegression()

In [19]:
# Predict on the data the model was trained on and score those predictions
# against the true training labels.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [20]:
# Report the fraction of training messages classified correctly.
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9961315280464217


In [21]:
# Predict on the held-out test features and score those predictions against
# the true test labels.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [22]:
# Report the fraction of held-out test messages classified correctly.
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9864864864864865


In [27]:
# Classify a single hand-written message with the trained model.
input_mail = ["Important meeting"]

# Vectorise the message with the already-fitted TF-IDF vectorizer
# (transform only, no refit).
input_data_features = feature_extraction.transform(input_mail)

# Predict the label and show the raw prediction array.
prediction = model.predict(input_data_features)
print(prediction)

# Label encoding: 1 means ham; any other value is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[1]
Ham mail
