In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Here I have collecetd dataset from kaggle and loaded it
raw_mail_data = pd.read_csv('completeSpamAssassin.csv')
raw_mail_data.head()

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


In [3]:
mail_data = raw_mail_data.where(pd.notnull(raw_mail_data),'')

In [4]:
# Printing first 5 row of mail data
mail_data.head()

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,3,##############################################...,1
4,4,I thought you might like these:\n1) Slim Down ...,1


In [5]:
# here i have seperated the text as texts and label
X = mail_data['Body']
Y = mail_data['Label']

In [6]:
X.head()

0    \nSave up to 70% on Life Insurance.\nWhy Spend...
1    1) Fight The Risk of Cancer!\nhttp://www.adcli...
2    1) Fight The Risk of Cancer!\nhttp://www.adcli...
3    ##############################################...
4    I thought you might like these:\n1) Slim Down ...
Name: Body, dtype: object

In [7]:
Y.head()

0    1
1    1
2    1
3    1
4    1
Name: Label, dtype: int64

In [8]:
X_Train,X_test,Y_Train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=3)

In [9]:
X.shape

(6046,)

In [10]:
X_Train.shape

(4836,)

In [11]:
Y_test.shape

(1210,)

In [12]:
# here i have transformed text data to feature vectors that can be used as input to the logistic regression
feature_extraction = TfidfVectorizer(min_df=1,stop_words='english',lowercase='True')

In [13]:
X_train_feature = feature_extraction.fit_transform(X_Train)
X_test_feature = feature_extraction.transform(X_test)

#  now I have convert Y_train and T_test as Integers

Y_Train = Y_Train.astype('int')
Y_test = Y_test.astype('int')

In [14]:
print(X_train_feature)

  (0, 17340)	0.05373084006199439
  (0, 46394)	0.0802723561803126
  (0, 42916)	0.046898076133872243
  (0, 28610)	0.07751842506110863
  (0, 33713)	0.045201709134380805
  (0, 56403)	0.054138255948463616
  (0, 23181)	0.06119916407411781
  (0, 55476)	0.0338746917470082
  (0, 55660)	0.06366731789924956
  (0, 27696)	0.07538231013126052
  (0, 51003)	0.05039655984009047
  (0, 20272)	0.03278623639093958
  (0, 47937)	0.03289872685599931
  (0, 56362)	0.07751842506110863
  (0, 43632)	0.03969384689597285
  (0, 47360)	0.06312015021077792
  (0, 49748)	0.05761228797236997
  (0, 23799)	0.0527822598529593
  (0, 3006)	0.03634874380302305
  (0, 13161)	0.07216131914185978
  (0, 53890)	0.047820237311286054
  (0, 19465)	0.033127752813622284
  (0, 24114)	0.04250933982489906
  (0, 59374)	0.035935921066130284
  (0, 30503)	0.04814848072839124
  :	:
  (4833, 24984)	0.0543798635051402
  (4833, 53867)	0.0894899694892194
  (4833, 56915)	0.06321845963208526
  (4833, 48030)	0.1192881197373573
  (4833, 41090)	0.07795852

In [15]:
X_Train

2738    \nI think what you're looking at with the dual...
2054    http://www.guardian.co.uk/international/story/...
5205    On Mon, 2002-07-22 at 17:19, Joseph S. Barrera...
5577    On Wed, 2002-07-31 at 06:34, John Hinsley wrot...
6009    \nCore Java Technologies Newsletter\nCODE {col...
                              ...                        
968     ReliaQuote - Save Up To 70% On Life Insurance1...
1667                                                empty
3321    HiI've just installed SpamAssassin and relevan...
1688                                                empty
5994                                                empty
Name: Body, Length: 4836, dtype: object

In [16]:
# Logistic Regression
model = LogisticRegression()

In [17]:
Y_Train

2738    0
2054    0
5205    0
5577    0
6009    0
       ..
968     1
1667    1
3321    0
1688    1
5994    0
Name: Label, Length: 4836, dtype: int32

In [18]:
model.fit(X_train_feature,Y_Train)

In [19]:
# Here I am evaluating the Trained Model
# And now predition on Training Model
prediction_on_Training_Data = model.predict(X_train_feature)
accuracy_on_training_data = accuracy_score(Y_Train,prediction_on_Training_Data)

In [20]:
print("Accuracy for Training : ",accuracy_on_training_data * 100)

Accuracy for Training :  94.04466501240695


In [21]:
# Predict on Test Data
prediction_on_Test_Data = model.predict(X_test_feature)
accuracy_on_test_data = accuracy_score(Y_test,prediction_on_Test_Data)

In [22]:
print("Accuracy for Training : ",accuracy_on_test_data * 100)

Accuracy for Training :  92.14876033057851


In [23]:
#  Now building a Predictable System
input_mail = input(f"Enter the message to be predicted")

# Now converting Text to feature vectors
input_data_feature = feature_extraction.transform([input_mail])

# Now making Prediction
prediction = model.predict(input_data_feature)

print(prediction)

if(prediction == [1]):
    print("This is the Ham Mail.")
else:
    print("This is the Spam Mail.")

Enter the message to be predictedAap jaise entertainment lover ke liye 16+ OTT ka pack! Anand lein live TV,movies,web series aur adhik SonyLIV,Eros now,LionsgatePlay par,aur bhi bahut kuch Airtel Xstream App par. Aaj hi download karein
[0]
This is the Spam Mail.
