In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# TfidfVectorizer converts raw text into numeric feature vectors that an ML model can consume
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read the spam_assassin CSV into a pandas DataFrame
raw_mail_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/spammaildataset/spam_assassin.csv'
)

In [3]:
# Print the plain-text representation of the raw frame for a first look at the data
print(raw_mail_data)

                                                   text  target
0     From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...       0
1     From gort44@excite.com Mon Jun 24 17:54:21 200...       1
2     From fork-admin@xent.com Mon Jul 29 11:39:57 2...       1
3     From dcm123@btamail.net.cn Mon Jun 24 17:49:23...       1
4     From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...       0
...                                                 ...     ...
5791  From ilug-admin@linux.ie Mon Jul 22 18:12:45 2...       0
5792  From fork-admin@xent.com Mon Oct 7 20:37:02 20...       0
5793  Received: from hq.pro-ns.net (localhost [127.0...       1
5794  From razor-users-admin@lists.sourceforge.net T...       0
5795  From rssfeeds@jmason.org Mon Sep 30 13:44:10 2...       0

[5796 rows x 2 columns]


In [4]:
# Build a cleaned frame in which every missing (null) cell becomes an empty string.
# DataFrame.where keeps a value wherever the condition is True and substitutes
# `other` wherever it is False, so the not-null mask preserves the real values and
# only the nulls are replaced by ''.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [5]:
# Preview the first rows of the cleaned frame
mail_data.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [6]:
# (rows, columns) of the cleaned frame
mail_data.shape

(5796, 2)

In [7]:
# If the target column held string labels instead of 0s and 1s, it could be converted first, e.g.
# mail_data.loc[mail_data['target'] == 'spam', 'target'] = 1   # 1 = spam, as the prediction cells assume
# and similarly map the not-spam label to 0

In [8]:
# Features are the raw e-mail text; labels come from the 'target' column
X, Y = mail_data['text'], mail_data['target']

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [10]:
# Number of e-mails that ended up in the training split
print(len(X_train))

4636


In [11]:
# Turn the raw e-mail text into TF-IDF feature vectors usable as Logistic Regression input.
# The vectoriser is fitted on the training split only; the test split is merely
# transformed with that already-fitted vocabulary.
feature_extraction = TfidfVectorizer(lowercase=True, stop_words=None)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast the labels to integers in case they arrived as object or some other dtype
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [12]:
# Print the TF-IDF training matrix; each printed entry is a (row, column) pair followed by its weight
print(X_train_features)

  (0, 54077)	0.05776929993053345
  (0, 54457)	0.053192124624165144
  (0, 43290)	0.04225562533590509
  (0, 66911)	0.03226900509367696
  (0, 109370)	0.06840780262273084
  (0, 109855)	0.04133166801394623
  (0, 91055)	0.054326077427411745
  (0, 40086)	0.0435160127066486
  (0, 53714)	0.025164549637550573
  (0, 34688)	0.05355354221389247
  (0, 83007)	0.01957026422527364
  (0, 51324)	0.05284546295749231
  (0, 43340)	0.03952472704382388
  (0, 46851)	0.017825145348726243
  (0, 98007)	0.02470746900978751
  (0, 112700)	0.012436927554659508
  (0, 94479)	0.05563512986876542
  (0, 104213)	0.032991498727026586
  (0, 99539)	0.0521919073656437
  (0, 103880)	0.043630001419223984
  (0, 75934)	0.04974890212104342
  (0, 82998)	0.0281878558242218
  (0, 30980)	0.034493999888394286
  (0, 79647)	0.031142584026251075
  (0, 109602)	0.02633772667488059
  :	:
  (4635, 70900)	0.014959235899565911
  (4635, 102961)	0.020713258478242142
  (4635, 102898)	0.023931714591802978
  (4635, 81755)	0.03664141188543058
  (4635,

In [13]:
# Create the classifier; no arguments are passed, so scikit-learn's default settings apply
model = LogisticRegression()

In [14]:
# Train the classifier on the TF-IDF training features and the integer labels
model.fit(X_train_features, Y_train)

In [15]:
# Score the model on the same data it was trained on
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)


In [16]:
# Report the fraction of training e-mails that were classified correctly
print("Accuracy on training data: ",accuracy_on_training_data)

Accuracy on training data:  0.9846850733390854


In [17]:
# Score the model on the held-out test split
prediction_on_testing_data = model.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_testing_data,
)


In [18]:
# Report the fraction of held-out test e-mails that were classified correctly
print("Accuracy on testing data: ",accuracy_on_testing_data)

Accuracy on testing data:  0.978448275862069


In [19]:
# One raw e-mail (headers plus the opening of the body) wrapped in a single-element list;
# this list is what gets passed to feature_extraction.transform() later in this cell.
input_mail = ["From rpm-list-admin@freshrpms.net Thu Jul 25 11:07:39 2002 Return-Path: <rpm-zzzlist-admin@freshrpms.net> Delivered-To: yyyy@localhost.netnoteinc.com Received: from localhost (localhost [127.0.0.1]) by phobos.labs.netnoteinc.com (Postfix) with ESMTP id EEA6E440DE for <jm@localhost>; Thu, 25 Jul 2002 06:06:57 -0400 (EDT) Received: from phobos [127.0.0.1] by localhost with IMAP (fetchmail-5.9.0) for jm@localhost (single-drop); Thu, 25 Jul 2002 11:06:57 +0100 (IST) Received: from egwn.net (ns2.egwn.net [193.172.5.4]) by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g6P9DX410161 for <jm-rpm@jmason.org>; Thu, 25 Jul 2002 10:13:33 +0100 Received: from auth02.nl.egwn.net (localhost [127.0.0.1]) by egwn.net (8.11.6/8.11.6/EGWN) with ESMTP id g6P982C17538; Thu, 25 Jul 2002 11:08:02 +0200 Received: from python (gw01.es3.egwn.net [212.9.66.13]) (authenticated) by egwn.net (8.11.6/8.11.6/EGWN) with ESMTP id g6P97FC17276 for <rpm-list@freshrpms.net>; Thu, 25 Jul 2002 11:07:16 +0200 From: Matthias Saou <matthias@egwn.net> To: rpm-zzzlist@freshrpms.net Subject: Sylpheed with GNOME 2 (was Re: Ximian apt repos?) Message-Id: <20020725110700.3797c42e.matthias@egwn.net> In-Reply-To: <1027579516.15921.18.camel@localhost.localdomain> References: <20020724.FRs.59646600@www.dudex.net> <1027537563.18947.12.camel@fuggles> <1027579516.15921.18.camel@localhost.localdomain> Organization: Electronic Group Interactive X-Mailer: Sylpheed version 0.8.0claws (GTK+ 1.2.10; i386-redhat-linux) Reply-BY: Tue, 24 Jul 2000 19:02:00 +1000 X-Operating-System: GNU/Linux power! X-Message-Flag: Try using a real operating system : GNU/Linux power! 
MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit X-Mailscanner: Found to be clean, Found to be clean Sender: rpm-zzzlist-admin@freshrpms.net Errors-To: rpm-zzzlist-admin@freshrpms.net X-Beenthere: rpm-zzzlist@freshrpms.net X-Mailman-Version: 2.0.11 Precedence: bulk Reply-To: rpm-zzzlist@freshrpms.net List-Help: <mailto:rpm-zzzlist-request@freshrpms.net?subject=help> List-Post: <mailto:rpm-zzzlist@freshrpms.net> List-Subscribe: <http://lists.freshrpms.net/mailman/listinfo/rpm-zzzlist>, <mailto:rpm-list-request@freshrpms.net?subject=subscribe> List-Id: Freshrpms RPM discussion list <rpm-zzzlist.freshrpms.net> List-Unsubscribe: <http://lists.freshrpms.net/mailman/listinfo/rpm-zzzlist>, <mailto:rpm-list-request@freshrpms.net?subject=unsubscribe> List-Archive: <http://lists.freshrpms.net/pipermail/rpm-zzzlist/> X-Original-Date: Thu, 25 Jul 2002 11:07:00 +0200 Date: Thu, 25 Jul 2002 11:07:00 +0200 Once upon a time, Lance wrote : > Well, from the looks of things, I can import all my mail settings from > Evolution to Sylpheed. Has anyone successfully run Sylpheed in Gnome > 2.0? I noticed with 'apt-get install gnome-session'"]

# Vectorise the mail with the TF-IDF vocabulary that was fitted on the training split
input_data_feature = feature_extraction.transform(input_mail)

# Classify it; a predicted label of 1 is reported as spam, anything else as ham
prediction = model.predict(input_data_feature)

print("Spam Mail" if prediction[0] == 1 else "Ham mail")


Ham mail


In [20]:
import pickle

In [21]:
# Persist everything needed to score new mail in a fresh session.
# The classifier alone is not enough: it only understands feature vectors produced by
# the exact TF-IDF vocabulary fitted above, so the fitted vectoriser is saved next to it.
filename = 'trained_model.sav'
vectorizer_filename = 'tfidf_vectorizer.sav'

# Trained LogisticRegression classifier (same file name and contents as before)
with open(filename, 'wb') as file:
    pickle.dump(model, file)

# Fitted TfidfVectorizer that must be used to transform any text fed to the saved model
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(feature_extraction, file)

In [22]:
# Read the pickled classifier back from disk.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file, so only
# load pickle files that come from a trusted source (here: the file this notebook wrote).
with open(filename, 'rb') as file:
    loaded_model = pickle.load(file)

In [23]:
# Re-check the sample mail with the model that was reloaded from disk.
# `input_mail` is reused from the earlier prediction cell instead of pasting the same
# multi-kilobyte literal a second time, so both cells are guaranteed to classify the
# exact same message and cannot drift apart.

# Converting mail to feature vector. The in-memory fitted vectoriser is still required:
# the unpickled model only understands features produced by that exact vocabulary.
input_data_feature = feature_extraction.transform(input_mail)

# Predicting with the unpickled model
prediction = loaded_model.predict(input_data_feature)

if prediction[0] == 1:
    print("Spam Mail")
else:
    print("Ham mail")


Ham mail
