Importing Packages & Dependencies

In [None]:
import email
import getpass
import imaplib
import os
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


Loading our data into a pandas DataFrame

In [None]:

# Location of the labelled mail dataset (CSV with "Category" and "Message" columns).
# Set the MAIL_DATA_CSV environment variable to point at your own copy of the file;
# when it is unset, the original author's local path is used so existing runs
# behave exactly as before.
MAIL_DATA_CSV = os.environ.get("MAIL_DATA_CSV", r"C:\\Users\\noaha\\Downloads\\mail_data.csv")

# Keep the untouched frame under its own name so later cleaning steps never
# overwrite the raw input.
raw_mail_data = pd.read_csv(MAIL_DATA_CSV)

# pandas truncates the printed frame, so this only shows the first/last rows.
print(raw_mail_data)


     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [None]:
#Cleaning the Data 

In [None]:
# Build the working copy: cells that are missing (NaN/None) become empty strings,
# every other cell is kept as-is. raw_mail_data itself is not modified.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')
print(mail_data.shape)

(5572, 2)


In [127]:
#Change spam to 0 and ham to 1 - Ham is real mail
#SPAM = 0
#REAL = 1

In [None]:
# Encode the labels in place as numbers: spam -> 0, ham (real mail) -> 1.
# Rows whose Category matches neither label are left unchanged.
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code
print(mail_data)

     Category                                            Message
0           1  Go until jurong point, crazy.. Available only ...
1           1                      Ok lar... Joking wif u oni...
2           0  Free entry in 2 a wkly comp to win FA Cup fina...
3           1  U dun say so early hor... U c already then say...
4           1  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567        0  This is the 2nd time we have tried 2 contact u...
5568        1               Will ü b going to esplanade fr home?
5569        1  Pity, * was in mood for that. So...any other s...
5570        1  The guy did some bitching but I acted like i'd...
5571        1                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [None]:
# Model input (the message text) and target (the numeric label) as separate Series.
X, Y = mail_data['Message'], mail_data['Category']

In [130]:
# Quick look at the message texts that will be fed to the vectorizer.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [131]:
#Splitting the data into training data & test data

In [None]:
# Hold out 20% of the messages for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and therefore the split, reproducible
# from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Sanity check: the train and test sizes should add up to the full dataset size.
for split in (X_train, X_test, X):
    print(split.shape)

(4457,)
(1115,)
(5572,)


In [134]:
#Feature Extraction - transform the text data into feature vectors that can be used as input to the logistic regression model

In [None]:
# TF-IDF turns each message into a sparse vector of term weights: a term scores
# higher the more often it occurs in a message and the rarer it is across all
# messages, which lets the model key on words that are characteristic of spam.
#   min_df=1             -> keep every term that appears in at least one message
#   stop_words='english' -> drop very common English words ("the", "it", "where", ...)
#   lowercase=True       -> lowercase the text before tokenizing
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# Cast the labels to an integer dtype for the classifier.
Y_train_features = Y_train.astype('int')
Y_test_features = Y_test.astype('int')

# Learn the vocabulary and IDF weights from the training messages only, then
# apply that same fitted transformation to the test messages. The test set is
# never used for fitting, so nothing is learned from it.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [None]:
# Sparse-matrix printout: one "(row, column)  weight" line per stored TF-IDF value.
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [137]:
#Training the model

Logistic Regression

In [None]:
# Logistic regression classifier created with scikit-learn's default settings;
# it is trained in the next cell.
model = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF vectors and integer labels of the training split.
model.fit(X=X_train_features, y=Y_train_features)

Evaluating the model

Prediction on training data

In [None]:
# Score the model on the same messages it was trained on. This is an optimistic
# number; compare it with the test accuracy to spot overfitting.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train_features,
    y_pred=prediction_on_training_data,
)

In [None]:
# Fraction of training messages classified correctly.
print('Accuracy on training data: ', accuracy_on_training_data)

Accuracy on training data:  0.9676912721561588


In [None]:
# Predict labels for the held-out messages, which the model never saw in training.
prediction_on_test_data = model.predict(X_test_features)

# Compare those predictions with the true test labels.
accuracy_on_test_data = accuracy_score(
    y_true=Y_test_features,
    y_pred=prediction_on_test_data,
)

In [None]:
# Fraction of held-out test messages classified correctly.
print('Accuracy on test data: ', accuracy_on_test_data)

Accuracy on test data:  0.9668161434977578


In [None]:
# Serialization of the trained artifacts.
# The classifier only understands TF-IDF vectors produced by the *fitted*
# vectorizer, so both objects are saved: a saved model on its own cannot be
# used to classify raw text later.
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code from an untrusted file.
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(feature_extraction, file)

In [None]:
# IMAP server settings for Gmail
IMAP_SERVER = 'imap.gmail.com'
IMAP_PORT = 993

# Credentials are never stored in the notebook: they are read from the
# EMAIL_ADDRESS / EMAIL_PASSWORD environment variables, and prompted for when
# those are not set. (For Google you'll need two-step verification and then an
# app password / secret key to use as the password.)
EMAIL_ADDRESS = os.environ.get('EMAIL_ADDRESS') or input('Email address: ')
PASSWORD = os.environ.get('EMAIL_PASSWORD') or getpass.getpass('Email app password: ')

# Connect to the IMAP server over SSL
imap_conn = imaplib.IMAP4_SSL(IMAP_SERVER, IMAP_PORT)

# Login to the server (raises imaplib.IMAP4.error when the credentials are rejected)
imap_conn.login(EMAIL_ADDRESS, PASSWORD)

# Select the mailbox (e.g., INBOX) and stop early if the server refuses it
status, select_data = imap_conn.select('INBOX')
if status != 'OK':
    raise RuntimeError(f'Could not select INBOX: {select_data!r}')

# Example: fetch the message with sequence number 1 and print its raw bytes.
# Both lines stay commented out together - printing without the fetch would
# fail because message_data would not exist yet.
# status, message_data = imap_conn.fetch(b'1', '(RFC822)')
# print(message_data[0][1])

# Logout and close the connection once you are completely done with the mailbox
# imap_conn.logout()


In [None]:
def _extract_text_body(message):
    """Return the readable text of a parsed email as a str.

    Walks every part of ``message`` (a non-multipart message is its own only
    part), skips attachments and non-text parts, and decodes the payload with
    the charset the part declares (UTF-8 when none or an unknown one is
    declared; undecodable bytes are replaced rather than raising).

    The first ``text/plain`` part wins; if there is none, the first other
    ``text/*`` part (e.g. HTML) is returned; if there is no text at all, ``''``.
    """
    fallback = ''
    for part in message.walk():
        if part.get_content_maintype() != 'text':
            continue
        if part.get_content_disposition() == 'attachment':
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        charset = part.get_content_charset() or 'utf-8'
        try:
            text = payload.decode(charset, errors='replace')
        except LookupError:
            # The message declared a charset Python does not know.
            text = payload.decode('utf-8', errors='replace')
        if part.get_content_subtype() == 'plain':
            return text
        fallback = fallback or text
    return fallback


status, message_data = imap_conn.search(None, 'UNSEEN')  # Fetch only unread emails
if status != 'OK':
    raise RuntimeError(f'IMAP search failed: {message_data!r}')
message_ids = message_data[0].split()

for message_id in message_ids:
    # Fetch the whole message (headers + body): the Subject header printed
    # below is not part of BODY[TEXT].
    status, fetch_data = imap_conn.fetch(message_id, '(RFC822)')
    if status != 'OK' or not fetch_data or not isinstance(fetch_data[0], tuple):
        print(f'Skipping message {message_id!r}: fetch failed')
        continue

    # Parse from bytes so messages that are not valid UTF-8 do not crash the loop.
    raw_email = fetch_data[0][1]
    email_message = email.message_from_bytes(raw_email)
    email_body = _extract_text_body(email_message)

    # Preprocess the email body (e.g., remove HTML tags, convert to lowercase, etc.)
    #...

    # Extract the features with the SAME TfidfVectorizer instance that was
    # fitted during training. A new, unfitted vectorizer has no vocabulary, so
    # transform() would fail and the fitted one would be overwritten.
    email_features = feature_extraction.transform([email_body])

    # Predict the category (spam or ham) using the trained model
    prediction = model.predict(email_features)

    # Print the prediction and the email subject
    print(f"Prediction: {'Spam' if prediction[0] == 0 else 'Ham'}")
    print(f"Subject: {email_message['Subject']}")