## Create a model to check spam mails
I am going to use RandomForest based on the results below; if you find a better model, use it instead.

In [100]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [102]:
# Read the dataset relative to the notebook's working directory (the same
# 'new.csv' file name the next cell loads) instead of a user-specific
# absolute path, so the notebook runs on any machine.
df = pd.read_csv("new.csv")
df

Unnamed: 0,Category,Message
0,ham,Go until jurong point crazy.. Available only i...
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,Nah I dont think he goes to usf he lives aroun...
...,...,...
16435,ham,re : research and development charges to gpg ...
16436,ham,re : receipts from visit jim thanks again f...
16437,ham,re : enron case study update wow ! all on the...
16438,ham,re : interest david please call shirley cr...


In [103]:
# Load the CSV file from the current working directory.
# `data` is the frame the following cells inspect, encode and split.
data = pd.read_csv('new.csv')

In [104]:
# Missing-value count per column
data.isna().sum()

Category    0
Message     0
dtype: int64

In [105]:
# Inspect the dtype of each column
data.dtypes

Category    object
Message     object
dtype: object

In [106]:
# Encode the 'Category' labels in place: 'spam' becomes 0, 'ham' becomes 1
for label, code in (('spam', 0), ('ham', 1)):
    data.loc[data['Category'] == label, 'Category'] = code

In [107]:
# Class balance after encoding (0 = spam, 1 = ham)
data['Category'].value_counts()

Category
1    12846
0     3594
Name: count, dtype: int64

In [108]:
# Features are the message text; targets are the encoded category
X, Y = data['Message'], data['Category']

In [109]:
# Preview the feature series (message text)
X

0        Go until jurong point crazy.. Available only i...
1                            Ok lar... Joking wif u oni...
2        Free entry in 2 a wkly comp to win FA Cup fina...
3        U dun say so early hor... U c already then say...
4        Nah I dont think he goes to usf he lives aroun...
                               ...                        
16435    re : research and development charges to gpg  ...
16436    re : receipts from visit  jim   thanks again f...
16437    re : enron case study update  wow ! all on the...
16438    re : interest  david   please  call shirley cr...
16439    news : aurora 5 . 2 update  aurora version 5 ....
Name: Message, Length: 16440, dtype: object

In [110]:
# Preview the label series (0 = spam, 1 = ham)
Y

0        1
1        1
2        0
3        1
4        1
        ..
16435    1
16436    1
16437    1
16438    1
16439    1
Name: Category, Length: 16440, dtype: object

In [111]:
# Hold out 20% of the messages for testing; random_state pins the shuffle
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [112]:
# Shapes of the full set, the training split and the test split, in that order
for subset in (X, X_train, X_test):
    print(subset.shape)

(16440,)
(13152,)
(3288,)


In [113]:
# `lowercase` expects a bool. The original passed the string 'True', which is
# only truthy by accident and is rejected by scikit-learn releases that
# validate estimator parameters.
# NOTE: the next cell rebinds `feature_extraction` to
# TfidfVectorizer(lowercase=True), so this configuration (including the
# English stop-word list) is not the one that actually gets fitted.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [114]:
# Rebinds `feature_extraction`, replacing the vectorizer created in the
# previous cell; this instance (no stop-word list) is the one used from here on.
feature_extraction = TfidfVectorizer(lowercase=True)

# Fit the vocabulary/IDF weights on the training messages only, then reuse
# the fitted vectorizer to transform the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [115]:
# Cast both label series to int before fitting (the 'Category' column is
# reported as dtype object earlier in the notebook)
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

### LOGISTIC REGRESSION

In [116]:
# Train logistic regression on the TF-IDF training features
lr = LogisticRegression().fit(X_train_features, Y_train)

# Predictions on both splits
lr_train, lr_test = lr.predict(X_train_features), lr.predict(X_test_features)

# Accuracy on train vs. test shows how far the model overfits
lr_train_acc = accuracy_score(Y_train, lr_train)
lr_test_acc = accuracy_score(Y_test, lr_test)

# NOTE: these scores use scikit-learn's default pos_label=1, and this notebook
# encodes ham as 1 / spam as 0, so they describe ham detection; pass
# pos_label=0 to score the spam class instead.
lr_precision, lr_recall, lr_f1 = (
    metric(Y_test, lr_test) for metric in (precision_score, recall_score, f1_score)
)

print("Logistic Regression:\n")
for caption, value in (
    ("Training Data Accuracy:", lr_train_acc),
    ("Testing Data Accuracy :", lr_test_acc),
    ("Precision             :", lr_precision),
    ("Recall                :", lr_recall),
    ("F1 Score              :", lr_f1),
):
    print(caption, value)

Logistic Regression:

Training Data Accuracy: 0.9704987834549879
Testing Data Accuracy : 0.9540754257907542
Precision             : 0.9496644295302014
Recall                : 0.9937573156457277
F1 Score              : 0.971210676835081


### DECISION TREES

In [117]:
# Seed the tree so repeated runs give the same model and scores
# (same seed value as the train/test split above).
dtrees = DecisionTreeClassifier(random_state=3)
dtrees.fit(X_train_features, Y_train)


# Predictions on both splits
dt_train = dtrees.predict(X_train_features)
dt_test = dtrees.predict(X_test_features)


# Accuracy on train vs. test shows how far the model overfits
dt_train_acc = accuracy_score(Y_train, dt_train)
dt_test_acc = accuracy_score(Y_test, dt_test)


# NOTE: these scores use scikit-learn's default pos_label=1, and this notebook
# encodes ham as 1 / spam as 0, so they describe ham detection; pass
# pos_label=0 to score the spam class instead.
dt_precision = precision_score(Y_test, dt_test)
dt_recall = recall_score(Y_test, dt_test)
dt_f1 = f1_score(Y_test, dt_test)


print("Decision Trees:\n")
print("Training Data Accuracy:", dt_train_acc)
print("Testing Data Accuracy :", dt_test_acc)

print("Precision             :", dt_precision)
print("Recall                :", dt_recall)
print("F1 Score              :", dt_f1)

Decision Tress:

Training Data Accuracy: 0.9999239659367397
Testing Data Accuracy : 0.9333941605839416
Precision             : 0.9546159813809154
Recall                : 0.9602028872415138
F1 Score              : 0.9574012837969267


### RANDOM FOREST

In [118]:
# Seed the forest so repeated runs give the same model and scores
# (same seed value as the train/test split above).
rf = RandomForestClassifier(random_state=3)
rf.fit(X_train_features, Y_train)


# Predictions on both splits
rf_train = rf.predict(X_train_features)
rf_test = rf.predict(X_test_features)


# Accuracy on train vs. test shows how far the model overfits
rf_train_acc = accuracy_score(Y_train, rf_train)
rf_test_acc = accuracy_score(Y_test, rf_test)


# NOTE: these scores use scikit-learn's default pos_label=1, and this notebook
# encodes ham as 1 / spam as 0, so they describe ham detection; pass
# pos_label=0 to score the spam class instead.
rf_precision = precision_score(Y_test, rf_test)
rf_recall = recall_score(Y_test, rf_test)
rf_f1 = f1_score(Y_test, rf_test)


print("Random Forest:\n")
print("Training Data Accuracy:", rf_train_acc)
print("Testing Data Accuracy :", rf_test_acc)

print("Precision             :", rf_precision)
print("Recall                :", rf_recall)
print("F1 Score              :", rf_f1)

Random Forest:

Training Data Accuracy: 0.9999239659367397
Testing Data Accuracy : 0.9522506082725061
Precision             : 0.9442392909896603
Recall                : 0.9976589933671479
F1 Score              : 0.9702143805729463


### Metrics Visualization

In [119]:
# One dict per metric, keyed by model abbreviation (LR / DT / RF, in that order)
train_acc_list = dict(LR=lr_train_acc, DT=dt_train_acc, RF=rf_train_acc)

test_acc_list = dict(LR=lr_test_acc, DT=dt_test_acc, RF=rf_test_acc)

precision_list = dict(LR=lr_precision, DT=dt_precision, RF=rf_precision)

recall_list = dict(LR=lr_recall, DT=dt_recall, RF=rf_recall)

f1_list = dict(LR=lr_f1, DT=dt_f1, RF=rf_f1)

In [120]:
# One single-column frame per metric, indexed by model key (LR / DT / RF).
# The first column label was misspelled "Traning Accuracy" in the original.
a1 =  pd.DataFrame.from_dict(train_acc_list, orient = 'index', columns = ["Training Accuracy"])
a2 =  pd.DataFrame.from_dict(test_acc_list, orient = 'index', columns = ["Testing Accuracy"])
a3 =  pd.DataFrame.from_dict(precision_list, orient = 'index', columns = ["Precision Score"])
a4 =  pd.DataFrame.from_dict(recall_list, orient = 'index', columns = ["Recall Score"])
a5 =  pd.DataFrame.from_dict(f1_list, orient = 'index', columns = ["F1 Score"])

# Join the metric columns side by side on the shared model index
org = pd.concat([a1, a2, a3, a4, a5], axis = 1)
org

Unnamed: 0,Traning Accuracy,Testing Accuracy,Precision Score,Recall Score,F1 Score
LR,0.970499,0.954075,0.949664,0.993757,0.971211
DT,0.999924,0.933394,0.954616,0.960203,0.957401
RF,0.999924,0.952251,0.944239,0.997659,0.970214


In [121]:
# For the input mail prediction
input_mail = ["Dear Profesor/Students Greetings from VIT Bhopal University.We are glad to announce that VIT Bhopal University (VITB), in association with the International Society for Air Breathing Engines (ISABE), is organizing the “International Conference on Sustainable Aerospace Technologies and Innovations -2025” (ICSATI-2025), during 07-08th February 2025.This a unique stage we are preparing for you to showcase your research with eminent experts from  ISRO, VSSC, Rolls Royce, TASL, DRDO, IITM, IITB, Cranfield University-UK, and the University of Patras-Greece.  All accepted papers will be published in Springer Proceedings, providing a valuable platform for disseminating your research to a global audience Important Deadlines to Note.Abstract Submission: On or Before December 30, 2024 Full-Length Paper Submission: January 15, 2025 For Conference Website- https://icsati.vercel.app/"]

# Transform the input mail with the vectorizer that was fitted on the training data
input_mail_features = feature_extraction.transform(input_mail)

# Predict with the trained Random Forest model. The original called
# dtrees.predict here, contradicting this comment and the notebook's stated
# model choice.
prediction = rf.predict(input_mail_features)

# predict() returns one label per input row; compare the single element
# explicitly (0 = spam, 1 = ham) rather than the whole array.
if prediction[0] == 0:
    print("SPAM MAIL")
else:
    print("HAM MAIL")

HAM MAIL


## Checks emails from Gmail
If a spam email is found, it is moved to the Spam folder in Gmail itself.
Print the date and time of start. The input has to be a single line: remove unwanted paragraph breaks and double quotes (`"`), since the check is based on a string input.

## With GUI

In [122]:
import imaplib
import email
import time
from email.header import decode_header
import re
import password
import threading
from tkinter import *
from tkinter import messagebox

# Gmail credentials, read from the local `password` module imported above
EMAIL = password.EMAIL  # address passed to mail.login()
PASSWORD = password.PASSWORD  # presumably a Gmail app password; confirm against the password module
IMAP_SERVER = "imap.gmail.com" # Gmail's IMAP host
CHECK_INTERVAL = 30  # passed to time.sleep() between inbox checks, i.e. seconds

# Module-level state shared by the GUI callbacks and the checker thread
stop_checker = False  # set to True by stop_checker_func() to end the polling loop
spam_count = 0  # Counter for spam emails

In [123]:
def _decode_subject(msg):
    """Return the message's Subject header as text ('' when the header is absent)."""
    raw_subject = msg["Subject"]
    if raw_subject is None:
        return ""
    fragments = []
    # An encoded header can consist of several (bytes, charset) fragments;
    # decode each one instead of keeping only the first.
    for chunk, charset in decode_header(raw_subject):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or "utf-8", errors="replace")
            except LookupError:  # charset name unknown to Python
                chunk = chunk.decode("utf-8", errors="replace")
        fragments.append(chunk)
    return "".join(fragments)


def _extract_plain_body(msg):
    """Return the first non-attachment text/plain part as text ('' if there is none)."""
    if msg.is_multipart():
        part = next(
            (
                candidate
                for candidate in msg.walk()
                if candidate.get_content_type() == "text/plain"
                and "attachment" not in str(candidate.get("Content-Disposition"))
            ),
            None,
        )
    else:
        part = msg
    if part is None:
        return ""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    # Honour the charset declared by the part; fall back to UTF-8 and never
    # raise on undecodable bytes, so one odd mail cannot stop the checker.
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="replace")
    except LookupError:
        return payload.decode("utf-8", errors="replace")


def fetch_and_predict_spam():
    """Poll the Gmail inbox and move the latest message to Spam when the model flags it.

    Logs in over IMAP with EMAIL/PASSWORD, then repeats until the module-level
    `stop_checker` flag becomes true: fetch the most recent inbox message,
    classify its cleaned plain-text body with the fitted `feature_extraction`
    vectorizer and the Random Forest model `rf` (label 0 = spam), and on a spam
    verdict copy the message to "[Gmail]/Spam", delete it from the inbox, append
    its subject to spam_emails.txt and increment `spam_count`.

    Any exception is reported with print() and ends the loop; the IMAP session
    is logged out in all cases.
    """
    global stop_checker, spam_count
    mail = None
    try:
        # Connect to Gmail's IMAP server
        mail = imaplib.IMAP4_SSL(IMAP_SERVER)
        mail.login(EMAIL, PASSWORD)

        while not stop_checker:
            mail.select("inbox")
            status, messages = mail.search(None, "ALL")
            email_ids = messages[0].split()

            if email_ids:
                latest_email_id = email_ids[-1]
                status, msg_data = mail.fetch(latest_email_id, "(RFC822)")

                for response_part in msg_data:
                    if not isinstance(response_part, tuple):
                        continue

                    msg = email.message_from_bytes(response_part[1])
                    subject = _decode_subject(msg)
                    print(f"Subject: {subject}")

                    body = _extract_plain_body(msg)
                    # Collapse whitespace to a single line and drop double quotes
                    body_cleaned = re.sub(r'\s+', ' ', body).strip()
                    body_cleaned = body_cleaned.replace('"', '')
                    print(f"Cleaned Body: {body_cleaned}")

                    # Feed the cleaned email body into the model. The notebook
                    # states Random Forest is the chosen model; the original
                    # called dtrees here.
                    input_mail_features = feature_extraction.transform([body_cleaned])
                    prediction = rf.predict(input_mail_features)

                    if prediction[0] == 0:
                        print("SPAM MAIL")
                        mail.create("[Gmail]/Spam")
                        copy_status, _ = mail.copy(latest_email_id, "[Gmail]/Spam")
                        if copy_status != "OK":
                            # Never delete a message that was not copied first
                            print("Could not copy email to Spam folder; leaving it in the inbox.")
                            continue
                        mail.store(latest_email_id, '+FLAGS', '\\Deleted')
                        mail.expunge()
                        print("Email moved to Spam folder.")

                        # Log spam email subject to a file. An explicit encoding
                        # keeps non-ASCII subjects from failing on platforms
                        # whose default is not UTF-8.
                        with open("spam_emails.txt", "a", encoding="utf-8") as file:
                            file.write(f"{subject}\n")

                        spam_count += 1  # Increment the spam count
                        # NOTE(review): this touches a Tk widget from the
                        # worker thread, as the original did; confirm that is
                        # safe for the Tk build in use.
                        update_status_label()
                    else:
                        print("HAM MAIL (Not spam).")
            else:
                print("No emails found.")

            # Wait CHECK_INTERVAL seconds in short slices so that a Stop
            # request is honoured within about a second.
            wake_at = time.monotonic() + CHECK_INTERVAL
            while not stop_checker:
                remaining = wake_at - time.monotonic()
                if remaining <= 0:
                    break
                time.sleep(min(1.0, remaining))
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Always release the IMAP session, including after an error
        if mail is not None:
            try:
                mail.logout()
            except Exception as e:
                print(f"Logout failed: {e}")

def start_checker():
    """Clear the stop flag, zero the spam counter and launch the checker on a daemon thread."""
    global stop_checker, spam_count
    # The counter restarts from zero on every start
    stop_checker, spam_count = False, 0
    threading.Thread(target=fetch_and_predict_spam, daemon=True).start()
    # Refresh the label so it shows the running state
    update_status_label()

# Function to stop the email checker
def stop_checker_func():
    """Raise the module-level stop flag and refresh the status label.

    The checker loop in fetch_and_predict_spam() reads `stop_checker` in its
    `while` condition, so it ends the next time that condition is evaluated.
    """
    global stop_checker
    stop_checker = True
    update_status_label()  # Update the label when the checker stops

def update_status_label():
    """Show either the stopped message or the running message with the current spam count."""
    text = (
        "Spam Checker Stopped !!!"
        if stop_checker
        else f"Spam Checker Running.... {spam_count} spam emails caught!"
    )
    status_label.config(text=text)

# Create the Tkinter UI: main window with a fixed initial size
root = Tk()
root.title("Email Spam Checker")
root.geometry("400x250")

# Heading label; widgets are packed in creation order
Label(root, text="Email Spam Checker", font=("Helvetica", 16)).pack(pady=20)

# Status label to show the current status and spam count
# (update_status_label() rewrites its text)
status_label = Label(root, text="Spam Checker Stopped !!!", font=("Helvetica", 12))
status_label.pack(pady=10)

# Start button: runs start_checker(), which launches the checker thread
start_button = Button(root, text="Start Checker", font=("Helvetica", 12), command=start_checker, bg="green", fg="white")
start_button.pack(pady=10)

# Stop button: runs stop_checker_func(), which sets the stop flag
stop_button = Button(root, text="Stop Checker", font=("Helvetica", 12), command=stop_checker_func, bg="red", fg="white")
stop_button.pack(pady=10)

# Run the Tkinter event loop
root.mainloop()

## Without GUI
Checks the Gmail inbox only once and stops execution after checking the latest email.

In [124]:
# import imaplib
# import password
# import email
# from email.header import decode_header
# import re  # For cleaning up the email body

# # Gmail credentials
# EMAIL = password.EMAIL  # Replace with your Gmail address
# PASSWORD = password.PASSWORD  # Replace with the app password you generated
# IMAP_SERVER = "imap.gmail.com"  # IMAP server for Gmail

# def fetch_and_predict_spam():
#     try:
#         # Connect to Gmail's IMAP server
#         mail = imaplib.IMAP4_SSL(IMAP_SERVER)
#         mail.login(EMAIL, PASSWORD)

#         # Select the inbox folder
#         mail.select("inbox")

#         # Search for all emails in the inbox
#         status, messages = mail.search(None, "ALL")
#         email_ids = messages[0].split()

#         if email_ids:
#             latest_email_id = email_ids[-1]  # Get the last email
#             status, msg_data = mail.fetch(latest_email_id, "(RFC822)")

#             for response_part in msg_data:
#                 if isinstance(response_part, tuple):
#                     # Parse email content
#                     msg = email.message_from_bytes(response_part[1])
#                     subject, encoding = decode_header(msg["Subject"])[0]
#                     if isinstance(subject, bytes):
#                         # Decode subject
#                         subject = subject.decode(encoding if encoding else "utf-8")
#                     print(f"Subject: {subject}")

#                     # Extract email body
#                     body = ""
#                     if msg.is_multipart():
#                         for part in msg.walk():
#                             content_type = part.get_content_type()
#                             content_disposition = str(part.get("Content-Disposition"))

#                             if content_type == "text/plain" and "attachment" not in content_disposition:
#                                 # Get the email body
#                                 body = part.get_payload(decode=True).decode("utf-8")
#                                 break
#                     else:
#                         # Email is not multipart
#                         body = msg.get_payload(decode=True).decode("utf-8")

#                     # Clean up email body
#                     body_cleaned = re.sub(r'\s+', ' ', body).strip()  # Remove newlines and extra spaces
#                     body_cleaned = body_cleaned.replace('"', '')  # Remove double quotes
#                     print(f"Cleaned Body: {body_cleaned}")

#                     # Feed the cleaned email body into the model
#                     input_mail_features = feature_extraction.transform([body_cleaned])
#                     prediction = rf.predict(input_mail_features)

#                     if prediction == 0:
#                         print("SPAM MAIL")
#                         # Move the email to the Spam folder
#                         mail.create("[Gmail]/Spam")  # Ensure the Spam folder exists
#                         mail.copy(latest_email_id, "[Gmail]/Spam")  # Copy the email to Spam
#                         mail.store(latest_email_id, '+FLAGS', '\\Deleted')  # Mark it for deletion
#                         mail.expunge()  # Expunge to permanently delete it from Inbox
#                         print("Email moved to Spam folder.")
#                     else:
#                         print("HAM MAIL (Not spam).")

#         else:
#             print("No emails found.")

#         # Logout from the server
#         mail.logout()

#     except Exception as e:
#         print(f"An error occurred: {e}")

# # Run the function
# fetch_and_predict_spam()
