# Email Spam Classifier Using Gmail and Kaggle Dataset

In [17]:
#Importing required libraries
import os
import base64
import pickle
import logging
import pandas as pd
import numpy as np
import re
import string

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [21]:
# Set up logging at INFO level so the progress messages logged below are shown
logging.basicConfig(level=logging.INFO)

# OAuth scope requested from the Gmail API: read-only access to the mailbox.
# The Gmail API must be enabled for the project in the Google developer console.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

# Gmail authentication is required: sign in with the Gmail account whose messages you want to read
def authenticate_gmail():
    """
    Authenticate against the Gmail API and return a service client.

    Cached credentials are read from 'token.pickle' when that file exists.
    Expired credentials that carry a refresh token are refreshed; otherwise
    the installed-app OAuth flow is run from 'credentials.json'. Whenever
    credentials are refreshed or newly obtained they are written back to
    'token.pickle'.

    Returns:
    - service: Gmail v1 service object created by googleapiclient's build()
    """
    logging.info("Authenticating with Gmail API")
    creds = None

    # If a corrupted token.pickle exists, delete it and fall through to
    # re-authentication. A truncated file raises EOFError, but garbled content
    # raises pickle.UnpicklingError (or one of the other exceptions the pickle
    # documentation lists), so all of them are treated as "corrupted".
    if os.path.exists('token.pickle'):
        try:
            with open('token.pickle', 'rb') as token:
                # NOTE: unpickling can execute arbitrary code; only load a
                # token.pickle that this script itself wrote.
                creds = pickle.load(token)
        except (EOFError, pickle.UnpicklingError, AttributeError,
                ImportError, IndexError):
            logging.warning("token.pickle is corrupted. Deleting and re-authenticating.")
            os.remove('token.pickle')
            creds = None

    # Without valid credentials, either refresh the expired ones or run the
    # full OAuth flow (this needs a properly configured credentials.json),
    # then cache the result for the next run.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('gmail', 'v1', credentials=creds)
    return service

# Fetch emails from Gmail
def fetch_gmail_data(service, max_results=50):
    """
    Collect the snippet text of the messages listed for the authenticated user.

    Parameters:
    - service: Gmail API service object
    - max_results: value passed as maxResults to the message list call

    Returns:
    - list of snippet strings, one per listed message ('' when a message
      carries no snippet); empty list when no messages are listed
    """
    logging.info("Fetching emails from Gmail")
    listing = service.users().messages().list(userId='me', maxResults=max_results).execute()
    # The list call only yields message ids, so each message is fetched
    # individually to read its snippet.
    return [
        service.users().messages().get(userId='me', id=ref['id']).execute().get('snippet', '')
        for ref in listing.get('messages', [])
    ]

# Preprocessing function: lowercases the text and removes digits, punctuation and extra whitespace
def preprocess_text(text):
    """
    Normalize a raw text snippet for vectorization.

    The text is lowercased, digits and ASCII punctuation are removed, and
    runs of whitespace are collapsed to single spaces with the ends trimmed.

    Parameters:
    - text: raw snippet; None or NaN (as pandas yields for empty CSV cells)
      is treated as an empty snippet, other non-string values are converted
      with str()

    Returns:
    - cleaned text as a string ('' for missing input)
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load the static dataset and rename its 'Message' column to 'Snippet' so it
# lines up with the Gmail frame built below; then clean the text.
mail_data_df = pd.read_csv("mail_data.csv")
mail_data_df = mail_data_df.rename(columns={'Message': 'Snippet'})
mail_data_df['Cleaned_Snippet'] = mail_data_df['Snippet'].apply(preprocess_text)

# Get Gmail data: authenticate, then fetch message snippets
# (fetch_gmail_data defaults to max_results=50).
service = authenticate_gmail()
gmail_snippets = fetch_gmail_data(service)
gmail_df = pd.DataFrame({'Snippet': gmail_snippets})
gmail_df['Category'] = 'ham'  # Every fetched message is labelled ham without inspection
# NOTE(review): any spam sitting in the mailbox is therefore trained on as ham -- confirm this is acceptable.
gmail_df['Cleaned_Snippet'] = gmail_df['Snippet'].apply(preprocess_text)

# Combine both datasets: static CSV rows + live Gmail snippets.
# NOTE(review): the Gmail rows add 'ham' examples only, so this does not add
# any spam examples to the training data.
combined_df = pd.concat([mail_data_df, gmail_df], ignore_index=True)

# Prepare data: cleaned text as features, category as the label
X = combined_df['Cleaned_Snippet']
y = combined_df['Category'].astype(str)  # Ensure labels are strings

# Train-test split: 70/30, stratified on the label, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# TF-IDF vectorizer: converts text into weighted term features (a refinement of
# the bag-of-words model). English stop words are dropped and max_df=0.95 is set.
# It is fitted on the training split only; the test split is only transformed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Logistic regression model with default hyperparameters
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predictions on the held-out test split
y_pred = model.predict(X_test_tfidf)

# Evaluation: overall accuracy plus the per-class report
# (zero_division=0 is passed to classification_report)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
def predict_spam(text, model, vectorizer, threshold=0.3):
    """
    Classify one email snippet as spam or ham using a probability cut-off.

    Parameters:
    - text: raw email snippet (cleaned with preprocess_text before scoring)
    - model: fitted classifier exposing predict_proba and classes_, with
      'spam' among its classes
    - vectorizer: fitted vectorizer used to transform the cleaned text
    - threshold: minimum spam probability at which the snippet is labelled spam

    Returns:
    - prediction: 'spam' when the spam probability is >= threshold, else 'ham'
    - probability: the model's probability for the 'spam' class
    """
    features = vectorizer.transform([preprocess_text(text)])
    class_probabilities = model.predict_proba(features)[0]
    # Look the column up by label rather than assuming a fixed class order.
    spam_column = model.classes_.tolist().index('spam')
    spam_probability = class_probabilities[spam_column]

    if spam_probability >= threshold:
        return 'spam', spam_probability
    return 'ham', spam_probability
# Sample snippets used to spot-check the trained classifier
test_cases = [
    "Your Amazon order has been shipped. Track your package here.",
    "Win a free vacation to Maldives! Limited time offer.",
    "Reminder: Your meeting with the professor is scheduled for tomorrow at 10 AM.",
    "Your Account has been compromised click here for help!",
    "Hey, can you send me the assignment file before class?"
]
# Classify each sample (numbered from 1) and print its label and spam probability
for case_number, email in enumerate(test_cases, start=1):
    label, spam_prob = predict_spam(email, model, tfidf_vectorizer, threshold=0.15)
    print(
        f"Test Case {case_number}:",
        f"  Text: {email}",
        f"  Prediction: {label}",
        f"  Spam Probability: {spam_prob:.2f}",
        sep="\n",
    )
   



INFO:root:Authenticating with Gmail API
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:root:Fetching emails from Gmail


Accuracy: 0.9573206876111441
Classification Report:
               precision    recall  f1-score   support

         ham       0.95      1.00      0.98      1463
        spam       1.00      0.68      0.81       224

    accuracy                           0.96      1687
   macro avg       0.98      0.84      0.89      1687
weighted avg       0.96      0.96      0.95      1687

Test Case 1:
  Text: Your Amazon order has been shipped. Track your package here.
  Prediction: ham
  Spam Probability: 0.13
Test Case 2:
  Text: Win a free vacation to Maldives! Limited time offer.
  Prediction: spam
  Spam Probability: 0.45
Test Case 3:
  Text: Reminder: Your meeting with the professor is scheduled for tomorrow at 10 AM.
  Prediction: ham
  Spam Probability: 0.06
Test Case 4:
  Text: Your Account has been compromised click here for help!
  Prediction: spam
  Spam Probability: 0.17
Test Case 5:
  Text: Hey, can you send me the assignment file before class?
  Prediction: ham
  Spam Probability: 0