<a href="https://colab.research.google.com/github/RhenisonS/NLP_Project_Automated-Email-Classification-Using-GenAI-Enhanced-Models/blob/main/nlp_proj_Automated_Email_Classification_Using_GenAI_Enhanced_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install -q scikit-learn transformers sentence-transformers pandas

In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score

from sentence_transformers import SentenceTransformer


In [19]:
# Toy corpus: every entry pairs an email text with its category, so a text
# and its label cannot drift out of alignment.
labelled_emails = [
    ("Win a free iPhone now", "Spam"),
    ("Big sale on electronics this weekend", "Promotions"),
    ("Your order has not been delivered", "Support"),
    ("Hey are we meeting tomorrow", "Personal"),
    ("Limited time offer buy now", "Promotions"),
    ("My internet is not working please help", "Support"),
    ("Happy birthday have a great day", "Personal"),
    ("Claim your lottery prize", "Spam"),
]

# Column-oriented view of the same pairs, in the original row order.
data = {
    "email": [text for text, category in labelled_emails],
    "label": [category for text, category in labelled_emails],
}

df = pd.DataFrame(data)

In [20]:
# Hold out half of the rows for evaluation. Stratifying on the label keeps
# the class mix the same in both halves, and the fixed seed makes the split
# repeatable across runs.
emails, labels = df["email"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    emails, labels, test_size=0.5, random_state=42, stratify=labels
)

In [21]:
# Baseline: TF-IDF features (English stop words removed, vocabulary capped
# at 5000 terms) fed into a logistic-regression classifier.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)

# The vocabulary is fitted on the training emails only; the test emails are
# just projected onto it.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# fit() returns the estimator itself, so construction and training chain.
tfidf_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
tfidf_preds = tfidf_model.predict(X_test_tfidf)




In [22]:
# Evaluate the TF-IDF baseline on the held-out emails. zero_division=0 makes
# the report show 0 for classes that were never predicted.
tfidf_report = classification_report(y_test, tfidf_preds, zero_division=0)
tfidf_confusion = confusion_matrix(y_test, tfidf_preds)

print("\n===== TF-IDF MODEL =====")
print(tfidf_report)
print("Confusion Matrix:\n", tfidf_confusion)


===== TF-IDF MODEL =====
              precision    recall  f1-score   support

    Personal       0.25      1.00      0.40         1
  Promotions       0.00      0.00      0.00         1
        Spam       0.00      0.00      0.00         1
     Support       0.00      0.00      0.00         1

    accuracy                           0.25         4
   macro avg       0.06      0.25      0.10         4
weighted avg       0.06      0.25      0.10         4

Confusion Matrix:
 [[1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]]


In [23]:
# Embedding-based classifier: encode each email with a pretrained sentence
# encoder, then train logistic regression on the resulting vectors.
genai_model = SentenceTransformer("all-MiniLM-L6-v2")

# encode() is given plain Python lists of strings rather than pandas Series.
train_sentences = X_train.tolist()
test_sentences = X_test.tolist()
X_train_emb = genai_model.encode(train_sentences)
X_test_emb = genai_model.encode(test_sentences)

# fit() returns the estimator itself, so construction and training chain.
genai_clf = LogisticRegression(max_iter=1000).fit(X_train_emb, y_train)
genai_preds = genai_clf.predict(X_test_emb)

# Same evaluation as the TF-IDF baseline, on the same held-out emails.
genai_report = classification_report(y_test, genai_preds, zero_division=0)
genai_confusion = confusion_matrix(y_test, genai_preds)

print("\n===== GenAI MODEL =====")
print(genai_report)
print("Confusion Matrix:\n", genai_confusion)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



===== GenAI MODEL =====
              precision    recall  f1-score   support

    Personal       0.00      0.00      0.00         1
  Promotions       0.50      1.00      0.67         1
        Spam       1.00      1.00      1.00         1
     Support       1.00      1.00      1.00         1

    accuracy                           0.75         4
   macro avg       0.62      0.75      0.67         4
weighted avg       0.62      0.75      0.67         4

Confusion Matrix:
 [[0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]]


In [24]:
def classify_email(email_text):
    """Predict the category label for a single email.

    The text is embedded with the module-level ``genai_model`` and the
    embedding is passed to the module-level ``genai_clf``; both must have
    been created before this function is called.

    Args:
        email_text: The email text to classify.

    Returns:
        The first (and only) element of ``genai_clf.predict`` for this text.
    """
    # Wrap the text in a one-element list so the encoder and the classifier
    # both receive a batch of size one.
    single_batch = [email_text]
    embedding = genai_model.encode(single_batch)
    predictions = genai_clf.predict(embedding)
    return predictions[0]

In [25]:
# Classify one email that is not in the toy corpus, using the
# embedding-based classifier.
test_email = "Your account has been suspended verify immediately"
predicted_label = classify_email(test_email)
print("\nReal-Time Prediction:", predicted_label)



Real-Time Prediction: Support
