In [18]:
pip install pandas nltk scikit-learn fuzzywuzzy langdetect googletrans==4.0.0-rc1




In [19]:
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from fuzzywuzzy import process
from sklearn.utils import resample
from langdetect import detect
from googletrans import Translator

In [20]:
# Download stopwords
# Fetch the NLTK stop-word corpus, then keep the English words in a set
# (membership is tested once per token during text cleaning).
nltk.download("stopwords")
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)

# Single googletrans client shared by every translation call in the notebook.
translator = Translator()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Load dataset
# Location of the raw e-mail CSV; edit this single constant when the file lives elsewhere.
DATA_PATH = "/content/mail_data.csv"
df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message: the preprocessing cells read the "Message" column.
if "Message" not in df.columns:
    raise KeyError(
        f"Expected a 'Message' column in {DATA_PATH}; found columns: {list(df.columns)}"
    )

In [22]:
# Text preprocessing
def clean_text(text, stop_word_set=None):
    """Normalise one e-mail body into a space-separated string of lower-case tokens.

    Processing order: strip URLs (case-insensitively), replace every non-word
    character with a space, lower-case, then drop stop words.

    Args:
        text: Raw message. Missing scalars (None, NaN, pd.NA) yield ""; any other
            non-string value is converted with ``str()`` before cleaning.
        stop_word_set: Optional collection of lower-case words to remove. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text as a single string ("" when nothing is left).
    """
    if stop_word_set is None:
        stop_word_set = stop_words

    if not isinstance(text, str):
        # is_scalar guards pd.isna, which returns an array for list-likes.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    # IGNORECASE so that "HTTP://..." and "WWW...." are removed as well;
    # "http\S+" already covers https URLs.
    text = re.sub(r"http\S+|www\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\W", " ", text)
    text = text.lower()
    return " ".join(word for word in text.split() if word not in stop_word_set)

In [23]:
# Translate non-English emails
def detect_and_translate(text):
    """Return ``text`` translated to English when it is detected as another language.

    Translation is best-effort: if language detection or the translation call
    raises, the original text is returned unchanged so one bad row cannot abort
    a whole DataFrame ``apply``.

    Args:
        text: Message to translate. Non-string or blank values are returned
            as-is without calling the detector.

    Returns:
        The English translation, or the input itself when it is already
        English, blank, not a string, or could not be processed.
    """
    # Nothing to detect in missing or whitespace-only input.
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        lang = detect(text)
        if lang != 'en':
            return translator.translate(text, src=lang, dest='en').text
        return text
    except Exception:
        # Deliberately broad (detector and translation-service failures), but no
        # longer a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        return text

In [24]:
# Apply translation and cleaning
# Missing messages are replaced with "" *before* the string cast: astype(str) on its
# own converts NaN into the literal text "nan", which would then pass through
# cleaning as a real token.
df['translated'] = df['Message'].fillna("").astype(str).apply(detect_and_translate)
df['clean_text'] = df['translated'].apply(clean_text)

In [25]:
# Fuzzy keyword-based categories
# Maps each category label to the keywords that vote for it.
# Insertion order matters to the classifier: categories are scanned in this order
# and a later category only wins with a strictly higher score. "update", "report"
# and "deadline" appear under both Primary and Work.
categories = dict(
    Primary=[
        "meeting", "schedule", "report", "deadline", "appointment",
        "conference", "memo", "notification", "announcement", "official",
        "request", "update", "reminder",
    ],
    Promotions=[
        "discount", "sale", "offer", "limited-time", "deal", "promo",
        "exclusive", "voucher", "coupon", "save", "clearance", "cashback",
        "redeem", "flash sale", "gift", "reward",
    ],
    Social=[
        "Facebook", "Twitter", "LinkedIn", "invitation", "follow", "comment",
        "like", "mention", "message", "friend request", "tagged", "shared",
        "chat", "DM",
    ],
    Spam=[
        "winner", "lottery", "free money", "urgent", "congratulations",
        "claim", "prize", "guaranteed", "limited offer", "click here",
        "no cost", "100% free", "risk-free", "investment opportunity",
        "exclusive deal", "one-time offer",
    ],
    Work=[
        "project", "boss", "team", "update", "task", "assignment",
        "collaboration", "report", "presentation", "client", "deadline",
        "status update", "meeting notes", "strategy",
    ],
    Personal=[
        "family", "dinner", "vacation", "call", "love", "birthday", "weekend",
        "mom", "dad", "brother", "sister", "friend", "catch-up",
        "get-together", "anniversary", "party",
    ],
)

In [26]:

def classify_email(text, threshold=70):
    """Assign a category label to ``text`` by fuzzy-matching category keywords.

    Every keyword in ``categories`` is compared against the individual words of
    ``text`` with ``process.extractOne``. The category owning the best-scoring
    keyword wins; the first category to reach a given score keeps it, because
    later ones must score strictly higher.

    Args:
        text: Cleaned e-mail text. Non-string or blank input is labelled
            "Custom" without any matching.
        threshold: The best score must be strictly greater than this value for
            a keyword category to be returned (default 70).

    Returns:
        A key of ``categories``, or "Custom" when nothing scores above
        ``threshold``.
    """
    if not isinstance(text, str):
        return "Custom"

    # Tokenise once; the word list is the same for every keyword.
    tokens = text.lower().split()
    if not tokens:
        return "Custom"

    best_match = None
    highest_score = 0
    for category, keywords in categories.items():
        for keyword in keywords:
            match = process.extractOne(keyword, tokens)
            if match and match[1] > highest_score:
                highest_score = match[1]
                best_match = category
        # fuzzywuzzy scores top out at 100, so no later category can beat this.
        if highest_score >= 100:
            break
    return best_match if highest_score > threshold else "Custom"

# Label every cleaned message with its fuzzy keyword category ("Custom" when none matches).
df["category"] = df["clean_text"].map(classify_email)

In [27]:
# Balance the dataset
# Only the "Custom" bucket is down-sampled: it is capped at the combined size of all
# other categories, which are kept in full (they are not balanced against each other).
is_custom = df["category"] == "Custom"
df_custom = df[is_custom]
df_others = df[~is_custom]

n_samples = min(len(df_custom), len(df_others))
df_custom_balanced = resample(
    df_custom,
    replace=False,  # sample without replacement
    n_samples=n_samples,
    random_state=42,
)
df_balanced = pd.concat([df_custom_balanced, df_others])

In [28]:
# Encode target
# Category names -> integer codes; the fitted encoder is reused later to decode predictions.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df_balanced["category"])
df_balanced["category_encoded"] = encoded_labels

# TF-IDF
# NOTE(review): the vectoriser is fitted on the full balanced frame, i.e. before the
# train/test split in the next cell, so the held-out rows influence the vocabulary
# and IDF weights.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = tfidf_vectorizer.fit_transform(df_balanced["clean_text"])
y = df_balanced["category_encoded"]

In [29]:
# Train/test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:

# GridSearchCV for Random Forest
# Exhaustive search over 2 x 2 x 2 settings with 3-fold cross-validation on the
# training split, using every available core.
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10],
    min_samples_split=[2, 5],
)
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=3,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Best configuration found by the search, scored on the held-out split.
best_rf_model = grid_rf.best_estimator_
rf_test_predictions = best_rf_model.predict(X_test)
print(" Tuned Random Forest Accuracy:", accuracy_score(y_test, rf_test_predictions))

# GridSearchCV for SVM
# Linear-kernel SVC; only the regularisation strength C is searched (3-fold CV, all cores).
svm_params = dict(C=[1, 10], kernel=['linear'])
grid_svm = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=svm_params,
    cv=3,
    n_jobs=-1,
)
grid_svm.fit(X_train, y_train)

# Best configuration found by the search, scored on the held-out split.
best_svm_model = grid_svm.best_estimator_
svm_test_predictions = best_svm_model.predict(X_test)
print(" Tuned SVM Accuracy:", accuracy_score(y_test, svm_test_predictions))

# Naive Bayes (no tuning needed)
# Trained once with default hyper-parameters and scored on the same held-out split.
nb_model = MultinomialNB().fit(X_train, y_train)
nb_test_predictions = nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_test_predictions))

# Feedback loop
def user_feedback_loop(email, predicted_category):
    """Show a prediction and let the user confirm or override it interactively.

    Args:
        email: The e-mail text that was classified (printed for context).
        predicted_category: The model's predicted label.

    Returns:
        The user's answer with surrounding whitespace removed, or
        ``predicted_category`` when the answer is empty or whitespace-only.
    """
    print(f"\n Email: {email}")
    print(f" Predicted Category: {predicted_category}")
    user_input = input(" Enter correct category or press Enter to accept: ")
    # Strip before testing: an answer of only spaces must count as "accept",
    # not be returned as an empty category.
    corrected = user_input.strip()
    return corrected or predicted_category

 Tuned Random Forest Accuracy: 0.6493273542600897
 Tuned SVM Accuracy: 0.6080717488789238
Naive Bayes Accuracy: 0.4923766816143498


In [31]:
# Sample multilingual test
# Six hand-written messages (Spanish, English, Hindi and French) used as a smoke test
# of the translate -> clean -> vectorise -> predict path in the next cell.
sample_emails = [
    "¡Felicidades! Has ganado un premio. Haz clic aquí para reclamarlo.",
    "Reminder: Your project deadline is tomorrow at 5 PM.",
    "पारिवारिक डिनर इस सप्ताह के अंत में, आइए योजना बनाएं।",
    "Biggest sale of the season! Flat 70% off on all items.",
    "John tagged you in a post on Facebook.",
    "Chère équipe, n'oubliez pas de soumettre le rapport mensuel."
]

In [32]:
print("\n Predictions with Feedback Loop:")
for email in sample_emails:
    # Translate, clean, then project onto the already-fitted TF-IDF vocabulary.
    translated = detect_and_translate(email)
    cleaned = clean_text(translated)
    vec = tfidf_vectorizer.transform([cleaned])

    # Decode the Random Forest's integer prediction back to a category name.
    predicted_codes = best_rf_model.predict(vec)
    rf_pred = label_encoder.inverse_transform(predicted_codes)[0]

    # Let the user confirm or override the predicted label.
    corrected_category = user_feedback_loop(email, rf_pred)
    print(f" Final Category (after feedback): {corrected_category}")


 Predictions with Feedback Loop:

 Email: ¡Felicidades! Has ganado un premio. Haz clic aquí para reclamarlo.
 Predicted Category: Spam
 Enter correct category or press Enter to accept: 
 Final Category (after feedback): Spam

 Email: Reminder: Your project deadline is tomorrow at 5 PM.
 Predicted Category: Work
 Enter correct category or press Enter to accept: 
 Final Category (after feedback): Work

 Email: पारिवारिक डिनर इस सप्ताह के अंत में, आइए योजना बनाएं।
 Predicted Category: Personal
 Enter correct category or press Enter to accept: 
 Final Category (after feedback): Personal

 Email: Biggest sale of the season! Flat 70% off on all items.
 Predicted Category: Promotions
 Enter correct category or press Enter to accept: 
 Final Category (after feedback): Promotions

 Email: John tagged you in a post on Facebook.
 Predicted Category: Social
 Enter correct category or press Enter to accept: 
 Final Category (after feedback): Social

 Email: Chère équipe, n'oubliez pas de soumettre

In [34]:

# Save models
# Output file name -> fitted object. The vectoriser and label encoder are saved with
# the classifiers because predictions need both to encode input and decode output.
artifacts_to_save = {
    "best_rf_model.pkl": best_rf_model,
    "best_svm_model.pkl": best_svm_model,
    "nb_model.pkl": nb_model,
    "tfidf_vectorizer.pkl": tfidf_vectorizer,
    "label_encoder.pkl": label_encoder,
}
for artifact_path, artifact in artifacts_to_save.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("\n Models and vectorizer saved successfully.")



 Models and vectorizer saved successfully.


In [35]:
from google.colab import files

# Trigger a browser download for each pickled artifact, in the order they were saved.
for artifact_name in (
    "best_rf_model.pkl",
    "best_svm_model.pkl",
    "nb_model.pkl",
    "tfidf_vectorizer.pkl",
    "label_encoder.pkl",
):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>