In [None]:
# Install the fuzzy string-matching dependency into the running kernel's
# environment. `%pip` is used instead of a bare `pip` so the cell does not
# depend on IPython's automagic setting.
%pip install fuzzywuzzy




In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from fuzzywuzzy import process
from sklearn.utils import resample

In [None]:

# Make sure the NLTK stopword corpus is available locally, then keep the
# English list as a set for fast membership tests during text cleaning.
nltk.download("stopwords")
stop_words = {word for word in stopwords.words("english")}

# Read the raw e-mail dataset into a DataFrame.
df = pd.read_csv("/content/mail_data.csv")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Clean text function
def clean_text(text, stopword_set=None):
    """Normalize a raw message into lowercase, stopword-free text.

    Steps, in order: strip URLs, replace every non-word character with a
    space, lowercase, then drop stopwords and rejoin on single spaces.

    Args:
        text: Raw message. Missing values (``None``/``NaN``) yield ``""``;
            any other non-string value is converted with ``str()``.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if pd.isna(text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words
    text = str(text)
    # IGNORECASE: URLs are stripped before lowercasing, so "HTTP://..." or
    # "WWW..." would otherwise survive and be split into junk tokens.
    # ("https" needs no separate alternative: "http\S+" already matches it.)
    text = re.sub(r"http\S+|www\S+", "", text, flags=re.IGNORECASE)  # Remove URLs
    text = re.sub(r"\W", " ", text)  # Remove special characters
    # Lowercase, tokenize on whitespace, and drop stopwords.
    return " ".join(word for word in text.lower().split() if word not in stopword_set)

# Add a "clean_text" column holding the cleaned form of each raw "Message".
df["clean_text"] = df["Message"].apply(clean_text)

In [None]:
# Keyword lists per e-mail category. Both the order of the categories and the
# order of the keywords inside each list are preserved exactly as authored.
categories = {
    # General / official correspondence
    "Primary": [
        "meeting", "schedule", "report", "deadline",
        "appointment", "conference", "memo", "notification",
        "announcement", "official", "request", "update",
        "reminder",
    ],
    # Marketing and sales offers
    "Promotions": [
        "discount", "sale", "offer", "limited-time",
        "deal", "promo", "exclusive", "voucher",
        "coupon", "save", "clearance", "cashback",
        "redeem", "flash sale", "gift", "reward",
    ],
    # Social-network notifications
    "Social": [
        "Facebook", "Twitter", "LinkedIn", "invitation",
        "follow", "comment", "like", "mention",
        "message", "friend request", "tagged", "shared",
        "chat", "DM",
    ],
    # Typical scam / junk phrasing
    "Spam": [
        "winner", "lottery", "free money", "urgent",
        "congratulations", "claim", "prize", "guaranteed",
        "limited offer", "click here", "no cost", "100% free",
        "risk-free", "investment opportunity", "exclusive deal", "one-time offer",
    ],
    # Day-to-day job communication
    "Work": [
        "project", "boss", "team", "update",
        "task", "assignment", "collaboration", "report",
        "presentation", "client", "deadline", "status update",
        "meeting notes", "strategy",
    ],
    # Family and friends
    "Personal": [
        "family", "dinner", "vacation", "call",
        "love", "birthday", "weekend", "mom",
        "dad", "brother", "sister", "friend",
        "catch-up", "get-together", "anniversary", "party",
    ],
}


In [None]:
# Improved Email Classification using Fuzzy Matching
def classify_email(text, threshold=70):
    """Assign a category to an e-mail by fuzzy-matching category keywords.

    Every keyword in the notebook-level ``categories`` mapping is compared
    against the whitespace-separated words of ``text`` with
    ``process.extractOne``; the category that owns the highest-scoring
    keyword wins. On a tie the category encountered first is kept.

    Args:
        text: E-mail text to classify.
        threshold: Score the best match must exceed; otherwise "Custom" is
            returned. Defaults to 70.

    Returns:
        The winning category name, or "Custom" when no keyword scores above
        ``threshold`` or ``text`` contains no words.
    """
    # Tokenize once: the word list is identical for every keyword.
    words = text.lower().split()
    if not words:
        # Nothing to match against, so skip the fuzzy matcher entirely.
        return "Custom"

    best_match = None
    highest_score = 0

    for category, keywords in categories.items():
        for keyword in keywords:
            # extractOne yields a (matched_word, score) pair, or a falsy
            # value when it finds nothing.
            match = process.extractOne(keyword, words)
            if match and match[1] > highest_score:
                highest_score = match[1]
                best_match = category

    return best_match if highest_score > threshold else "Custom"

# Label every row by running the keyword classifier on its cleaned text.
df["category"] = df["clean_text"].apply(classify_email)


In [None]:
# Balance the dataset: cap the "Custom" class at the combined size of all
# other categories so that it cannot dominate training.
df_custom = df[df["category"] == "Custom"]
df_others = df[df["category"] != "Custom"]

# Never request more rows than "Custom" actually has: sampling without
# replacement cannot return more rows than exist.
n_samples_to_match = min(len(df_custom), len(df_others))

df_custom_downsampled = resample(
    df_custom,
    replace=False,  # downsample only; rows are never duplicated
    n_samples=n_samples_to_match,
    random_state=42,  # fixed seed so the same rows are kept on every run
)

# Stack the (possibly reduced) "Custom" rows on top of all remaining rows.
df_balanced = pd.concat([df_custom_downsampled, df_others])


In [None]:

# Encode categories as integer labels
label_encoder = LabelEncoder()
df_balanced["category_encoded"] = label_encoder.fit_transform(df_balanced["category"])
y = df_balanced["category_encoded"]

# Split the text BEFORE vectorizing, so the TF-IDF vocabulary and IDF weights
# are learned from the training rows only (no test-set leakage).
text_train, text_test, y_train, y_test = train_test_split(
    df_balanced["clean_text"], y, test_size=0.2, random_state=42
)

# Convert email text to TF-IDF features (fit on train, reuse on test)
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Feature matrix for every row, kept so any later cell that reads `X` still works.
X = tfidf_vectorizer.transform(df_balanced["clean_text"])

In [None]:
# Random Forest: build a 200-tree ensemble with a fixed seed and fit it on
# the training split in one step (`fit` returns the estimator itself).
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Predict the held-out split and report accuracy to two decimals.
y_pred_rf = rf_model.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")

Random Forest Accuracy: 0.68


In [None]:
# Linear-kernel SVM: construct and fit on the training split in one step
# (`fit` returns the estimator itself).
svm_model = SVC(kernel="linear", random_state=42).fit(X_train, y_train)

# Predict the held-out split and report accuracy to two decimals.
y_pred_svm = svm_model.predict(X_test)
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm):.2f}")


SVM Accuracy: 0.62


In [None]:

# Multinomial Naïve Bayes with default settings: construct and fit on the
# training split in one step (`fit` returns the estimator itself).
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict the held-out split and report accuracy to two decimals.
y_pred_nb = nb_model.predict(X_test)
print(f"Naïve Bayes Accuracy: {accuracy_score(y_test, y_pred_nb):.2f}")

Naïve Bayes Accuracy: 0.51


In [None]:
# Test Predictions
# NOTE(review): this short list is reassigned by the longer `test_emails`
# list in the next cell before anything reads it, so it has no effect on a
# top-to-bottom run.
test_emails = [
    "Meeting scheduled for tomorrow",
    "Exclusive sale! Get 70% off today",
    "Congratulations! You won a lottery",
    "Let's plan a family dinner this weekend"
]

In [None]:
# Hand-written sample messages for a manual sanity check of the trained models.
# Most reuse words from the `categories` keyword lists (e.g. "meeting",
# "flash sale", "family"); the last one contains none of them verbatim,
# presumably to probe the "Custom" fallback — TODO confirm.
test_emails = [
    "Reminder: Your meeting with the project team is scheduled for tomorrow at 10 AM.",
    "Flash Sale! Get 50% off on all electronic items. Limited-time offer, shop now!",
    "John mentioned you in a comment on Facebook. Click here to view the conversation.",
    "Congratulations! You have won a free iPhone. Claim your prize now before it expires!",
    "Team, please submit your weekly status report by Friday EOD.",
    "Hey, let's plan a family dinner this weekend. Let me know your availability!",
    "Hello! Just checking in. How have you been?"
]

In [None]:



# Short display label -> fitted classifier, in the order results are printed.
models_by_label = (("RF", rf_model), ("SVM", svm_model), ("NB", nb_model))

for email in test_emails:
    # Apply the same cleaning used to build the training text, so the
    # vectorizer sees input in the form it was fitted on.
    new_email_tfidf = tfidf_vectorizer.transform([clean_text(email)])

    print(f"Email: {email}")
    for label, model in models_by_label:
        # Map the predicted integer label back to its category name.
        predicted_category = label_encoder.inverse_transform(model.predict(new_email_tfidf))
        print(f"{label} Prediction: {predicted_category[0]}")
    print("-" * 50)


Email: Reminder: Your meeting with the project team is scheduled for tomorrow at 10 AM.
RF Prediction: Primary
SVM Prediction: Work
NB Prediction: Primary
--------------------------------------------------
Email: Flash Sale! Get 50% off on all electronic items. Limited-time offer, shop now!
RF Prediction: Promotions
SVM Prediction: Promotions
NB Prediction: Promotions
--------------------------------------------------
Email: John mentioned you in a comment on Facebook. Click here to view the conversation.
RF Prediction: Social
SVM Prediction: Social
NB Prediction: Social
--------------------------------------------------
Email: Congratulations! You have won a free iPhone. Claim your prize now before it expires!
RF Prediction: Spam
SVM Prediction: Spam
NB Prediction: Spam
--------------------------------------------------
Email: Team, please submit your weekly status report by Friday EOD.
RF Prediction: Work
SVM Prediction: Primary
NB Prediction: Primary
--------------------------------

In [None]:
import pickle

# Every fitted object needed to run predictions later, keyed by the file it
# is pickled to. Insertion order is the order in which the files are written.
artifacts = {
    'rf_model.pkl': rf_model,                  # Random Forest model
    'svm_model.pkl': svm_model,                # SVM model
    'nb_model.pkl': nb_model,                  # Naive Bayes model
    'tfidf_vectorizer.pkl': tfidf_vectorizer,  # TF-IDF vectorizer
    'label_encoder.pkl': label_encoder,        # Label Encoder
}

for filename, fitted_object in artifacts.items():
    # Binary write mode is required for pickle output.
    with open(filename, 'wb') as file:
        pickle.dump(fitted_object, file)

print("Models and vectorizer saved successfully!")


Models and vectorizer saved successfully!


In [None]:
from google.colab import files

# Hand each saved artifact to Colab's download helper, one file at a time.
for artifact_name in (
    'rf_model.pkl',
    'svm_model.pkl',
    'nb_model.pkl',
    'tfidf_vectorizer.pkl',
    'label_encoder.pkl',
):
    files.download(artifact_name)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>