In [1]:
pip install pandas numpy scikit-learn spacy joblib fastapi uvicorn python-multipart


Active code page: 1252Note: you may need to restart the kernel to use updated packages.



In [2]:
import os
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


In [3]:
# Location of the combined e-mail dataset; edit this to match your local checkout
file_path = r"C:\Users\satya\EmailClassifierProject\email-classification\data\combined_emails_with_natural_pii.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Quick sanity check: dimensions first, then a preview of the leading rows
print("Shape:", df.shape)
df.head(5)


Shape: (24000, 2)


Unnamed: 0,email,type
0,Subject: Unvorhergesehener Absturz der Datenan...,Incident
1,Subject: Customer Support Inquiry\n\nSeeking i...,Request
2,Subject: Data Analytics for Investment\n\nI am...,Request
3,Subject: Krankenhaus-Dienstleistung-Problem\n\...,Incident
4,"Subject: Security\n\nDear Customer Support, I ...",Request


In [None]:
# One combined pattern, scanned once over the ORIGINAL text. At any given position the
# alternatives are tried in the order listed, so an earlier entry wins when two could
# start at the same spot (e.g. `dob` is tried before `expiry_no` for "12/05/1990").
# Only the top-level groups are capturing, so `match.lastgroup` is the entity type.
_PII_PATTERN = re.compile(
    # Full name heuristic: two consecutive Capitalized words. This also hits ordinary
    # capitalized phrases such as "Customer Support"; it is a heuristic, not NER.
    r"(?P<full_name>\b[A-Z][a-z]+ [A-Z][a-z]+\b)"
    # Email: the domain part may contain digits (e.g. mail123.example.com)
    r"|(?P<email>\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.\w+\b)"
    # Phone number: exactly 10 digits
    r"|(?P<phone_number>\b\d{10}\b)"
    # DOB: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy
    r"|(?P<dob>\b\d{4}[-/]\d{2}[-/]\d{2}\b|\b\d{2}[-/]\d{2}[-/]\d{4}\b)"
    # Aadhar number: exactly 12 digits
    r"|(?P<aadhar_num>\b\d{12}\b)"
    # Credit/debit card: 4 groups of 4 digits, separators only BETWEEN groups so the
    # match never swallows the whitespace that follows the number
    r"|(?P<credit_debit_no>\b\d{4}(?:[- ]?\d{4}){3}\b)"
    # CVV: any standalone 3-digit number (deliberately broad)
    r"|(?P<cvv_no>\b\d{3}\b)"
    # Expiry: MM/YY
    r"|(?P<expiry_no>\b(?:0[1-9]|1[0-2])/\d{2}\b)"
)


def mask_pii(text):
    """Replace PII-looking spans in ``text`` with ``[entity_type]`` placeholders.

    Parameters
    ----------
    text : str
        Raw e-mail text. ``None`` and float NaN (what pandas yields for empty CSV
        cells) are treated as an empty e-mail; any other non-string is converted
        with ``str()``.

    Returns
    -------
    tuple[str, list[dict]]
        ``(masked_text, entity_list)``. Each entity is a dict with keys
        ``"position"`` (``[start, end]`` offsets into the ORIGINAL text, so
        ``text[start:end] == entity``), ``"classification"`` (the entity type) and
        ``"entity"`` (the matched substring). Entities are listed in order of
        appearance in the text.
    """
    # Missing values: nothing to mask, nothing to report
    if text is None or (isinstance(text, float) and text != text):
        return "", []
    text = str(text)

    entity_list = []
    pieces = []   # alternating untouched text / placeholders, joined at the end
    cursor = 0    # end offset of the previous match in the original text

    # finditer yields non-overlapping matches left to right, so every offset below
    # refers to the unmodified input rather than to partially masked text.
    for match in _PII_PATTERN.finditer(text):
        entity_type = match.lastgroup
        start, end = match.span()
        entity_list.append({
            "position": [start, end],
            "classification": entity_type,
            "entity": match.group()
        })
        pieces.append(text[cursor:start])
        pieces.append(f"[{entity_type}]")
        cursor = end

    pieces.append(text[cursor:])
    return "".join(pieces), entity_list


In [12]:
# Map the raw CSV headers onto the column names the rest of the notebook expects
column_aliases = {'email': 'email_body', 'type': 'category'}
df = df.rename(columns=column_aliases)

# Keep only the masked text (first element of mask_pii's result) for every e-mail body
df['masked_email'] = df['email_body'].map(lambda body: mask_pii(body)[0])
df[['email_body', 'masked_email']].head()


Unnamed: 0,email_body,masked_email
0,Subject: Unvorhergesehener Absturz der Datenan...,Subject: [full_name] der Datenanalyse-Plattfor...
1,Subject: Customer Support Inquiry\n\nSeeking i...,Subject: [full_name] Inquiry\n\nSeeking inform...
2,Subject: Data Analytics for Investment\n\nI am...,Subject: [full_name] for Investment\n\nI am co...
3,Subject: Krankenhaus-Dienstleistung-Problem\n\...,Subject: Krankenhaus-Dienstleistung-Problem\n\...
4,"Subject: Security\n\nDear Customer Support, I ...","Subject: Security\n\n[full_name] Support, I am..."


In [13]:
# Confirm the renamed and newly added columns are all present
print(list(df.columns))


['email_body', 'category', 'masked_email']


In [14]:
# Features: PII-masked e-mail text. Labels: the category column, coerced to plain strings.
X = df['masked_email']
y = df['category'].astype(str)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [15]:
# TF-IDF features feeding a multinomial Naive Bayes classifier, bundled as one estimator
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit the vectorizer and the classifier on the training split
text_clf.fit(X_train, y_train)


In [16]:
# Score the held-out split and print per-class precision / recall / F1
y_pred = text_clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

      Change       0.98      0.09      0.16       479
    Incident       0.61      0.99      0.75      1920
     Problem       0.33      0.01      0.02      1009
     Request       0.78      0.91      0.84      1392

    accuracy                           0.67      4800
   macro avg       0.68      0.50      0.44      4800
weighted avg       0.64      0.67      0.57      4800



In [17]:
# Destination for the fitted pipeline (vectorizer + classifier stored as one artifact)
model_path = r"C:\Users\satya\EmailClassifierProject\email-classification\saved_models\classifier_model.pkl"

# joblib.dump does not create missing folders, so make sure the target directory
# exists before writing; exist_ok=True makes re-running this cell harmless.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(text_clf, model_path)

print(f"Model saved to: {model_path}")


Model saved to: C:\Users\satya\EmailClassifierProject\email-classification\saved_models\classifier_model.pkl
