In [2]:
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import make_pipeline
import pickle

In [3]:
# Load the email dataset from the working directory; later cells read its
# "email" and "type" columns.
df = pd.read_csv("combined_emails_with_natural_pii.csv")

# Function to detect language
def detect_language(text):
    """Return the language code that langdetect reports for ``text``.

    Parameters
    ----------
    text : object
        Value to classify. Anything that is not a non-blank string is
        treated as undetectable.

    Returns
    -------
    str
        The code returned by ``detect`` (compared against ``'en'`` further
        down), or ``"unknown"`` when the input is not a string, is empty or
        whitespace-only, or langdetect raises ``LangDetectException``.
    """
    # Missing CSV cells arrive as NaN (a float), which is not valid input for
    # detect(); short-circuit so one bad row cannot abort the whole .apply().
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Tag every message with its detected language.
df["language"] = df["email"].apply(detect_language)

# Keep only the rows detected as English and renumber the index from zero.
df = df[df["language"] == 'en'].reset_index(drop=True)

In [4]:
# Define regex patterns for PII masking.
#
# Order matters: mask_text applies the patterns in dict insertion order and
# each substitution removes the matched digits from the text. Longer / more
# specific patterns therefore come before shorter ones that would otherwise
# consume part of the same number:
#   * credit card (16 digits) before Aadhaar (12 digits),
#   * full dates before card expiry ("12/05" is a prefix of "12/05/1990"),
#   * the bare 3-digit CVV pattern after every other numeric pattern.
patterns = {
    "email": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
    # (?<!\w) rather than a leading \b: there is no word boundary between
    # whitespace and "+", so \b kept the optional "+91" prefix from matching.
    "phone_number": r"(?<!\w)(?:\+91[-\s]?|0)?[6-9]\d{9}\b",
    "credit_debit_no": r"\b(?:\d{4}[\s-]?){3}\d{4}\b",
    "aadhar_num": r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b",
    # Day, month and year are all required; with the leading groups optional
    # the pattern matched any standalone 2-4 digit number (e.g. a bare year).
    "dob": r"\b\d{1,2}[-/\s]\d{1,2}[-/\s]\d{2,4}\b",
    "expiry_no": r"\b(0[1-9]|1[0-2])/\d{2,4}\b",
    "cvv_no": r"\b\d{3}\b",
    "full_name": r"\b([A-Z][a-z]+\s[A-Z][a-z]+)\b"
}
def mask_text(text):
    """Replace every PII match in ``text`` with a bracketed entity label.

    Each regex in ``patterns`` is applied in turn, in dict insertion order,
    and every match is replaced by ``[<entity>]`` (for example ``[email]``).

    Parameters
    ----------
    text : str
        Raw text to mask.

    Returns
    -------
    str
        The text with all matches substituted.
    """
    for entity, pattern in patterns.items():
        text = re.sub(pattern, f"[{entity}]", text)
    return text

# Mask PII in every email before any further text processing.
df["masked_email"] = df["email"].apply(mask_text)

# stopwords.words() raises LookupError when the NLTK stopwords corpus has not
# been installed locally; download it once and retry so this cell also runs
# in a fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_and_stem(text):
    """Tokenize ``text`` and return its stemmed content words as one string.

    Tokens come from ``nltk.word_tokenize``. A token is kept only if it is
    purely alphabetic and its lowercase form is not in ``stop_words``; kept
    tokens are lowercased, stemmed with ``stemmer`` and joined by single
    spaces.
    """
    stems = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        # Skip punctuation, numbers, mixed tokens and stop words.
        if not token.isalpha() or lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))
    return " ".join(stems)

# Normalise the masked text (tokenise, drop stop words, stem).
df["processed_email"] = df["masked_email"].apply(clean_and_stem)

# Features are the processed text; labels come from the "type" column.
X = df["processed_email"]
y = df["type"]

# Fixed seed so the train/test split, and therefore the saved model, is the
# same on every Restart & Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# TF-IDF vectorisation followed by a multinomial Naive Bayes classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB()
)
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline (vectoriser + classifier) as a single artifact.
# NOTE: pickle files must only be loaded from trusted sources.
with open("model.pkl", "wb") as f:
    pickle.dump(pipeline, f)