In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
import joblib

# ------------------ PII Masking and Entity Tracking Function ------------------
def mask_pii_and_track(text):
    masked_text = text
    masked_entities = []
    patterns = {
        'full_name': r'\b([A-Z][a-z]+\s[A-Z][a-z]+)\b',
        'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
        'phone_number': r'\b(\+91[-\s]?|0)?[6-9]\d{9}\b',
        'dob': r'\b(0?[1-9]|[12][0-9]|3[01])[- /.](0?[1-9]|1[012])[- /.](\d{4})\b',
        'aadhar_num': r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
        'credit_debit_no': r'\b(?:\d[ -]*?){13,16}\b',
        'cvv_no': r'\b\d{3}\b',
        'expiry_no': r'\b(0[1-9]|1[0-2])\/\d{2,4}\b'
    }

    for entity_type, pattern in patterns.items():
        for match in re.finditer(pattern, text):
            start, end = match.span()
            original_entity = match.group(0)
            masked_entities.append({
                'position': [start, end],
                'classification': entity_type,
                'entity': original_entity
            })
            masked_text = re.sub(re.escape(original_entity), f'[{entity_type}]', masked_text, count=1)
s

# ------------------ Data Loading ------------------
def load_data(csv_file):
    return pd.read_csv(csv_file)

# ------------------ Subcategory Assignment ------------------
def assign_subcategory(text, category):
    text = text.lower()
    if category == 'Incident':
        if re.search(r'\b(software|application|program)\b', text):
            return 'Software Malfunction'
        elif re.search(r'\b(access|login|authentication)\b', text):
            return 'Access Issues'
        elif re.search(r'\b(error|failure|crash|bug)\b', text):
            return 'System Errors'
        elif re.search(r'\b(troubleshoot|fix
    return masked_text, masked_entitie|resolve|diagnose)\b', text):
            return 'Technical Troubleshooting'
        elif re.search(r'\b(data|information|corrupt|missing)\b', text):
            return 'Data Issues'
        else:
            return 'General Incident'
    elif category == 'Request':
        if re.search(r'\b(information|details|clarification|query)\b', text):
            return 'Request for Information'
        elif re.search(r'\b(account|profile|update|modify|change)\b', text):
            return 'Account/Profile Update'
        elif re.search(r'\b(integration|connect|api|access)\b', text):
            return 'Integration/Feature Access'
        elif re.search(r'\b(contact|phone|email|reach)\b', text):
            return 'Contact Request'
        else:
            return 'General Request'
    elif category == 'Problem':
        if re.search(r'\b(recurring|repeat|consistent)\b', text):
            return 'Recurring Technical Failure'
        elif re.search(r'\b(bug|glitch|defect)\b', text):
            return 'Bug/Glitches'
        elif re.search(r'\b(escalate|urgent|critical|priority)\b', text):
            return 'Escalated Issue'
        elif re.search(r'\b(data|database|loss|integrity)\b', text):
            return 'Data-Related Problem'
        else:
            return 'General Problem'
    elif category == 'Change':
        if re.search(r'\b(request|modify|adjust|alter|configure)\b', text):
            return 'Configuration Change Request'
        elif re.search(r'\b(tool|system|update|upgrade|patch)\b', text):
            return 'System/Tool Update'
        elif re.search(r'\b(enable|activate|disable|permission|role)\b', text):
            return 'Feature Enablement/Access Rights'
        elif re.search(r'\b(environment|setup|deployment|infrastructure)\b', text):
            return 'Environment/Setup Change'
        else:
            return 'General Change Request'
    return 'Uncategorized Subtype'

# ------------------ Training the Model ------------------
def train_model(data):
    data['masked_text'] = data['email'].apply(lambda x: mask_pii_and_track(x)[0])

    # Oversampling to balance classes
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(data[['masked_text']], data['type'])
    X_train, X_test, y_train, y_test = train_test_split(X_resampled['masked_text'], y_resampled, test_size=0.2, random_state=42)

    # Pipeline and GridSearchCV
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', RandomForestClassifier(random_state=42))
    ])

    param_grid = {
        'clf__n_estimators': [100, 150],
        'clf__max_depth': [None, 10, 20],
        'tfidf__max_df': [0.75, 1.0],
        'tfidf__min_df': [1, 3]
    }

    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='f1_weighted', verbose=1, n_jobs=-1)
    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return grid.best_estimator_

# ------------------ Email Classification ------------------
def classify_email(email_text, model):
    masked_email, masked_entities = mask_pii_and_track(email_text)
    predicted_label = model.predict([masked_email])[0]
    return masked_email, predicted_label

# ------------------ MAIN ------------------
if __name__ == "__main__":
    data = load_data('combined_emails_with_natural_pii.csv')
    model = train_model(data)

    sample_email = """
    Hello, I can’t access my account and the password reset link isn’t working.
    Please assist urgently. My email is test.user@company.com.
    """

    masked_email, category = classify_email(sample_email, model)
    subcategory = assign_subcategory(masked_email, category)

    print("\nMasked Email:\n", masked_email)
    print("Predicted Category:", category)
    print("Predicted Subcategory:", subcategory)




Fitting 3 folds for each of 24 candidates, totalling 72 fits
Classification Report:
               precision    recall  f1-score   support

      Change       0.98      0.99      0.98      1834
    Incident       0.80      0.91      0.85      2005
     Problem       0.91      0.79      0.85      1975
     Request       0.96      0.95      0.96      1855

    accuracy                           0.91      7669
   macro avg       0.91      0.91      0.91      7669
weighted avg       0.91      0.91      0.91      7669


Masked Email:
 
    Hello, I can’t access my account and the password reset link isn’t working.
    Please assist urgently. My email is [email].
    
Predicted Category: Incident
Predicted Subcategory: Access Issues
