In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import ast

In [None]:
# Load the email dataset into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks Colab-specific — consider a configurable data directory.
data = pd.read_csv('/content/combined_emails_with_natural_pii.csv')

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,email,type
0,Subject: Unvorhergesehener Absturz der Datenan...,Incident
1,Subject: Customer Support Inquiry\n\nSeeking i...,Request
2,Subject: Data Analytics for Investment\n\nI am...,Request
3,Subject: Krankenhaus-Dienstleistung-Problem\n\...,Incident
4,"Subject: Security\n\nDear Customer Support, I ...",Request


In [None]:
# Summarise column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   24000 non-null  object
 1   type    24000 non-null  object
dtypes: object(2)
memory usage: 375.1+ KB


In [None]:
# Count missing values per column.
data.isnull().sum()

Unnamed: 0,0
email,0
type,0


In [None]:
# Inspect the raw email stored under index label 1.
# NOTE(review): the rendered output contains unmasked personal data — clear it before sharing the notebook.
data['email'][1]

'Subject: Customer Support Inquiry\n\nSeeking information on digital strategies that can aid in brand growth and details on the available services. Looking forward to learning more to help our business grow My name is Elena Ivanova.. Thank you, and I look forward to hearing from you soon. You can reach me at fatima.farsi@help.com.'

In [None]:
# Preview the frame again.
# NOTE(review): nothing has modified `data` since the previous preview, so this cell is redundant.
data.head()

Unnamed: 0,email,type
0,Subject: Unvorhergesehener Absturz der Datenan...,Incident
1,Subject: Customer Support Inquiry\n\nSeeking i...,Request
2,Subject: Data Analytics for Investment\n\nI am...,Request
3,Subject: Krankenhaus-Dienstleistung-Problem\n\...,Incident
4,"Subject: Security\n\nDear Customer Support, I ...",Request


In [None]:
# Dependencies for PII masking and text clean-up.
# NOTE(review): pandas, re and nltk are already imported in the first cell; re-importing is harmless but redundant.
import pandas as pd
import spacy
import re
import nltk
# Download the NLTK corpora needed by the stop-word list and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load the spaCy "en_core_web_sm" pipeline; mask_pii_spacy reads its named entities.
nlp = spacy.load("en_core_web_sm")

# 1. Define the PII masking function
# Regex detectors for structured PII, keyed by the placeholder label that
# replaces each match. Declaration order breaks ties between equally long
# overlapping matches.
_PII_REGEX_PATTERNS = {
    "email": re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
    "phone_number": re.compile(r'\b(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})\b'),
    "credit_debit_no": re.compile(r'\b(?:\d[ -]*?){13,16}\b'),
    "aadhar_num": re.compile(r'\d{4}\s\d{4}\s\d{4}'),
    "cvv_no": re.compile(r'\b\d{3,4}\b'),
    "expiry_no": re.compile(r'\b(0[1-9]|1[0-2])/?([0-9]{2})\b'),
    "dob": re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),
}

# spaCy NER labels treated as PII, mapped to their placeholder label.
_SPACY_ENTITY_MAPPING = {
    "PERSON": "full_name",
    "ORG": "organization",
    "GPE": "location",
    "DATE": "dob",
}


def _overlaps_any(start, end, spans):
    """Return True if the half-open range [start, end) intersects any (start, end, label) span."""
    return any(start < other_end and other_start < end
               for other_start, other_end, _ in spans)


def mask_pii_spacy(text, nlp_model=None):
    """Replace PII in ``text`` with ``[label]`` placeholders.

    Detection runs in two passes over the ORIGINAL text: the regex detectors
    in ``_PII_REGEX_PATTERNS`` first, then the named entities of a spaCy
    pipeline (labels listed in ``_SPACY_ENTITY_MAPPING``). All spans are
    collected before anything is replaced, so no replacement can shift the
    offsets of another one.

    Overlap rules:
      * among regex matches the longest match wins; equal lengths fall back
        to the declaration order of the patterns;
      * a named entity is dropped when it overlaps an accepted regex match
        or an earlier accepted entity.

    Args:
        text: The raw text to mask. ``None`` and NaN are treated as missing
            and yield ``("", [])``.
        nlp_model: Optional callable returning an object with an ``ents``
            iterable whose items expose ``label_``, ``start_char`` and
            ``end_char``. Defaults to the module-level ``nlp`` pipeline.

    Returns:
        A ``(masked_text, masked_entities)`` tuple. ``masked_entities`` is a
        list of dicts ordered by position, each with the keys ``"entity"``
        (the original substring), ``"classification"`` (the placeholder
        label) and ``"position"`` (``[start, end]`` offsets into the original
        ``text``, end exclusive).
    """
    # Missing cells (None / NaN) carry no text to mask.
    if text is None or (isinstance(text, float) and text != text):
        return "", []
    if text == "":
        return "", []

    # Pass 1: gather every regex match against the untouched input.
    candidates = []
    for priority, (label, pattern) in enumerate(_PII_REGEX_PATTERNS.items()):
        for match in pattern.finditer(text):
            start, end = match.span()
            if end > start:
                candidates.append((start, end, label, priority))

    # Longest match first so e.g. a full date beats the 4-digit number inside
    # it; ties resolved by pattern order, then by position.
    candidates.sort(key=lambda c: (-(c[1] - c[0]), c[3], c[0]))

    accepted = []
    for start, end, label, _ in candidates:
        if not _overlaps_any(start, end, accepted):
            accepted.append((start, end, label))

    # Pass 2: named entities, also located on the original text.
    pipeline = nlp_model if nlp_model is not None else nlp
    for ent in pipeline(text).ents:
        label = _SPACY_ENTITY_MAPPING.get(ent.label_)
        if label is None:
            continue
        start, end = ent.start_char, ent.end_char
        if end > start and not _overlaps_any(start, end, accepted):
            accepted.append((start, end, label))

    # Rebuild the text in a single left-to-right sweep over the sorted,
    # non-overlapping spans.
    accepted.sort()
    pieces = []
    masked_entities = []
    cursor = 0
    for start, end, label in accepted:
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")
        masked_entities.append({
            "entity": text[start:end],
            "classification": label,
            "position": [start, end]
        })
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), masked_entities

# 2. Define the text preprocessing function
def text_preprocessing(data, text_column='masked_text', stop_words=None, lemmatizer=None):
    """Normalise the text in ``data[text_column]`` while keeping mask tokens.

    Each cell is lower-cased and split around ``[label]`` mask tokens (square
    brackets enclosing lowercase letters/underscores, e.g. ``[email]``). The
    text between tokens is stripped of every character other than ``a-z`` and
    whitespace, filtered against the stop-word list, reduced to words longer
    than one character and lemmatized. Mask tokens are emitted unchanged as
    separate words, so they survive the character filter.

    Args:
        data: DataFrame holding the column to clean. It is modified in place
            (the column is overwritten) and also returned.
        text_column: Name of the column to clean.
        stop_words: Optional iterable of words to drop. Defaults to NLTK's
            English stop-word list.
        lemmatizer: Optional object with a ``lemmatize(word)`` method.
            Defaults to ``WordNetLemmatizer()``.

    Returns:
        The same DataFrame with ``text_column`` replaced by the cleaned text.
        Non-string cells (e.g. NaN) become the empty string.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # The capture group makes re.split keep the tokens: odd indices of the
    # result are mask tokens, even indices are the free text between them.
    mask_token_re = re.compile(r'(\[[a-z_]+\])')

    def clean_text(text):
        if not isinstance(text, str):
            return ""

        words = []
        for index, segment in enumerate(mask_token_re.split(text.lower())):
            if index % 2 == 1:
                # Mask token: keep verbatim, never filtered or lemmatized.
                words.append(segment)
                continue
            # Free text: drop digits/punctuation, then filter and lemmatize.
            segment = re.sub(r'[^a-z\s]', '', segment)
            words.extend(
                lemmatizer.lemmatize(word)
                for word in segment.split()
                if word not in stop_words and len(word) > 1
            )
        return " ".join(words)

    data[text_column] = data[text_column].apply(clean_text)
    return data

# 4. Apply PII Masking
def apply_masking(row):
    """Mask the PII found in one row's ``email`` field.

    Returns a two-element ``pd.Series``: the masked text first, then the
    list of entity records reported by ``mask_pii_spacy``.
    """
    text_out, entities_out = mask_pii_spacy(row["email"])
    masked_pair = [text_out, entities_out]
    return pd.Series(masked_pair)

# Mask every email row by row; each row yields the masked text and the list of detected entities.
data[["masked_text", "masked_entities"]] = data.apply(apply_masking, axis=1)

# 5. Apply Text Preprocessing on masked text
# (overwrites `masked_text` with its cleaned, lemmatized form)
data = text_preprocessing(data, text_column='masked_text')

# 6. Final Data
# Show a short preview instead of print(data): printing dumps the whole frame
# as plain text, including the raw emails and the extracted PII entities.
data.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                                   email      type  \
0      Subject: Unvorhergesehener Absturz der Datenan...  Incident   
1      Subject: Customer Support Inquiry\n\nSeeking i...   Request   
2      Subject: Data Analytics for Investment\n\nI am...   Request   
3      Subject: Krankenhaus-Dienstleistung-Problem\n\...  Incident   
4      Subject: Security\n\nDear Customer Support, I ...   Request   
...                                                  ...       ...   
23995  Subject: Problem mit der HP DeskJet 3755 WLAN-...   Problem   
23996  Subject: Problemas com a impressora HP DeskJet...  Incident   
23997  Subject: Problema urgente con el envío\n\nEsti...  Incident   
23998  Subject: \n\nCher Service Client, nous rencont...  Incident   
23999  Subject: Overcharge on yearly Microsoft Office...   Problem   

                                             masked_text  \
0      subject unvorhergesehener attform brach unerwa...   
1      subject customer support inquiry

In [None]:
# Show the raw email at index label 1 again; the masking step writes to new columns and leaves `email` untouched.
# NOTE(review): the rendered output contains unmasked personal data — clear it before sharing the notebook.
data['email'][1]

'Subject: Customer Support Inquiry\n\nSeeking information on digital strategies that can aid in brand growth and details on the available services. Looking forward to learning more to help our business grow My name is Elena Ivanova.. Thank you, and I look forward to hearing from you soon. You can reach me at fatima.farsi@help.com.'

In [None]:
# List the columns now present on the frame.
print(data.columns)

Index(['email', 'type', 'masked_text', 'masked_entities'], dtype='object')


In [None]:
# Preview the frame including the masked_text / masked_entities columns.
data.head()

Unnamed: 0,email,type,masked_text,masked_entities
0,Subject: Unvorhergesehener Absturz der Datenan...,Incident,subject unvorhergesehener attform brach unerwa...,"[{'entity': 'janesmith@company.com', 'classifi..."
1,Subject: Customer Support Inquiry\n\nSeeking i...,Request,subject customer support inquiry seeking infor...,"[{'entity': 'fatima.farsi@help.com', 'classifi..."
2,Subject: Data Analytics for Investment\n\nI am...,Request,subject contacting request information data an...,"[{'entity': 'liuwei@business.cn', 'classificat..."
3,Subject: Krankenhaus-Dienstleistung-Problem\n\...,Incident,subject ein mediendatensperrbten zugriffes auf...,"[{'entity': 'fatima.farsi@help.com', 'classifi..."
4,"Subject: Security\n\nDear Customer Support, I ...",Request,subject security dear customer support reachin...,"[{'entity': 'fatima.farsi@help.com', 'classifi..."


In [None]:
# Inspect the processed text stored for the email at index label 1.
data['masked_text'][1]

'subject customer support inquiry seeking information digital strategy aid brand growth detail available service looking forward learning help business grow name thank look forward hearing soon reach'

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [None]:
def add_keyword_features(df, text_column="masked_text"):
    """Add binary keyword-indicator columns derived from ``df[text_column]``.

    Three integer (0/1) columns are written: ``has_request_words``,
    ``has_problem_words`` and ``has_change_words``. A keyword counts as
    present when it occurs at the START of a word, case-insensitively, so
    inflections such as "issues" or "updated" match while embedded
    occurrences such as "terror" (error) or "exchange" (change) do not.
    Multi-word keywords tolerate any run of whitespace between their words.

    Args:
        df: DataFrame to annotate. It is modified in place and returned.
        text_column: Column holding the text to scan. Non-string cells
            (e.g. NaN) produce 0 for every indicator.

    Returns:
        The same DataFrame with the three indicator columns added.
    """
    keyword_groups = {
        'has_request_words': ["please", "kindly", "request", "could you", "would you"],
        'has_problem_words': ["issue", "error", "problem", "failed", "unable"],
        'has_change_words': ["change", "update", "modify", "replace", "upgrade"],
    }

    def build_matcher(keywords):
        # One alternation per group; \b anchors each keyword at a word start.
        alternatives = [
            r'\s+'.join(re.escape(part) for part in keyword.split())
            for keyword in keywords
        ]
        return re.compile(r'\b(?:' + '|'.join(alternatives) + r')', re.IGNORECASE)

    for feature_name, keywords in keyword_groups.items():
        matcher = build_matcher(keywords)
        df[feature_name] = df[text_column].apply(
            lambda text, matcher=matcher: int(
                isinstance(text, str) and matcher.search(text) is not None
            )
        )

    return df


In [None]:
from sentence_transformers import SentenceTransformer

# Load the 'all-MiniLM-L6-v2' sentence-transformer.
# predict_single_email hard-codes the same model name, so keep the two in sync.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the processed masked_text column into sentence embeddings, 32 texts per batch;
# these embeddings are the classifier's input features.
email_embeddings = sbert_model.encode(data['masked_text'].tolist(), batch_size=32, show_progress_bar=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/750 [00:00<?, ?it/s]

In [None]:
# Feature matrix for the classifier: the sentence embeddings are used as-is.
x_final = email_embeddings

In [None]:
# Preview the frame before training.
data.head()

Unnamed: 0,email,type,masked_text,masked_entities
0,Subject: Unvorhergesehener Absturz der Datenan...,Incident,subject unvorhergesehener attform brach unerwa...,"[{'entity': 'janesmith@company.com', 'classifi..."
1,Subject: Customer Support Inquiry\n\nSeeking i...,Request,subject customer support inquiry seeking infor...,"[{'entity': 'fatima.farsi@help.com', 'classifi..."
2,Subject: Data Analytics for Investment\n\nI am...,Request,subject contacting request information data an...,"[{'entity': 'liuwei@business.cn', 'classificat..."
3,Subject: Krankenhaus-Dienstleistung-Problem\n\...,Incident,subject ein mediendatensperrbten zugriffes auf...,"[{'entity': 'fatima.farsi@help.com', 'classifi..."
4,"Subject: Security\n\nDear Customer Support, I ...",Request,subject security dear customer support reachin...,"[{'entity': 'fatima.farsi@help.com', 'classifi..."


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,precision_score,accuracy_score
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# Target variable: integer-encoded categories. The encoding is kept in `y`
# instead of overwriting data['type'], because replacing the column in place
# made this cell non-idempotent: a second run re-fitted the encoder on the
# integers, so label_encoder.inverse_transform() returned numbers, not names.
y = label_encoder.fit_transform(data['type'])

# Train-test split (80/20, stratified on the label, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x_final, y, test_size=0.2, random_state=42, stratify=y)

# Model
clf = LogisticRegression(max_iter=2000)
clf.fit(x_train, y_train)

# Predictions
y_pred = clf.predict(x_test)

# Evaluation
#print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision}")
print(f"accuracy: {accuracy}")

Precision: 0.6889673322363742
accuracy: 0.7041666666666667


In [None]:
# Loaded SentenceTransformer instances keyed by model name, so repeated
# predictions reuse the weights instead of reloading them on every call.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name='all-MiniLM-L6-v2'):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        from sentence_transformers import SentenceTransformer
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def predict_single_email(email_text, model, label_encoder, sentence_model=None):
    """Classify one raw email with the same pipeline used for training.

    The email is wrapped in a one-row DataFrame, PII-masked with
    ``apply_masking``, cleaned with ``text_preprocessing``, embedded with a
    sentence-transformer and passed to ``model.predict``.

    Args:
        email_text: Raw email text.
        model: Fitted classifier exposing ``predict``.
        label_encoder: Fitted encoder exposing ``inverse_transform``; used to
            turn the predicted class id back into its label.
        sentence_model: Optional embedding model exposing ``encode``. When
            omitted, a cached 'all-MiniLM-L6-v2' SentenceTransformer is used;
            it is loaded once and reused by later calls.

    Returns:
        ``{"predicted_category": <decoded label>}``.
    """
    # One-row frame so the row-wise masking helper can be reused unchanged.
    email_df = pd.DataFrame({'email': [email_text]})

    # 1. Apply PII Masking
    email_df[["masked_text", "masked_entities"]] = email_df.apply(apply_masking, axis=1)

    # 2. Text Preprocessing
    email_df = text_preprocessing(email_df, text_column="masked_text")

    # 3. Sentence embedding (must use the same model as the training features)
    encoder = sentence_model if sentence_model is not None else _get_sentence_model()
    final_features = encoder.encode(email_df["masked_text"].tolist())

    # 4. Prediction
    y_pred = model.predict(final_features)

    # 5. Decode label
    predicted_label = label_encoder.inverse_transform(y_pred)[0]

    return {
        "predicted_category": predicted_label
    }

In [None]:
# Sample email for a smoke test of the single-email prediction path;
# it contains an email address, a phone number and a personal name.
email_text = """Request to Update My Contact Information

Hi,

I recently changed my phone number and email address. Could you please update my records with the following details?

New Email: john.doe@example.com
New Phone Number: +1 (555) 123-4567

Let me know if you need anything else from my side.

Thanks,
John Doe
"""

# Run the sample through predict_single_email with the trained classifier and fitted label encoder.
result = predict_single_email(email_text, model=clf, label_encoder=label_encoder)

print(result)


{'predicted_category': 'Change'}


In [None]:
import pickle

# Persist the fitted artifacts, one pickle file per object:
# the trained classifier first, then the label encoder.
for artifact_path, artifact in (
    ('email_classifier_model.pkl', clf),
    ('label_encoder.pkl', label_encoder),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("✅ Model and Label Encoder saved successfully!")


✅ Model and Label Encoder saved successfully!
