In [2]:
# install libraries
!pip install -q transformers[sentencepiece] datasets tqdm
import re, json, os, pandas as pd, torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline


In [3]:
from google.colab import files
uploaded = files.upload()                # pick combined_emails_with_natural_pii.csv
DF_PATH = next(iter(uploaded))           # gets the file name
df = pd.read_csv(DF_PATH)


Saving combined_emails_with_natural_pii.csv to combined_emails_with_natural_pii.csv


In [5]:
print(df.columns)


Index(['email', 'type'], dtype='object')


In [6]:
# Peek at class distribution
df['type'].value_counts()


Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
Incident,9586
Request,6860
Problem,5037
Change,2517


In [7]:
# Using a compact, English NER model (no LLMs involved)
MODEL_NAME = "dslim/bert-base-NER"
tokner  = AutoTokenizer.from_pretrained(MODEL_NAME)
NERmodel = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
ner = pipeline("ner", model=NERmodel, tokenizer=tokner,
               aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [8]:
REGEX_PATTERNS = {
    "email"          : r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
    "phone_number"   : r"\b(?:\+?91[-\s]?)?[6-9]\d{9}\b",
    "dob"            : r"\b(?:0?[1-9]|[12][0-9]|3[01])[/\-](?:0?[1-9]|1[012])[/\-](?:\d{2,4})\b",
    "aadhar_num"     : r"\b\d{4}\s?\d{4}\s?\d{4}\b",
    "credit_debit_no": r"\b(?:\d[ -]*?){13,16}\b",
    "cvv_no"         : r"\b\d{3,4}\b",
    "expiry_no"      : r"\b(?:0?[1-9]|1[012])/?\d{2,4}\b",
}


In [9]:
def mask_with_offsets(text, spans):
    """Replace every (start,end,label) span with [label] and return new text & mapping."""
    masked, cursor = [], 0
    for start, end, label in sorted(spans, key=lambda x: x[0]):
        masked.append(text[cursor:start])
        masked.append(f"[{label}]")
        cursor = end
    masked.append(text[cursor:])
    return "".join(masked)


In [10]:
def pii_mask(text: str):
    spans = []   # (start, end, classification)

    # 1) Regex entities
    for label, pat in REGEX_PATTERNS.items():
        for m in re.finditer(pat, text):
            spans.append((m.start(), m.end(), label))

    # 2) BERT-NER for PERSON → full_name
    for ent in ner(text):
        if ent["entity_group"] == "PER":
            spans.append((ent["start"], ent["end"], "full_name"))

    # drop overlaps (keep first come, first served)
    spans_no_overlap = []
    occupied = set()
    for s in sorted(spans, key=lambda x: x[0]):
        if any(i in occupied for i in range(s[0], s[1])):   # skip if clashes
            continue
        spans_no_overlap.append(s)
        occupied.update(range(s[0], s[1]))

    # produce masked text
    masked_text = mask_with_offsets(text, spans_no_overlap)

    # build mapping list with original text
    mapping = [{"position":[s,e],
                "classification":lbl,
                "entity":text[s:e]} for s,e,lbl in spans_no_overlap]

    return masked_text, mapping


In [14]:
from tqdm.notebook import tqdm

masked_emails, entity_maps = [], []

for body in tqdm(df['email']):
    masked, mapping = pii_mask(body)
    masked_emails.append(masked)
    entity_maps.append(mapping)

df['masked_email'] = masked_emails
df['list_of_masked_entities'] = entity_maps


  0%|          | 0/24000 [00:00<?, ?it/s]

In [15]:
MASKED_PATH = "masked_emails_only.csv"
df.drop(columns=['email']).to_csv(MASKED_PATH, index=False)


In [16]:
DEMAP_PATH = "private_demap.jsonl"
with open(DEMAP_PATH, "w") as f:
    for body, mapping in zip(df['email'], entity_maps):
        f.write(json.dumps({"original_email": body,
                            "list_of_masked_entities": mapping}) + "\n")


In [17]:
i = 0   # any row index
print("ORIGINAL:\n", df.loc[i,'email'], "\n")
print("MASKED:\n", df.loc[i,'masked_email'], "\n")
print("ENTITIES:\n", json.dumps(df.loc[i,'list_of_masked_entities'], indent=2))


ORIGINAL:
 Subject: Unvorhergesehener Absturz der Datenanalyse-Plattform

Die Datenanalyse-Plattform brach unerwartet ab, da die Speicheroberfläche zu gering war My name is Sophia Rossi.. Ich habe versucht, Laravel 8 und meinen MacBook Pro neu zu starten, aber das Problem behält sich bei. Ich benötige Ihre Unterstützung, um diesen Fehler zu beheben. You can reach me at janesmith@company.com. 

MASKED:
 Subject: Unvorhergesehener Absturz der Datenanalyse-Plattform

Die Datenanalyse-Plattform brach unerwartet ab, da die Speicheroberfläche zu gering war My name is [full_name].. Ich habe versucht, [full_name]vel 8 und meinen MacBook Pro neu zu starten, aber das Problem behält sich bei. Ich benötige Ihre Unterstützung, um diesen Fehler zu beheben. You can reach me at [email] 

ENTITIES:
 [
  {
    "position": [
      162,
      174
    ],
    "classification": "full_name",
    "entity": "Sophia Rossi"
  },
  {
    "position": [
      196,
      200
    ],
    "classification": "full_name",


In [31]:
# Install just in case
!pip install -q scikit-learn pandas

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [33]:


from google.colab import drive
drive.mount('/content/drive')

# Load your saved masked dataset
df = pd.read_csv("/content/masked_emails_only.csv")  # update path if needed
df = df[['masked_email', 'type']].dropna()
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,masked_email,type
0,Subject: Unvorhergesehener Absturz der Datenan...,Incident
1,Subject: Customer Support Inquiry\n\nSeeking i...,Request
2,Subject: Data Analytics for Investment\n\nI am...,Request
3,Subject: Krankenhaus-Dienstleistung-Problem\n\...,Incident
4,"Subject: Security\n\nDear Customer Support, I ...",Request


In [34]:
from sklearn.model_selection import train_test_split

X = df['masked_email']
y = df['type']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf   = vectorizer.transform(X_val)


In [36]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)


In [37]:
from sklearn.metrics import classification_report

y_pred = rf_model.predict(X_val_tfidf)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

      Change       0.95      0.63      0.76       504
    Incident       0.65      0.97      0.78      1917
     Problem       0.77      0.13      0.22      1007
     Request       0.87      0.92      0.89      1372

    accuracy                           0.74      4800
   macro avg       0.81      0.66      0.66      4800
weighted avg       0.77      0.74      0.69      4800



In [38]:
import joblib

MODEL_DIR = "/content/drive/MyDrive/rf_email_classifier"
!mkdir -p $MODEL_DIR

joblib.dump(vectorizer, f"{MODEL_DIR}/vectorizer.pkl")
joblib.dump(rf_model,   f"{MODEL_DIR}/rf_model.pkl")


['/content/drive/MyDrive/rf_email_classifier/rf_model.pkl']