# Part A — Email Tagging Mini-System

## 1. Load Dataset

In [1]:
import re, json
from collections import defaultdict

SMALL_DATA = [
    (1, 'CUST_A', 'Unable to access shared mailbox',
     "Hi team, I'm unable to access the shared mailbox. It shows permissions error.", 'access_issue'),
    (2, 'CUST_A', 'Rules not working', 'Our auto-assignment rule stopped working.', 'workflow_issue'),
    (3, 'CUST_A', 'Email stuck in pending', 'Email stuck in pending.', 'status_bug'),
    (4, 'CUST_B', 'Automation creating duplicate tasks', 'Automation creates duplicate tasks.', 'automation_bug'),
    (5, 'CUST_B', 'Tags missing', 'Tags are not appearing.', 'tagging_issue'),
    (6, 'CUST_B', 'Billing query', 'Incorrect billing this month.', 'billing'),
    (7, 'CUST_C', 'CSAT not visible', 'CSAT disappeared from dashboard.', 'analytics_issue'),
    (8, 'CUST_C', 'Delay in email loading', 'Email loading takes 8-10 secs.', 'performance'),
    (9, 'CUST_C', 'Need help setting SLAs', 'Need help configuring SLAs.', 'setup_help'),
    (10, 'CUST_D', 'Mail merge failing', 'Mail merge not sending.', 'mail_merge_issue'),
    (11, 'CUST_D', "Can't add new user", 'Authorization error adding user.', 'user_management'),
    (12, 'CUST_D', 'Feature request: Dark mode', 'Dark mode requested.', 'feature_request'),
]
SMALL_DATA

[(1,
  'CUST_A',
  'Unable to access shared mailbox',
  "Hi team, I'm unable to access the shared mailbox. It shows permissions error.",
  'access_issue'),
 (2,
  'CUST_A',
  'Rules not working',
  'Our auto-assignment rule stopped working.',
  'workflow_issue'),
 (3,
  'CUST_A',
  'Email stuck in pending',
  'Email stuck in pending.',
  'status_bug'),
 (4,
  'CUST_B',
  'Automation creating duplicate tasks',
  'Automation creates duplicate tasks.',
  'automation_bug'),
 (5, 'CUST_B', 'Tags missing', 'Tags are not appearing.', 'tagging_issue'),
 (6, 'CUST_B', 'Billing query', 'Incorrect billing this month.', 'billing'),
 (7,
  'CUST_C',
  'CSAT not visible',
  'CSAT disappeared from dashboard.',
  'analytics_issue'),
 (8,
  'CUST_C',
  'Delay in email loading',
  'Email loading takes 8-10 secs.',
  'performance'),
 (9,
  'CUST_C',
  'Need help setting SLAs',
  'Need help configuring SLAs.',
  'setup_help'),
 (10,
  'CUST_D',
  'Mail merge failing',
  'Mail merge not sending.',
  'mail_

## 2. Preprocessing

In [2]:
import re
from collections import defaultdict

def normalize_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^a-z0-9 '?!.,-]", " ", s)
    return s.strip()

def combine_subject_body(subj, body):
    return f"{subj}. {body}"

rows = []
for eid, cid, subj, body, tag in SMALL_DATA:
    rows.append({'email_id': eid, 'customer_id': cid, 'text': normalize_text(combine_subject_body(subj, body)), 'tag': tag})

cust_examples = defaultdict(list)
for r in rows:
    cust_examples[r['customer_id']].append(r)

rows[:3]

[{'email_id': 1,
  'customer_id': 'CUST_A',
  'text': "unable to access shared mailbox. hi team, i'm unable to access the shared mailbox. it shows permissions error.",
  'tag': 'access_issue'},
 {'email_id': 2,
  'customer_id': 'CUST_A',
  'text': 'rules not working. our auto-assignment rule stopped working.',
  'tag': 'workflow_issue'},
 {'email_id': 3,
  'customer_id': 'CUST_A',
  'text': 'email stuck in pending. email stuck in pending.',
  'tag': 'status_bug'}]

## 3. TF-IDF + Logistic Regression Classifier

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import numpy as np

corpus = [r['text'] for r in rows]
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000)
X = vectorizer.fit_transform(corpus)

le = LabelEncoder()
y = le.fit_transform([r['tag'] for r in rows])

model = LogisticRegression(max_iter=1000)
model.fit(X, y)

list(le.classes_)

## 4. Customer Isolation

In [None]:
customer_tags = defaultdict(set)
for r in rows:
    customer_tags[r['customer_id']].add(r['tag'])
customer_tags

defaultdict(set,
            {'CUST_A': {'access_issue', 'status_bug', 'workflow_issue'},
             'CUST_B': {'automation_bug', 'billing', 'tagging_issue'},
             'CUST_C': {'analytics_issue', 'performance', 'setup_help'},
             'CUST_D': {'feature_request',
              'mail_merge_issue',
              'user_management'}})

# 5. Pattern Guardrails -
*this acts as a safety measure that overrides or correct the ML model when it is likely to make a mistake.*

In [None]:
PATTERNS = {
    'CUST_A': [(r'access|permission denied', 'access_issue')],
    'CUST_B': [(r'duplicate|automation', 'automation_bug')],
    'CUST_C': [(r'csat', 'analytics_issue')],
    'CUST_D': [(r'dark mode', 'feature_request')],
}

def apply_patterns(customer_id, text):
    for pat, tag in PATTERNS.get(customer_id, []):
        if re.search(pat, text):
            return tag
    return None

def predict(customer_id, text):
    text = normalize_text(text)
    # patterns
    for pat, tag in PATTERNS.get(customer_id, []):
        if re.search(pat, text):
            return tag
    # model
    vec = vectorizer.transform([text])
    probs = model.predict_proba(vec)[0]
    classes = le.inverse_transform(np.arange(len(probs)))
    allowed = customer_tags[customer_id]
    masked = [(c,p) for c,p in zip(classes, probs) if c in allowed]
    if masked:
        return max(masked, key=lambda t: t[1])[0]
    return classes[probs.argmax()]

predict('CUST_A', 'cannot access mailbox')

'access_issue'

## 6. Leave-One-Out Evaluation

In [None]:
correct = 0
total = len(rows)

for r in rows:
    print(r["email_id"], "→", r["text"])
    temp = [x for x in rows if x['email_id'] != r['email_id']]

    vec = TfidfVectorizer(ngram_range=(1,2), max_features=1000)
    Xtr = vec.fit_transform([x['text'] for x in temp])

    le2 = LabelEncoder()
    le2.fit([x['tag'] for x in temp])

    model2 = LogisticRegression(max_iter=500)
    model2.fit(Xtr, le2.transform([x['tag'] for x in temp]))

    # Allowed tags for this customer
    cust = r["customer_id"]
    allowed = {x["tag"] for x in temp if x["customer_id"] == cust}

    # Apply patterns first
    text = r["text"]
    pattern_hit = None
    for pat, tag in PATTERNS.get(cust, []):
        if re.search(pat, text):
            pattern_hit = tag
            break
    print(r["email_id"], pattern_hit)
    if pattern_hit:
        pred = pattern_hit
    else:
        # ML fallback with isolation
        xt = vec.transform([text])
        probs = model2.predict_proba(xt)[0]
        classes = le2.inverse_transform(np.arange(len(probs)))

        masked = [(c,p) for c,p in zip(classes, probs) if c in allowed]
        pred = max(masked, key=lambda t:t[1])[0] if masked else classes[probs.argmax()]

    if pred == r["tag"]:
        correct += 1

correct, correct/total


1 → unable to access shared mailbox. hi team, i'm unable to access the shared mailbox. it shows permissions error.
1 access_issue
2 → rules not working. our auto-assignment rule stopped working.
2 None
3 → email stuck in pending. email stuck in pending.
3 None
4 → automation creating duplicate tasks. automation creates duplicate tasks.
4 automation_bug
5 → tags missing. tags are not appearing.
5 None
6 → billing query. incorrect billing this month.
6 None
7 → csat not visible. csat disappeared from dashboard.
7 analytics_issue
8 → delay in email loading. email loading takes 8-10 secs.
8 None
9 → need help setting slas. need help configuring slas.
9 None
10 → mail merge failing. mail merge not sending.
10 None
11 → can't add new user. authorization error adding user.
11 None
12 → feature request  dark mode. dark mode requested.
12 feature_request


(4, 0.3333333333333333)

In [None]:
customer_tags


defaultdict(set,
            {'CUST_A': {'access_issue', 'status_bug', 'workflow_issue'},
             'CUST_B': {'automation_bug', 'billing', 'tagging_issue'},
             'CUST_C': {'analytics_issue', 'performance', 'setup_help'},
             'CUST_D': {'feature_request',
              'mail_merge_issue',
              'user_management'}})

## 7. Sample Predictions

In [None]:
tests = [
    ('CUST_A', 'permission'),
    ('CUST_B', 'automation creates duplicate tasks'),
    ('CUST_C', 'csat not visible'),
    ('CUST_D', 'mail merge not working'),
]
[(c,t,predict(c,t)) for c,t in tests]

[('CUST_A', 'permission', np.str_('access_issue')),
 ('CUST_B', 'automation creates duplicate tasks', 'automation_bug'),
 ('CUST_C', 'csat not visible', 'analytics_issue'),
 ('CUST_D', 'mail merge not working', np.str_('mail_merge_issue'))]

## 8. Improvement Ideas
1. Embedding-based classifier. Why ?

  Ans: TF-IDF vocabulary becomes extremely unstable when training data is very small (as shown in Leave-One-Out evaluation where removing one example collapses the model)


2. Active learning.
3. Tag ontology.
4. Add confidence scoring + fallback path

# 9. Optional
Using LLM Prompt-based classifier

In [None]:
def build_prompt(customer_id, subject, body):
    allowed = ', '.join(sorted(customer_tags[customer_id]))

    prompt = f"""
You are a helpful assistant that classifies support emails for a single customer.

Customer: {customer_id}
Allowed tags: {allowed}

Classify the following email into EXACTLY ONE of the allowed tags.

Return ONLY valid JSON with keys:
{{
 "tag": "string",
 "confidence": float between 0 and 1,
 "reason": "short explanation"
}}

Email Subject: {subject}
Email Body: {body}
"""
    return prompt


In [None]:
from google.colab import userdata
api_key = userdata.get("GEMINI_API_KEY")

In [None]:
import google.generativeai as genai

genai.configure(api_key=api_key)

In [None]:
import google.generativeai as genai
import json, os

# Configure Gemini
if "GEMINI_API_KEY" not in os.environ:
    raise Exception("Please set GEMINI_API_KEY environment variable")

genai.configure(api_key=os.environ["GEMINI_API_KEY"])


def predict_llm(customer_id, subject, body, mock=False, model_name="gemini-2.5-flash"):
    text = normalize_text(combine_subject_body(subject, body))

    # pattern guardrails
    if mock:
        p = apply_patterns(customer_id, text)
        if p:
            return p, 0.95, "matched high-precision pattern (mock)"
        allowed = sorted(list(customer_tags.get(customer_id, [])))
        return (allowed[0], 0.2, "mock fallback to first allowed tag") if allowed else ("unknown", 0.0, "no allowed tags")

    #REAL GEMINI CALL
    try:
        prompt = build_prompt(customer_id, subject, body)

        model = genai.GenerativeModel(model_name)
        result = model.generate_content(prompt)

        raw = result.text.strip()
        raw = raw.replace("```json", "").replace("```", "").strip()

        j = json.loads(raw)

        return (
            j.get("tag", "unknown"),
            float(j.get("confidence", 0.0)),
            j.get("reason", "")
        )

    except Exception as e:
        return "error", 0.0, f"LLM call failed: {e}"


# Example:
print(predict_llm("CUST_A", "Unable to access shared mailbox", "Permission denied", mock=False))

('access_issue', 1.0, "The email subject 'Unable to access' and body 'Permission denied' directly indicate a problem with gaining access due to permissions.")
