In [1]:
!pip install pandas scikit-learn




In [2]:
import pandas as pd
from io import StringIO

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


In [3]:
csv_data = """email_id,customer_id,subject,body,tag
1,CUST_A,"Unable to access shared mailbox","Hi team, I'm unable to access the shared
mailbox for our support team. It keeps showing a permissions error. Can you please
check?","access_issue"
2,CUST_A,"Rules not working","We created a rule to auto-assign emails based on subject line
but it stopped working since yesterday.","workflow_issue"
3,CUST_A,"Email stuck in pending","One of our emails is stuck in pending even after marking it
resolved. Not sure what’s happening.","status_bug"
4,CUST_B,"Automation creating duplicate tasks","Your automation engine is creating 2 tasks for
every email. This started after we edited our workflow.","automation_bug"
5,CUST_B,"Tags missing","Many of our tags are not appearing for new emails. Looks like the
tagging model is not working for us.","tagging_issue"
6,CUST_B,"Billing query","We were charged incorrectly this month. Need a corrected
invoice.","billing"
7,CUST_C,"CSAT not visible","CSAT scores disappeared from our dashboard today. Is there an
outage?","analytics_issue"
8,CUST_C,"Delay in email loading","Opening a conversation takes 8–10 seconds. This is
affecting our productivity.","performance"
9,CUST_C,"Need help setting up SLAs","We want to configure SLAs for different customer tiers.
Can someone guide us?","setup_help"
10,CUST_D,"Mail merge failing","Mail merge is not sending emails even though the CSV is
correct.","mail_merge_issue"
11,CUST_D,"Can't add new user","Trying to add a new team member but getting an
'authorization required' error.","user_management"
12,CUST_D,"Feature request: Dark mode","Dark mode would help during late-night support
hours. Please consider this.","feature_request"
"""

df = pd.read_csv(StringIO(csv_data))
df["text"] = df["subject"].fillna("") + " " + df["body"].fillna("")
df


Unnamed: 0,email_id,customer_id,subject,body,tag,text
0,1,CUST_A,Unable to access shared mailbox,"Hi team, I'm unable to access the shared \nmai...",access_issue,"Unable to access shared mailbox Hi team, I'm u..."
1,2,CUST_A,Rules not working,We created a rule to auto-assign emails based ...,workflow_issue,Rules not working We created a rule to auto-as...
2,3,CUST_A,Email stuck in pending,One of our emails is stuck in pending even aft...,status_bug,Email stuck in pending One of our emails is st...
3,4,CUST_B,Automation creating duplicate tasks,Your automation engine is creating 2 tasks for...,automation_bug,Automation creating duplicate tasks Your autom...
4,5,CUST_B,Tags missing,Many of our tags are not appearing for new ema...,tagging_issue,Tags missing Many of our tags are not appearin...
5,6,CUST_B,Billing query,We were charged incorrectly this month. Need a...,billing,Billing query We were charged incorrectly this...
6,7,CUST_C,CSAT not visible,CSAT scores disappeared from our dashboard tod...,analytics_issue,CSAT not visible CSAT scores disappeared from ...
7,8,CUST_C,Delay in email loading,Opening a conversation takes 8–10 seconds. Thi...,performance,Delay in email loading Opening a conversation ...
8,9,CUST_C,Need help setting up SLAs,We want to configure SLAs for different custom...,setup_help,Need help setting up SLAs We want to configure...
9,10,CUST_D,Mail merge failing,Mail merge is not sending emails even though t...,mail_merge_issue,Mail merge failing Mail merge is not sending e...


In [4]:
models = {}  # customer_id -> sklearn pipeline

for cust_id, df_cust in df.groupby("customer_id"):
    X = df_cust["text"]
    y = df_cust["tag"]

    # TF-IDF + Naive Bayes baseline
    model = make_pipeline(
        TfidfVectorizer(),
        MultinomialNB()
    )
    model.fit(X, y)
    models[cust_id] = model

models.keys()


dict_keys(['CUST_A', 'CUST_B', 'CUST_C', 'CUST_D'])

In [5]:
def predict_tag(customer_id, subject, body):
    text = (subject or "") + " " + (body or "")

    if customer_id not in models:
        raise ValueError(f"No model available for customer_id={customer_id}")

    model = models[customer_id]
    pred = model.predict([text])[0]

    # Optional confidence
    try:
        proba = model.predict_proba([text]).max()
    except AttributeError:
        proba = None

    return pred, proba


In [6]:
pred, proba = predict_tag(
    "CUST_B",
    "Invoice problem",
    "Hello, we were overcharged on this month’s invoice, please help fix billing."
)
pred, proba


(np.str_('billing'), np.float64(0.47361206083647406))

In [7]:
PATTERN_RULES = {
    "access_issue": ["access", "permission", "permissions"],
    "workflow_issue": ["rule", "auto-assign", "workflow"],
    "status_bug": ["pending", "resolved"],

    "automation_bug": ["automation", "duplicate", "2 tasks"],
    "tagging_issue": ["tag", "tags", "tagging"],
    "billing": ["invoice", "charged", "billing"],

    "analytics_issue": ["CSAT", "dashboard", "scores"],
    "performance": ["delay", "seconds", "loading", "slow"],
    "setup_help": ["SLA", "SLAs", "configure", "setup"],

    "mail_merge_issue": ["mail merge", "CSV"],
    "user_management": ["add new user", "team member", "authorization"],
    "feature_request": ["feature request", "would help", "please consider"]
}


In [8]:
import re

def apply_pattern_rules(text, customer_tags):
    """
    text: full email text
    customer_tags: allowed tags for this customer_id
    """
    text_lower = text.lower()

    for tag, keywords in PATTERN_RULES.items():
        if tag not in customer_tags:
            continue
        for kw in keywords:
            # simple contains match (case insensitive)
            if kw.lower() in text_lower:
                return tag
    return None


In [9]:
def predict_tag_with_rules(customer_id, subject, body):
    text = (subject or "") + " " + (body or "")

    if customer_id not in models:
        raise ValueError(f"No model available for customer_id={customer_id}")

    # Allowed tags for this customer
    customer_tags = set(df[df["customer_id"] == customer_id]["tag"].unique())

    # 1️⃣ Try rule-based pattern first
    rule_tag = apply_pattern_rules(text, customer_tags)
    if rule_tag is not None:
        return rule_tag, "rule_based"

    # 2️⃣ Fall back to ML
    model = models[customer_id]
    pred = model.predict([text])[0]
    return pred, "ml_model"


In [10]:
CUSTOM_STOP_WORDS = ["issue", "problem", "help", "error", "support", "email"]

models = {}  # rebuild models with new vectorizer

for cust_id, df_cust in df.groupby("customer_id"):
    X = df_cust["text"]
    y = df_cust["tag"]

    model = make_pipeline(
        TfidfVectorizer(stop_words=CUSTOM_STOP_WORDS),
        MultinomialNB()
    )
    model.fit(X, y)
    models[cust_id] = model


In [11]:
def predict_tag_with_guardrails(customer_id, subject, body, confidence_threshold=0.5):
    text = (subject or "") + " " + (body or "")

    if customer_id not in models:
        raise ValueError(f"No model available for customer_id={customer_id}")

    customer_tags = set(df[df["customer_id"] == customer_id]["tag"].unique())

    # 1) Pattern rules
    rule_tag = apply_pattern_rules(text, customer_tags)
    if rule_tag is not None:
        return rule_tag, "rule_based"

    # 2) ML model
    model = models[customer_id]
    pred = model.predict([text])[0]

    # 3) Confidence check
    try:
        proba = model.predict_proba([text]).max()
    except AttributeError:
        proba = None

    if proba is not None and proba < confidence_threshold:
        return "needs_manual_review", f"low_confidence({proba:.2f})"

    return pred, f"ml_model_conf({proba:.2f})" if proba is not None else "ml_model"


In [12]:
from sklearn.metrics import classification_report

def evaluate_per_customer():
    for cust_id, df_cust in df.groupby("customer_id"):
        print("Customer:", cust_id)
        y_true = []
        y_pred = []

        rows = df_cust.reset_index(drop=True)
        for i in range(len(rows)):
            # train on all except i
            train = rows.drop(index=i)
            test_row = rows.iloc[i]

            model = make_pipeline(
                TfidfVectorizer(stop_words=CUSTOM_STOP_WORDS),
                MultinomialNB()
            )
            model.fit(train["text"], train["tag"])

            pred = model.predict([test_row["text"]])[0]
            y_true.append(test_row["tag"])
            y_pred.append(pred)

        print(classification_report(y_true, y_pred))
        print("-" * 40)

evaluate_per_customer()


Customer: CUST_A
                precision    recall  f1-score   support

  access_issue       0.00      0.00      0.00       1.0
    status_bug       0.00      0.00      0.00       1.0
workflow_issue       0.00      0.00      0.00       1.0

      accuracy                           0.00       3.0
     macro avg       0.00      0.00      0.00       3.0
  weighted avg       0.00      0.00      0.00       3.0

----------------------------------------
Customer: CUST_B
                precision    recall  f1-score   support

automation_bug       0.00      0.00      0.00       1.0
       billing       0.00      0.00      0.00       1.0
 tagging_issue       0.00      0.00      0.00       1.0

      accuracy                           0.00       3.0
     macro avg       0.00      0.00      0.00       3.0
  weighted avg       0.00      0.00      0.00       3.0

----------------------------------------
Customer: CUST_C
                 precision    recall  f1-score   support

analytics_issue    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
def debug_predictions():
    for _, row in df.iterrows():
        pred, info = predict_tag_with_guardrails(
            row["customer_id"],
            row["subject"],
            row["body"]
        )
        print(f"Email {row['email_id']} | Customer = {row['customer_id']}")
        print("Text:", row["text"])
        print("True tag:", row["tag"], " | Predicted:", pred, " | Info:", info)
        print("-" * 80)

debug_predictions()


Email 1 | Customer = CUST_A
Text: Unable to access shared mailbox Hi team, I'm unable to access the shared 
mailbox for our support team. It keeps showing a permissions error. Can you please 
check?
True tag: access_issue  | Predicted: access_issue  | Info: rule_based
--------------------------------------------------------------------------------
Email 2 | Customer = CUST_A
Text: Rules not working We created a rule to auto-assign emails based on subject line 
but it stopped working since yesterday.
True tag: workflow_issue  | Predicted: workflow_issue  | Info: rule_based
--------------------------------------------------------------------------------
Email 3 | Customer = CUST_A
Text: Email stuck in pending One of our emails is stuck in pending even after marking it 
resolved. Not sure what’s happening.
True tag: status_bug  | Predicted: status_bug  | Info: rule_based
--------------------------------------------------------------------------------
Email 4 | Customer = CUST_B
Text: Auto