<a href="https://colab.research.google.com/github/Sushma-20112/ai/blob/main/Knowledge_Engine_for_Smart_Support_Ticket_Resolution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Run once
!pip install --quiet openai pandas nltk python-dotenv gspread oauth2client transformers torch


In [3]:
import os, re, json
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

# optional: increase pandas display if needed
pd.set_option('display.max_colwidth', 200)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from google.colab import files

print("If you have a CSV (tickets_sample.csv) click Choose files and upload it. If not, we'll create a sample automatically.")
uploaded = files.upload()

# If user uploaded file, pick the first CSV file; otherwise create a sample CSV
if uploaded:
    # find first uploaded file that ends with .csv
    csv_name = None
    for fn in uploaded.keys():
        if fn.lower().endswith('.csv'):
            csv_name = fn
            break
    if csv_name is None:
        raise ValueError("No CSV found among uploaded files. Please upload a CSV.")
else:
    # create an example CSV
    csv_name = "tickets_sample.csv"
    sample = [
        {"ticket_id":1,"title":"Login issue","description":"Unable to login with my registered email"},
        {"ticket_id":2,"title":"Payment failure","description":"Payment not processing for premium plan"},
        {"ticket_id":3,"title":"App crash","description":"App crashes after update"},
        {"ticket_id":4,"title":"Password reset","description":"Forgot my password and reset link not working"},
        {"ticket_id":5,"title":"Subscription query","description":"Need to change subscription plan"}
    ]
    pd.DataFrame(sample).to_csv(csv_name, index=False)
    print(f"Sample CSV created: {csv_name}")

print("Using file:", csv_name)
df = pd.read_csv(csv_name)
print("Loaded rows:", len(df))
df.head()


If you have a CSV (tickets_sample.csv) click Choose files and upload it. If not, we'll create a sample automatically.


Saving tickets_sample.csv to tickets_sample.csv
Using file: tickets_sample.csv
Loaded rows: 5


Unnamed: 0,ticket_id,title,description
0,1,Login issue,Unable to login with my registered email
1,2,Payment failure,Payment not processing for premium plan
2,3,App crash,App crashes after update
3,4,Password reset,Forgot my password and reset link not working
4,5,Subscription query,Need to change subscription plan


In [5]:
print("Exact column names (as read):")
for col in df.columns:
    # show invisibles
    print(repr(col))


Exact column names (as read):
'ticket_id'
'title'
'description'


In [6]:
# normalize: trim, lowercase, replace spaces and non-word chars with underscore
clean_cols = [re.sub(r'\W+', '_', col.strip().lower()) for col in df.columns]
df.columns = clean_cols
print("Normalized columns:")
print(df.columns.tolist())


Normalized columns:
['ticket_id', 'title', 'description']


In [7]:
# candidate names to look for
candidates = ['ticket_description','description','issue','text','query','problem','body','message','details']
text_col = None
for cand in candidates:
    if cand in df.columns:
        text_col = cand
        break

# if none found, pick the first string-like column heuristically
if text_col is None:
    # prefer column with longest average string length (likely the description)
    str_cols = [c for c in df.columns if df[c].dtype == object]
    if str_cols:
        avg_lens = {c: df[c].dropna().astype(str).map(len).mean() for c in str_cols}
        text_col = max(avg_lens, key=avg_lens.get)
    else:
        raise ValueError("No text-like columns found in the CSV. Please check your file.")

print("Selected text column:", text_col)


Selected text column: description


In [8]:
def clean_text(s):
    s = "" if pd.isna(s) else str(s)
    s = s.lower()
    s = re.sub(r"http\S+", " ", s)
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    tokens = [t for t in s.split() if t and t not in STOPWORDS]
    return " ".join(tokens)

df['clean_text'] = df[text_col].apply(clean_text)
df[['clean_text']].head()


Unnamed: 0,clean_text
0,unable login registered email
1,payment processing premium plan
2,app crashes update
3,forgot password reset link working
4,need change subscription plan


In [9]:
# ensure llm_tags column exists (will be filled later)
if 'llm_tags' not in df.columns:
    df['llm_tags'] = None

for i, row in df.iterrows():
    print(f"Ticket {i+1}: {row.get(text_col, 'N/A')}")
    print(" → clean_text:", row.get('clean_text', 'N/A'))
    print(" → Tags:", row.get('llm_tags', 'N/A'))
    print("-"*80)


Ticket 1: Unable to login with my registered email
 → clean_text: unable login registered email
 → Tags: None
--------------------------------------------------------------------------------
Ticket 2: Payment not processing for premium plan
 → clean_text: payment processing premium plan
 → Tags: None
--------------------------------------------------------------------------------
Ticket 3: App crashes after update
 → clean_text: app crashes update
 → Tags: None
--------------------------------------------------------------------------------
Ticket 4: Forgot my password and reset link not working
 → clean_text: forgot password reset link working
 → Tags: None
--------------------------------------------------------------------------------
Ticket 5: Need to change subscription plan
 → clean_text: need change subscription plan
 → Tags: None
--------------------------------------------------------------------------------


In [11]:
# OpenAI tagging function (will raise if quota or key issues)
openai_available = False
try:
    from openai import OpenAI
    # get key securely
    if 'OPENAI_API_KEY' not in os.environ:
        import getpass
        print("You can paste your key now (hidden). If you don't have a key or prefer not to, press Enter to skip.")
        k = getpass.getpass("OpenAI API key (press Enter to skip): ")
        if k:
            os.environ['OPENAI_API_KEY'] = k
    if os.environ.get('OPENAI_API_KEY'):
        client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
        # quick ping (lightweight) to check validity
        try:
            # small inexpensive request (if your account supports it)
            r = client.chat.completions.create(model="gpt-4o-mini", messages=[{"role":"user","content":"Hello"}], temperature=0)
            openai_available = True
            print("OpenAI key accepted — will use GPT for tagging.")
        except Exception as e:
            print("OpenAI test request failed (will fallback). Error:", type(e).__name__, str(e))
            openai_available = False
except Exception as e:
    print("OpenAI library/import not available or error:", type(e).__name__, e)
    openai_available = False


OpenAI test request failed (will fallback). Error: RateLimitError Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}


In [13]:
# Zero-shot using facebook/bart-large-mnli (suitable and lightweight)
from transformers import pipeline
zs = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# define a set of likely categories/tags you want to detect
candidate_labels = ["login issue", "payment failure", "app crash", "password reset", "subscription", "billing", "feature request", "bug", "account", "performance"]

def tag_with_zero_shot(text):
    try:
        out = zs(text, candidate_labels, multi_label=True)
        # produce a compact structure: top labels above threshold
        labels, scores = out['labels'], out['scores']
        tags = [lbl for lbl, sc in zip(labels, scores) if sc >= 0.4]  # threshold
        primary = labels[0] if labels else None
        return {"category": primary, "tags": tags, "scores": dict(zip(labels, scores))}
    except Exception as e:
        return {"category": None, "tags": [], "scores": {}}

# apply to whole DF (fast-ish)
df['llm_tags'] = df['clean_text'].apply(tag_with_zero_shot)
df[['clean_text','llm_tags']].head(10)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Unnamed: 0,clean_text,llm_tags
0,unable login registered email,"{'category': 'login issue', 'tags': ['login issue', 'bug', 'account', 'password reset', 'performance', 'subscription'], 'scores': {'login issue': 0.9967553019523621, 'bug': 0.9800710678100586, 'ac..."
1,payment processing premium plan,"{'category': 'subscription', 'tags': ['subscription', 'billing', 'account'], 'scores': {'subscription': 0.9753769636154175, 'billing': 0.9396206140518188, 'account': 0.8342901468276978, 'performan..."
2,app crashes update,"{'category': 'app crash', 'tags': ['app crash', 'bug', 'performance', 'account'], 'scores': {'app crash': 0.9938802719116211, 'bug': 0.973405122756958, 'performance': 0.788218080997467, 'account':..."
3,forgot password reset link working,"{'category': 'password reset', 'tags': ['password reset', 'login issue', 'account', 'performance', 'feature request'], 'scores': {'password reset': 0.9438604712486267, 'login issue': 0.93456000089..."
4,need change subscription plan,"{'category': 'subscription', 'tags': ['subscription', 'login issue', 'billing', 'account', 'feature request', 'bug', 'payment failure'], 'scores': {'subscription': 0.9822330474853516, 'login issue..."


In [14]:
df.to_csv("tickets_tagged.csv", index=False)
from google.colab import files
files.download("tickets_tagged.csv")
print("Saved tickets_tagged.csv — download should begin.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved tickets_tagged.csv — download should begin.


In [15]:
# 1) Show exact normalized columns again
print(df.columns.tolist())

# 2) If some rows had null clean_text, show them
print(df[df['clean_text'].isna()])

# 3) Show sample tagged rows
display(df.head(10))


['ticket_id', 'title', 'description', 'clean_text', 'llm_tags']
Empty DataFrame
Columns: [ticket_id, title, description, clean_text, llm_tags]
Index: []


Unnamed: 0,ticket_id,title,description,clean_text,llm_tags
0,1,Login issue,Unable to login with my registered email,unable login registered email,"{'category': 'login issue', 'tags': ['login issue', 'bug', 'account', 'password reset', 'performance', 'subscription'], 'scores': {'login issue': 0.9967553019523621, 'bug': 0.9800710678100586, 'ac..."
1,2,Payment failure,Payment not processing for premium plan,payment processing premium plan,"{'category': 'subscription', 'tags': ['subscription', 'billing', 'account'], 'scores': {'subscription': 0.9753769636154175, 'billing': 0.9396206140518188, 'account': 0.8342901468276978, 'performan..."
2,3,App crash,App crashes after update,app crashes update,"{'category': 'app crash', 'tags': ['app crash', 'bug', 'performance', 'account'], 'scores': {'app crash': 0.9938802719116211, 'bug': 0.973405122756958, 'performance': 0.788218080997467, 'account':..."
3,4,Password reset,Forgot my password and reset link not working,forgot password reset link working,"{'category': 'password reset', 'tags': ['password reset', 'login issue', 'account', 'performance', 'feature request'], 'scores': {'password reset': 0.9438604712486267, 'login issue': 0.93456000089..."
4,5,Subscription query,Need to change subscription plan,need change subscription plan,"{'category': 'subscription', 'tags': ['subscription', 'login issue', 'billing', 'account', 'feature request', 'bug', 'payment failure'], 'scores': {'subscription': 0.9822330474853516, 'login issue..."
