In [None]:
# 🧠 Auto Tagging Support Tickets Using LLM (Free Version)
# -----------------------------------------------------------
# Objective: Automatically tag support tickets into categories using LLMs (FREE)
# Dataset: Free-text Support Ticket Dataset from Kaggle
# -----------------------------------------------------------

# ✅ Step 1: Kaggle API Setup
import os
from google.colab import files

# Install the Kaggle API key at /root/.kaggle/kaggle.json, prompting for an
# upload only when the file is not already there.
import shutil  # local import: shutil.move also works across filesystems, unlike os.rename

KAGGLE_KEY_PATH = "/root/.kaggle/kaggle.json"

if not os.path.exists(KAGGLE_KEY_PATH):
    print("⚠️ kaggle.json not found. Please upload your Kaggle API key file.")
    uploaded = files.upload()  # Prompts user to upload kaggle.json

    # Fail with an explicit message instead of letting os.chmod trip over a
    # destination file that was never created.
    if not uploaded:
        raise FileNotFoundError(
            "No file was uploaded; kaggle.json is required to use the Kaggle API."
        )

    # Pick exactly one file as the key: the one named kaggle.json if present,
    # otherwise the sole upload. Several unrelated files would previously be
    # renamed over each other, silently keeping whichever came last.
    if "kaggle.json" in uploaded:
        key_file = "kaggle.json"
    elif len(uploaded) == 1:
        key_file = next(iter(uploaded))
    else:
        raise ValueError(
            f"Expected a single kaggle.json upload, got: {sorted(uploaded)}"
        )

    os.makedirs(os.path.dirname(KAGGLE_KEY_PATH), exist_ok=True)
    shutil.move(key_file, KAGGLE_KEY_PATH)
    os.chmod(KAGGLE_KEY_PATH, 0o600)  # owner read/write only
    print("✅ Kaggle API key saved successfully.")
else:
    print("✅ kaggle.json already exists. Continuing...")

# Test Kaggle API: search for "support ticket" datasets and show only the
# first 10 lines of the CLI output (notebook shell command).
!kaggle datasets list -s "support ticket" | head -n 10

# -----------------------------------------------------------
# ✅ Step 2: Download Dataset
# Download into /content and extract the archive there (--unzip).
!kaggle datasets download -d jutrera/customer-support-tickets -p /content --unzip

# -----------------------------------------------------------
# ✅ Step 3: Load & Inspect Dataset
import pandas as pd

# Load dataset
df = pd.read_csv('/content/customer_support_tickets.csv')
print("Rows:", len(df))
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview always shows up.
print(df.head())

# Basic cleanup: the rest of the script reads the 'text' and 'category'
# columns, so fail early with a readable message if either is absent.
required_columns = ['text', 'category']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Dataset is missing required column(s) {missing_columns}; "
        f"available columns: {list(df.columns)}"
    )

# Drop rows lacking a ticket text or a category, then renumber rows from 0.
df = df.dropna(subset=required_columns).reset_index(drop=True)

# -----------------------------------------------------------
# ✅ Step 4: Zero-Shot Classification (Free via Hugging Face)
from transformers import pipeline

# Zero-shot classifier: scores arbitrary candidate labels against a text
# without any task-specific training.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate labels — the first 10 distinct categories, in order of appearance
# (not ranked by frequency).
candidate_labels = list(df['category'].unique())[:10]

# Test on a few examples. head(3) yields at most three tickets, so a dataset
# with fewer than three rows no longer raises a KeyError.
for text in df['text'].head(3):
    result = classifier(text, candidate_labels, multi_label=True)
    # NOTE(review): relies on the pipeline returning labels/scores ordered
    # best-first — confirm against the installed transformers version.
    top3 = list(zip(result['labels'][:3], result['scores'][:3]))
    print(f"\nTICKET: {text}\nTOP 3 TAGS:")
    for tag, score in top3:
        print(f"  - {tag} ({score:.2f})")

# -----------------------------------------------------------
# ✅ Step 5: Fine-Tuning a Transformer Model
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Prepare data: hold out 20% of the tickets for validation; the fixed
# random_state keeps the split reproducible between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['category'], test_size=0.2, random_state=42)

# Label encoding
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Fit on every category in the dataset rather than on the training split only:
# a category that happens to appear solely in the validation split would
# otherwise make transform(val_labels) raise ValueError (unseen label).
label_encoder.fit(df['category'])
train_labels_enc = label_encoder.transform(train_labels)
val_labels_enc = label_encoder.transform(val_labels)

# Tokenizer for the Hugging Face dataset built below
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    """Tokenize the 'text' entries of *batch*.

    Sequences are truncated and padded with ``padding='max_length'`` so every
    example comes out the same length.
    """
    ticket_texts = batch['text']
    encoded = tokenizer(ticket_texts, padding='max_length', truncation=True)
    return encoded

# Build one combined dataset (training rows first, validation rows after),
# tokenize it in a single pass, then slice it back into the two splits.
n_train = len(train_texts)
all_texts = [*train_texts, *val_texts]
all_labels = [*train_labels_enc, *val_labels_enc]

dataset = Dataset.from_dict({'text': all_texts, 'labels': all_labels})
tokenized_dataset = dataset.map(tokenize, batched=True)

# Rows [0, n_train) are the training examples; the remainder is validation.
train_indices = range(n_train)
val_indices = range(n_train, len(dataset))
train_dataset = tokenized_dataset.select(train_indices)
val_dataset = tokenized_dataset.select(val_indices)

# Model setup: one output label per class known to the label encoder
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
)

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for an evaluation pass.

    *eval_pred* unpacks into ``(logits, labels)``; the predicted class of each
    example is the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted = np.asarray(logits).argmax(axis=-1)
    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': f1_score(labels, predicted, average='weighted'),
    }

import inspect  # local import: used only to probe the TrainingArguments signature

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword; older releases only know the old one.
# Pick whichever name the installed version accepts so this cell runs on both.
_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

# Training configuration: evaluate once per epoch, 2 epochs, batch size 8.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    **{_strategy_arg: 'epoch'},
)

# Trainer ties together the model, the two dataset splits and the metric
# function used at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# -----------------------------------------------------------
# ✅ Step 6: Evaluate and Output Top 3 Predictions
import torch

# Preview predictions for the first five validation tickets.
texts = val_texts[:5]
inputs = tokenizer(list(texts), return_tensors='pt', padding=True, truncation=True)
# Training may have left the model on an accelerator; the input tensors must
# be on the same device or the forward pass raises a device-mismatch error.
inputs = inputs.to(model.device)

model.eval()  # inference mode: disables dropout
with torch.no_grad():  # no gradients needed for prediction
    outputs = model(**inputs)
# Bring probabilities back to the CPU for label decoding and printing.
probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu()

# torch.topk fails when k exceeds the number of classes, so cap it.
top_k = min(3, probs.shape[-1])

for i, text in enumerate(texts):
    top3 = torch.topk(probs[i], top_k)
    # Map the class indices back to the original category names.
    labels = label_encoder.inverse_transform(top3.indices.numpy())
    scores = [round(val, 2) for val in top3.values.tolist()]
    print(f"\nTICKET: {text}\nTOP 3 TAGS:")
    for label, score in zip(labels, scores):
        print(f"  - {label} ({score})")
