In [None]:
# SECTION 1: INSTALL REQUIRED LIBRARIES
!pip install -q textblob xlrd gradio

# SECTION 2: IMPORT LIBRARIES
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import gradio as gr

# SECTION 3: DOWNLOAD NLTK DATA
# word_tokenize loads the 'punkt_tab' resource on recent NLTK releases;
# 'punkt' is still fetched so older releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# SECTION 4: LOAD DATA
# Rows lacking any of the fields listed here are discarded right after loading.
required_columns = ["ticket_text", "issue_type", "urgency_level", "product"]
df = pd.read_excel("ai_dev_assignment_tickets_complex_1000.xls")
df.dropna(subset=required_columns, inplace=True)

# SECTION 5: TEXT PREPROCESSING
# Shared resources read by preprocess_text; built once at module level.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize raw ticket text into a space-joined string of lemmatized tokens.

    The input is lower-cased, every non-letter character is replaced by a
    space, the result is tokenized, English stop words are removed and the
    remaining tokens are lemmatized.

    Args:
        text: Raw ticket text. ``None`` and float NaN are treated as missing
            text; any other non-string value is converted with ``str()``.

    Returns:
        str: Cleaned tokens joined by single spaces ("" for missing input).
    """
    if not isinstance(text, str):
        # Missing values carry no text; return early instead of failing on
        # ``.lower()``. ``text != text`` is true only for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    letters_only = re.sub(r"[^a-zA-Z]", " ", text.lower())
    tokens = word_tokenize(letters_only)
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

df["clean_text"] = df["ticket_text"].apply(preprocess_text)

# SECTION 6: TARGET LABELS
y_issue = df["issue_type"]
y_urgency = df["urgency_level"]

# SECTION 7: SPLIT DATA
# Split the cleaned text together with both label columns in a single call so
# every array is partitioned on the same rows. The split happens BEFORE
# vectorizing so the TF-IDF vocabulary and IDF weights are learned from the
# training rows only (fitting on the full corpus leaks test data into training).
(
    text_train, text_test,
    y_train_i, y_test_i,
    y_train_u, y_test_u,
) = train_test_split(
    df["clean_text"], y_issue, y_urgency, test_size=0.2, random_state=42
)

# SECTION 8: VECTORIZE TEXT
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_i = X_train_u = tfidf_vectorizer.fit_transform(text_train)
X_test_i = X_test_u = tfidf_vectorizer.transform(text_test)
# Full-corpus feature matrix, kept so any later cell that reads `X` still works.
X = tfidf_vectorizer.transform(df["clean_text"])

# SECTION 9: TRAIN AND EVALUATE MODELS
issue_type_model = LogisticRegression(max_iter=1000)
urgency_level_model = LogisticRegression(max_iter=1000)
issue_type_model.fit(X_train_i, y_train_i)
urgency_level_model.fit(X_train_u, y_train_u)

# Report held-out performance so the test split is actually used.
print("Issue type (held-out rows):")
print(classification_report(y_test_i, issue_type_model.predict(X_test_i)))
print("Urgency level (held-out rows):")
print(classification_report(y_test_u, urgency_level_model.predict(X_test_u)))

# SECTION 10: DEFINE ENTITY EXTRACTOR
complaint_keywords = ["broken", "late", "error", "not working", "issue", "failed", "damaged", "delay", "problem", "missing"]

# Month names are spelled out (abbreviation or full name) instead of
# "prefix + any letters", so words such as "Maybe 10" or "Market 2024"
# are no longer reported as dates.
_MONTH_NAME = (
    r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?"
    r"|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
)
# Optional day (with optional ordinal suffix), month name, 2-4 digit year.
_DATE_PATTERN = re.compile(
    r"\b(?:\d{1,2}(?:st|nd|rd|th)?[-/\s]*)?" + _MONTH_NAME + r"[-/\s]?\d{2,4}\b",
    flags=re.IGNORECASE,
)


def extract_entities(text, products=None, keywords=None):
    """Pull product names, month-name dates and complaint keywords out of text.

    Args:
        text: Raw ticket text. ``None`` and float NaN are treated as empty
            text; any other non-string value is converted with ``str()``.
        products: Optional iterable of product names to look for. When
            omitted, the unique values of ``df['product']`` are used.
        keywords: Optional iterable of complaint keywords. When omitted, the
            module-level ``complaint_keywords`` list is used.

    Returns:
        dict: ``{"products": [...], "dates": [...], "complaint_keywords": [...]}``.
        Products match case-insensitively as substrings. Keywords must begin
        at a word boundary, so "delay" matches "delayed" but "late" no longer
        matches inside "related".
    """
    if not isinstance(text, str):
        # ``text != text`` is true only for NaN.
        missing = text is None or (isinstance(text, float) and text != text)
        text = "" if missing else str(text)
    lowered = text.lower()

    # Resolve defaults at call time so the current module-level data is used.
    if products is None:
        products = df['product'].unique()
    if keywords is None:
        keywords = complaint_keywords

    product_matches = []
    for prod in products:
        name = str(prod).lower()
        # Skip empty names: "" is a substring of every text.
        if name and name in lowered:
            product_matches.append(prod)

    date_matches = _DATE_PATTERN.findall(text)

    complaint_found = [
        kw for kw in keywords
        if re.search(r"\b" + re.escape(str(kw).lower()), lowered)
    ]

    return {
        "products": product_matches,
        "dates": date_matches,
        "complaint_keywords": complaint_found
    }

# SECTION 11: GRADIO INTERFACE

def analyze_ticket(text):
    """Classify one ticket and extract its entities.

    The text is cleaned with ``preprocess_text``, vectorized with the fitted
    ``tfidf_vectorizer`` and passed to both classifiers; entity extraction
    runs on the raw, uncleaned text.

    Returns:
        tuple: (predicted issue type, predicted urgency level, entities dict).
    """
    features = tfidf_vectorizer.transform([preprocess_text(text)])
    issue_label = issue_type_model.predict(features)[0]
    urgency_label = urgency_level_model.predict(features)[0]
    return issue_label, urgency_label, extract_entities(text)

# Input widget: a multi-line box for the raw ticket text.
ticket_input = gr.Textbox(lines=5, placeholder="Enter customer support ticket...")

# Output widgets, in the order analyze_ticket returns its values.
ticket_outputs = [
    gr.Text(label="Predicted Issue Type"),
    gr.Text(label="Predicted Urgency Level"),
    gr.JSON(label="Extracted Entities"),
]

gr_interface = gr.Interface(
    fn=analyze_ticket,
    inputs=ticket_input,
    outputs=ticket_outputs,
    title="Customer Support Ticket Analyzer",
    description="Classifies support tickets and extracts useful entities like product, dates, and complaint keywords.",
)

# NOTE(review): Gradio documents debug=True as blocking the main thread --
# confirm that holding up the cells below is intended.
gr_interface.launch(debug=True)



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to C:\Users\ASHISH
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\ASHISH
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ASHISH
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


In [None]:
!pip install datasets pandas nltk


In [None]:
import pandas as pd
import nltk
import re
from datasets import load_dataset

# NLTK Downloads
# word_tokenize loads the 'punkt_tab' resource on recent NLTK releases;
# 'punkt' is still fetched so older releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [None]:
# English stop words consulted by the text cleaning function
stop_words = set(stopwords.words("english"))

# Fetch the "train" split of the quotes dataset from Hugging Face
dataset = load_dataset("Abirate/english_quotes", split="train")

# Tabulate the dataset as a DataFrame
df = pd.DataFrame(dataset)

def clean_text(text):
    """Lower-case a quote, strip punctuation and remove English stop words.

    Characters other than ``a-z``, ``0-9`` and whitespace are deleted (not
    replaced by a space) before tokenizing.

    Args:
        text: Raw quote text. ``None`` and float NaN are treated as missing
            text; any other non-string value is converted with ``str()``.

    Returns:
        str: Remaining tokens joined by single spaces ("" for missing input).
    """
    if not isinstance(text, str):
        # Missing values carry no text; return early instead of failing on
        # ``.lower()``. ``text != text`` is true only for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    stripped = re.sub(r"[^a-z0-9\s]", "", text.lower())
    kept = [token for token in word_tokenize(stripped) if token not in stop_words]
    return " ".join(kept)

# Drop rows with missing fields first, so cleaning never runs on a missing quote
df = df.dropna(subset=["quote", "author", "tags"])

# Clean quotes
df = df.assign(clean_quote=df["quote"].apply(clean_text))

# Save cleaned dataset
df.to_csv("cleaned_quotes_dataset.csv", index=False)

# Show sample
df[["quote", "clean_quote", "author", "tags"]].head()
