# Task 1: Customer Support Ticket Classifier & Entity Extractor

This notebook addresses **Task 1** of the Vijayi Internship Assignment. The objective is to:
- Classify customer support tickets by **issue type** and **urgency level**
- Extract key entities (products, dates, complaint keywords)

We use classical NLP and ML techniques as instructed. No LLMs are used.


## 1. Data Loading and Cleaning

In [24]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

df = pd.read_excel(r"C:\Users\naray\OneDrive\Desktop\internship 3\ai_dev_assignment_tickets_complex_1000.xls")
df.dropna(subset=['ticket_text', 'issue_type', 'urgency_level'], inplace=True)
df = df.drop_duplicates(subset=['ticket_text'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\naray\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naray\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\naray\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\naray\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## 2. Preprocessing

In [25]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def get_wordnet_pos(tag):
    if tag.startswith('J'): return wordnet.ADJ
    elif tag.startswith('V'): return wordnet.VERB
    elif tag.startswith('N'): return wordnet.NOUN
    elif tag.startswith('R'): return wordnet.ADV
    else: return wordnet.NOUN

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    tokens = nltk.word_tokenize(text)
    tagged = pos_tag(tokens)
    return " ".join([
        lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in tagged if w not in stop_words
    ])

df['clean_text'] = df['ticket_text'].apply(preprocess_text)


## 3. Entity Extraction and Feature Engineering

In [26]:
product_list = ["laptop", "phone", "charger", "headphones", "battery"]
complaint_keywords = ['broken', 'late', 'error', 'issue', 'crash', 'not working', 'damaged', 'fail']

def extract_entities(text):
    entities = {}
    text_lower = text.lower()
    entities['products'] = [p for p in product_list if p in text_lower]
    entities['dates'] = re.findall(r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2})\b', text)
    entities['complaint_keywords'] = [kw for kw in complaint_keywords if re.search(rf'\b{re.escape(kw)}\b', text_lower)]
    return entities

def add_entity_features(df):
    extracted = df['ticket_text'].apply(extract_entities)
    df['num_products'] = extracted.apply(lambda x: len(x['products']))
    df['num_dates'] = extracted.apply(lambda x: len(x['dates']))
    df['num_complaints'] = extracted.apply(lambda x: len(x['complaint_keywords']))
    df['has_product'] = (df['num_products'] > 0).astype(int)
    df['has_date'] = (df['num_dates'] > 0).astype(int)
    df['has_complaint'] = (df['num_complaints'] > 0).astype(int)
    for kw in complaint_keywords:
        df[f'complaint_{kw.replace(" ", "_")}'] = extracted.apply(lambda x: int(kw in x['complaint_keywords']))
    return df

df = add_entity_features(df)


## 4. TF-IDF and Final Feature Matrix

In [27]:
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['clean_text'])

sid = SentimentIntensityAnalyzer()
df['ticket_length'] = df['clean_text'].apply(lambda x: len(x.split()))
df['sentiment'] = df['ticket_text'].apply(lambda x: sid.polarity_scores(x)['compound'])
df['exclamation_count'] = df['ticket_text'].apply(lambda x: x.count('!'))
df['question_count'] = df['ticket_text'].apply(lambda x: x.count('?'))
df['all_caps_count'] = df['ticket_text'].apply(lambda x: sum(1 for w in x.split() if w.isupper()))
df['char_length'] = df['ticket_text'].apply(len)

meta_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col != 'ticket_id']


X_meta = df[meta_cols].astype(np.float32).values

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_meta_scaled = scaler.fit_transform(X_meta)  # scale handcrafted features
X = np.hstack([X_tfidf.toarray(), X_meta_scaled])


In [28]:
print(X_tfidf.shape)


(629, 1563)


In [29]:
print(len(meta_cols))


20


In [30]:
print("TF-IDF features:", X_tfidf.shape[1])
print("Meta features:", len(meta_cols))
print("Total features:", X_tfidf.shape[1] + len(meta_cols))


TF-IDF features: 1563
Meta features: 20
Total features: 1583


In [31]:
print(meta_cols)
print(len(meta_cols))  # confirms 21


['num_products', 'num_dates', 'num_complaints', 'has_product', 'has_date', 'has_complaint', 'complaint_broken', 'complaint_late', 'complaint_error', 'complaint_issue', 'complaint_crash', 'complaint_not_working', 'complaint_damaged', 'complaint_fail', 'ticket_length', 'sentiment', 'exclamation_count', 'question_count', 'all_caps_count', 'char_length']
20


## 5. Model Training for Issue Type and Urgency Level

In [32]:
y_issue = df['issue_type']
y_urgency = df['urgency_level']

X_train, X_test, y_issue_train, y_issue_test, y_urgency_train, y_urgency_test = train_test_split(
    X, y_issue, y_urgency, test_size=0.2, random_state=42
)

issue_model = RandomForestClassifier(random_state=42)
issue_model.fit(X_train, y_issue_train)
y_issue_pred = issue_model.predict(X_test)
print("\nIssue Type Classification Report:")
print(classification_report(y_issue_test, y_issue_pred))

# ------------------ OPTIMIZED KNN URGENCY MODEL TRAINING ------------------

from sklearn.neighbors import KNeighborsClassifier

print("\nUrgency Level Classification Report (KNN with different k values):")
best_score = 0
best_model = None

for k in [3, 5, 7, 10, 15]:
    try:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_urgency_train)
        pred = knn.predict(X_test)
        score = accuracy_score(y_urgency_test, pred)
        print(f"\nK={k} | Accuracy: {score:.4f}")
        print(classification_report(y_urgency_test, pred))

        if score > best_score:
            best_score = score
            best_model = knn
    except Exception as e:
        print(f"K={k} failed. Error: {e}")



Issue Type Classification Report:
                    precision    recall  f1-score   support

    Account Access       1.00      1.00      1.00         9
   Billing Problem       1.00      1.00      1.00        18
   General Inquiry       1.00      1.00      1.00        16
Installation Issue       1.00      1.00      1.00        17
     Late Delivery       1.00      1.00      1.00        24
    Product Defect       1.00      1.00      1.00        21
        Wrong Item       1.00      1.00      1.00        21

          accuracy                           1.00       126
         macro avg       1.00      1.00      1.00       126
      weighted avg       1.00      1.00      1.00       126


Urgency Level Classification Report (KNN with different k values):

K=3 | Accuracy: 0.3254
              precision    recall  f1-score   support

        High       0.27      0.46      0.34        39
         Low       0.35      0.21      0.26        39
      Medium       0.41      0.31      0.35    

In [33]:
# ------------------ INFERENCE FUNCTION ------------------

def predict_ticket(text):
    clean = preprocess_text(text)
    X_tfidf = tfidf.transform([clean])

    # enriched features
    entities = extract_entities(text)
    features = [
        len(clean.split()),
        SentimentIntensityAnalyzer().polarity_scores(text)['compound'],
        text.count('!'),
        text.count('?'),
        sum(1 for w in text.split() if w.isupper()),
        len(text),
        len(entities['products']),
        len(entities['dates']),
        len(entities['complaint_keywords']),
        int(len(entities['products']) > 0),
        int(len(entities['dates']) > 0),
        int(len(entities['complaint_keywords']) > 0)
    ] + [int(kw in entities['complaint_keywords']) for kw in complaint_keywords]

    X = np.hstack([X_tfidf.toarray(), np.array([features], dtype=np.float32)])

    issue = issue_model.predict(X)[0]
    urgency = best_model.predict(X)[0]

    return {
        "issue_type": issue,
        "urgency_level": urgency,
        "entities": entities
    }


In [34]:
# ------------------ TEST EXAMPLE ------------------

if __name__ == "__main__":
    sample_text = "My phone crashed on 25/05/2024 and it hasn't worked since. Very frustrated!"
    result = predict_ticket(sample_text)
    print("\nSample Prediction:")
    print("Issue Type:", result["issue_type"])
    print("Urgency Level:", result["urgency_level"])
    print("Entities:", result["entities"])a



Sample Prediction:
Issue Type: Product Defect
Urgency Level: Medium
Entities: {'products': ['phone'], 'dates': ['25/05/2024'], 'complaint_keywords': []}


##6. GRADIO INTERFACE

In [37]:
# ------------------ GRADIO INTERFACE ------------------
try:
    import gradio as gr

    def gradio_interface(ticket_text):
        result = predict_ticket(ticket_text)
        return (
            result['issue_type'],        # first output - Issue Type textbox
            result['urgency_level'],     # second output - Urgency Level textbox
            result['entities']           # third output - JSON component
        )


    iface = gr.Interface(
        fn=gradio_interface,
        inputs=gr.Textbox(lines=5, placeholder="Enter a support ticket..."),
        outputs=[
            gr.Text(label="Issue Type"),
            gr.Text(label="Urgency Level"),
            gr.JSON(label="Extracted Entities")
        ],
        title="Customer Support Ticket Classifier",
        description="Classifies issue type and urgency level, and extracts product names, dates, and complaints from a support ticket."
    )

    iface.launch(share=False)

except ImportError:
    print("Gradio not installed. Run: pip install gradio")


* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.


## 6. Observations and Limitations

- **Issue Type** classification performs well using Random Forest.
- **Urgency Level** prediction is harder. Models confuse between Medium and High.
- Added features like exclamations, all-caps, complaint keywords, and sentiment helped slightly.
- Better results may require advanced semantic models or larger data.

**This completes Task 1.**
