In [97]:
import pandas as pd 
import numpy as np 

In [98]:
# Load the raw support-ticket dataset from the legacy .xls workbook.
# Reading this format through pandas needs the optional `xlrd` package.
df = pd.read_excel("ai_dev_assignment_tickets_complex_1000.xls")

In [99]:
# One-time setup: xlrd is required by pandas to read legacy .xls Excel files.
# %pip install xlrd

In [100]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
1,2,Can you tell me more about the UltraClean Vacu...,General Inquiry,,UltraClean Vacuum
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
4,5,Order #30903 for Vision LED TV is 13 days late...,Late Delivery,,Vision LED TV


In [101]:
# (rows, columns) of the raw frame.
df.shape

(1000, 5)

In [102]:
# Per-column dtypes as inferred by pandas when the workbook was read.
df.dtypes

ticket_id         int64
ticket_text      object
issue_type       object
urgency_level    object
product          object
dtype: object

In [103]:
# Non-null counts and dtypes per column, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ticket_id      1000 non-null   int64 
 1   ticket_text    945 non-null    object
 2   issue_type     924 non-null    object
 3   urgency_level  948 non-null    object
 4   product        1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [104]:
# Missing-value count for every column.
df.isna().sum()

ticket_id         0
ticket_text      55
issue_type       76
urgency_level    52
product           0
dtype: int64

In [105]:
# Class balance of the two prediction targets, urgency first, then issue type.
for target_col in ('urgency_level', 'issue_type'):
    print(df[target_col].value_counts())

urgency_level
High      330
Medium    319
Low       299
Name: count, dtype: int64
issue_type
Billing Problem       146
General Inquiry       146
Account Access        143
Installation Issue    142
Product Defect        121
Wrong Item            114
Late Delivery         112
Name: count, dtype: int64


In [None]:
# Keep only rows where the text and both labels are present. The explicit
# .copy() yields an independent frame so the column assignments made later
# do not raise SettingWithCopyWarning. The original row labels are kept
# (the index is not reset).
required_cols = ['ticket_text', 'issue_type', 'urgency_level']
df = df.dropna(subset=required_cols).copy()

In [107]:
# !pip install spacy


In [108]:
import spacy
# Load spaCy's small English pipeline; the en_core_web_sm model must already
# be downloaded in this environment.
nlp = spacy.load("en_core_web_sm")

In [109]:
# One-time setup, needed before spacy.load("en_core_web_sm") can succeed:
# !python -m spacy download en_core_web_sm

In [110]:
def preprocess_txt(text):
    """Normalise one ticket's text into a space-joined string of lemmas.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``; only purely alphabetic, non-stop-word tokens are kept, and each
    kept token is replaced by its lemma.

    Parameters
    ----------
    text : str or scalar
        Raw ticket text. Missing values (``None`` / ``NaN``) are accepted, and
        any other non-string scalar is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text, or ``""`` for missing or blank input.
    """
    if not isinstance(text, str):
        # Spreadsheet cells may arrive as numbers or NaN rather than strings;
        # calling .lower() on those would raise AttributeError.
        if pd.isnull(text):
            return ""
        text = str(text)
    if not text.strip():
        # Nothing to analyse, so skip the spaCy pipeline entirely.
        return ""
    doc = nlp(text.lower())
    lemmas = [tok.lemma_ for tok in doc if tok.is_alpha and not tok.is_stop]
    return " ".join(lemmas)


In [111]:
# Inspect the first remaining raw ticket. Positional .iloc is used because
# rows were dropped without resetting the index, so the label 0 is not
# guaranteed to exist.
df['ticket_text'].iloc[0]

'Payment issue for my SmartWatch V2. I was underbilled for order #29224.'

In [112]:
# Spot-check the cleaner on the first remaining ticket. The lookup is
# positional because the index is not reset after rows are dropped.
preprocess_txt(df['ticket_text'].iloc[0])

'payment issue smartwatch underbilled order'

In [113]:
# Run the spaCy-based cleaner over every ticket and store the result as a new column.
df['ticket_text_clean'] = df['ticket_text'].map(preprocess_txt)

In [114]:
# Eyeball ten random rows. The seed is fixed so this preview is reproducible
# on a fresh kernel run.
df.sample(10, random_state=42)

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,ticket_text_clean
503,504,"Received wrong product, order mixed up.",Wrong Item,Medium,SmartWatch V2,receive wrong product order mix
655,656,EcoBreeze AC is stuck. It stopped working afte...,Product Defect,Medium,EcoBreeze AC,ecobreeze ac stick stop work day
698,699,Can't log in to mi account. Keeps showing unkn...,Account Access,Low,Vision LED TV,log mi account keep show unknown issue help
175,176,Is this item in stock? Delivery expected by 05...,General Inquiry,Medium,SoundWave 300,item stock delivery expect
130,131,Facing installation issue with EcoBreeze AC. S...,Installation Issue,Low,EcoBreeze AC,face installation issue ecobreeze ac setup fai...
349,350,Facing installation issue with . Setup fails a...,Installation Issue,High,SmartWatch V2,face installation issue setup fail step
348,349,Can you tell me more about the Vision LED TV w...,General Inquiry,High,Vision LED TV,tell vision lead tv warranty available blue
440,441,Order #81531 for RoboChef Blender is 3 days la...,Late Delivery,Medium,RoboChef Blender,order robochef blender day late order
530,531,Not able to install my product.,Installation Issue,Medium,EcoBreeze AC,able install product
485,486,Facing installation issue with RoboChef Blende...,Installation Issue,Low,RoboChef Blender,face installation issue robochef blender setup...


In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [126]:
# Build a TF-IDF matrix from the cleaned text, with the vocabulary capped at 300 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split and the cross-validation further down, so vocabulary and IDF statistics
# from held-out rows leak into training. Consider fitting it inside a Pipeline
# on the training fold only.
tfidf = TfidfVectorizer(max_features=300)
X_tfidf = tfidf.fit_transform(df['ticket_text_clean'])

In [127]:
# Number of whitespace-separated tokens left in each cleaned ticket.
df['ticket_length'] = df['ticket_text_clean'].str.split().str.len()

In [128]:
# !pip install TextBlob
from textblob import TextBlob

In [129]:
# TextBlob polarity score for each cleaned ticket.
# NOTE(review): the cleaned text has had stop words removed, which may drop
# negations such as "not"; confirm whether polarity should be computed on the
# raw ticket_text instead.
df['sentiment'] = df['ticket_text_clean'].map(
    lambda cleaned: TextBlob(cleaned).sentiment.polarity
)

In [130]:
# Confirm the engineered columns were added.
df.head(n=5)

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,ticket_text_clean,ticket_length,sentiment
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2,payment issue smartwatch underbilled order,5,0.0
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300,order soundwave get ecobreeze ac instead order...,8,0.0
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam,face installation issue photosnap cam setup fa...,8,-0.5
5,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam,tell photosnap cam warranty available red,6,0.2
6,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC,malfunction stop work day,4,0.0


In [131]:
# Target vectors for the two classification tasks.
y_issue, y_urgency = df['issue_type'], df['urgency_level']

In [132]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [133]:
# 80/20 hold-out split for the issue-type task, with a fixed seed for repeatability.
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X_tfidf,
    y_issue,
    test_size=0.2,
    random_state=42,
)

In [140]:
# Shallow random forest (100 trees, depth capped at 3) for issue type.
# fit() returns the estimator itself, so construction and training are chained.
rf_issue = RandomForestClassifier(
    n_estimators=100, max_depth=3, random_state=42
).fit(X_train_i, y_train_i)
y_pred_issue = rf_issue.predict(X_test_i)

In [141]:
# Per-class precision / recall / F1 on the held-out issue-type tickets.
print(
    "classification report - issue type",
    classification_report(y_test_i, y_pred_issue),
    sep="\n",
)

                    precision    recall  f1-score   support

    Account Access       1.00      0.96      0.98        23
   Billing Problem       1.00      1.00      1.00        19
   General Inquiry       0.96      1.00      0.98        25
Installation Issue       1.00      0.97      0.98        29
     Late Delivery       0.89      1.00      0.94        17
    Product Defect       0.97      1.00      0.98        30
        Wrong Item       1.00      0.91      0.95        23

          accuracy                           0.98       166
         macro avg       0.97      0.98      0.97       166
      weighted avg       0.98      0.98      0.98       166



In [143]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the issue-type forest on the full TF-IDF matrix.
scores = cross_val_score(estimator=rf_issue, X=X_tfidf, y=y_issue, cv=5)
print("Cross-Validation Accuracy Scores:", scores)
print("Mean CV Accuracy:", np.mean(scores))

Cross-Validation Accuracy Scores: [0.98795181 0.97575758 0.93939394 0.95757576 0.98787879]
Mean CV Accuracy: 0.9697115735669952


In [145]:
# 80/20 hold-out split for the urgency-level task, using the same seed as the issue-type split.
X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(
    X_tfidf,
    y_urgency,
    test_size=0.2,
    random_state=42,
)

In [146]:
# Same shallow forest configuration, trained on the urgency labels.
# NOTE(review): the saved output shows about 0.36 accuracy across three
# classes, i.e. roughly chance level; the TF-IDF terms appear to carry little
# urgency signal.
rf_urgency = RandomForestClassifier(
    n_estimators=100, max_depth=3, random_state=42
).fit(X_train_u, y_train_u)
y_pred_urgency = rf_urgency.predict(X_test_u)

print(
    "Classification Report — Urgency Level:",
    classification_report(y_test_u, y_pred_urgency),
    sep="\n",
)

Classification Report — Urgency Level:
              precision    recall  f1-score   support

        High       0.41      0.38      0.39        66
         Low       0.30      0.30      0.30        43
      Medium       0.35      0.39      0.37        57

    accuracy                           0.36       166
   macro avg       0.36      0.36      0.36       166
weighted avg       0.36      0.36      0.36       166

