In [2]:
import pandas as pd # for working with data
import numpy as np # for working with data
import seaborn as sns # for making visualizations
from matplotlib import pyplot as plt # for making visualizations
from sklearn.preprocessing import LabelEncoder # for encoding categorical variables
from sklearn.model_selection import train_test_split # for splitting the data into training and testing sets
from sklearn.ensemble import RandomForestClassifier # for building the model
from sklearn.metrics import classification_report, confusion_matrix # for evaluating the model
from sklearn.model_selection import GridSearchCV # for hyperparameter tuning
from sklearn.metrics import roc_curve, auc # for ROC curve and AUC

In [3]:
ticket_df = pd.read_csv('Support_tickets.csv')
ticket_df.head()
ticket_df.info()
ticket_df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ticket_id               50000 non-null  int64  
 1   day_of_week             50000 non-null  object 
 2   day_of_week_num         50000 non-null  int64  
 3   company_id              50000 non-null  int64  
 4   company_size            50000 non-null  object 
 5   company_size_cat        50000 non-null  int64  
 6   industry                50000 non-null  object 
 7   industry_cat            50000 non-null  int64  
 8   customer_tier           50000 non-null  object 
 9   customer_tier_cat       50000 non-null  int64  
 10  org_users               50000 non-null  int64  
 11  region                  50000 non-null  object 
 12  region_cat              50000 non-null  int64  
 13  past_30d_tickets        50000 non-null  int64  
 14  past_90d_incidents      50000 non-null

(50000, 33)

In [4]:
ticket_df

Unnamed: 0,ticket_id,day_of_week,day_of_week_num,company_id,company_size,company_size_cat,industry,industry_cat,customer_tier,customer_tier_cat,...,downtime_min,payment_impact_flag,security_incident_flag,data_loss_flag,has_runbook,customer_sentiment,customer_sentiment_cat,description_length,priority,priority_cat
0,1000000000,Wed,3,100015,Small,1,media,7,Basic,1,...,6,0,0,0,0,neutral,2,227,low,1
1,1000000001,Sat,6,100023,Small,1,healthcare,5,Basic,1,...,2,0,0,0,0,neutral,2,461,low,1
2,1000000002,Mon,1,100012,Small,1,gaming,4,Basic,1,...,0,0,0,0,1,positive,3,306,low,1
3,1000000003,Wed,3,100003,Small,1,media,7,Plus,2,...,16,0,0,0,1,neutral,2,363,medium,2
4,1000000004,Mon,1,100019,Small,1,ecommerce,2,Plus,2,...,6,0,0,0,0,neutral,2,442,low,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1000049995,Wed,3,100021,Medium,2,ecommerce,2,Plus,2,...,0,0,0,0,0,neutral,2,483,low,1
49996,1000049996,Fri,5,100010,Large,3,saas_b2b,3,Plus,2,...,0,0,0,0,1,neutral,2,319,low,1
49997,1000049997,Tue,2,100002,Large,3,fintech,1,Enterprise,3,...,0,0,0,0,1,neutral,2,616,high,3
49998,1000049998,Wed,3,100022,Medium,2,saas_b2b,3,Enterprise,3,...,0,0,0,0,1,neutral,2,526,medium,2


In [5]:
ticket_df.isnull().sum()

ticket_id                   0
day_of_week                 0
day_of_week_num             0
company_id                  0
company_size                0
company_size_cat            0
industry                    0
industry_cat                0
customer_tier               0
customer_tier_cat           0
org_users                   0
region                      0
region_cat                  0
past_30d_tickets            0
past_90d_incidents          0
product_area                0
product_area_cat            0
booking_channel             0
booking_channel_cat         0
reported_by_role            0
reported_by_role_cat        0
customers_affected          0
error_rate_pct              0
downtime_min                0
payment_impact_flag         0
security_incident_flag      0
data_loss_flag              0
has_runbook                 0
customer_sentiment        906
customer_sentiment_cat      0
description_length          0
priority                    0
priority_cat                0
dtype: int

In [6]:
# 'priority' is the target column and all others are features
X = ticket_df.drop('priority', axis=1)
y = ticket_df['priority']

# If you have categorical columns, encode them first 
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Train a Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

        high       1.00      1.00      1.00      1472
         low       1.00      1.00      1.00      5033
      medium       1.00      1.00      1.00      3495

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Confusion Matrix:
[[1472    0    0]
 [   0 5033    0]
 [   0    0 3495]]
