In [42]:
pip install pandas numpy matplotlib seaborn scikit-learn xgboost joblib gdown scipy nltk imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4
Note: you may need to restart the kernel to use updated packages.


## 1) Imports & downloads

In [43]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc, f1_score
# imbalanced-learn pipeline & samplers
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

sns.set(style='whitegrid')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pranu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2) Load & quick inspect

In [27]:
df = pd.read_csv('./customer_support_tickets.csv')
df.shape, df.columns
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticket ID                     8469 non-null   int64  
 1   Customer Name                 8469 non-null   object 
 2   Customer Email                8469 non-null   object 
 3   Customer Age                  8469 non-null   int64  
 4   Customer Gender               8469 non-null   object 
 5   Product Purchased             8469 non-null   object 
 6   Date of Purchase              8469 non-null   object 
 7   Ticket Type                   8469 non-null   object 
 8   Ticket Subject                8469 non-null   object 
 9   Ticket Description            8469 non-null   object 
 10  Ticket Status                 8469 non-null   object 
 11  Resolution                    2769 non-null   object 
 12  Ticket Priority               8469 non-null   object 
 13  Tic

## 3) Basic cleaning / datatypes

In [28]:
# convert date columns
for col in ['Date of Purchase', 'First Response Time', 'Time to Resolution']:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# numeric conversion if needed
df['Customer Age'] = pd.to_numeric(df['Customer Age'], errors='coerce')

# preview missing
df.isna().sum().sort_values(ascending=False)

Customer Satisfaction Rating    5700
Time to Resolution              5700
Resolution                      5700
First Response Time             2819
Ticket Description                 0
Ticket Channel                     0
Ticket Priority                    0
Ticket Status                      0
Ticket ID                          0
Customer Name                      0
Ticket Type                        0
Date of Purchase                   0
Product Purchased                  0
Customer Gender                    0
Customer Age                       0
Customer Email                     0
Ticket Subject                     0
dtype: int64

## 4) Target creation (binary)

In [29]:
# create binary target: satisfied (4,5) vs not (1-3)
df = df.copy()
df['satisfied'] = df['Customer Satisfaction Rating'].apply(lambda x: 1 if x>=4 else 0)
df['satisfied'].value_counts(normalize=True)

satisfied
0    0.87165
1    0.12835
Name: proportion, dtype: float64

## 5) Feature engineering (examples)

In [30]:
# response time in minutes (if first response and ticket creation exist)
# If dataset lacks creation time, use 'First Response Time' and 'Time to Resolution' directly.
# Example: compute resolution duration in hours (if both datetimes)
if 'First Response Time' in df.columns and 'Time to Resolution' in df.columns:
    df['resolution_seconds'] = (df['Time to Resolution'] - df['First Response Time']).dt.total_seconds()
    df['resolution_hours'] = df['resolution_seconds']/3600

# Extract product category / ticket priority / channel
# Fill NAs
df['Ticket Priority'] = df['Ticket Priority'].fillna('Unknown')
df['Ticket Channel'] = df['Ticket Channel'].fillna('Unknown')

## 6) Text cleaning and TF-IDF

In [35]:
# keep description
df['Ticket Description'] = df['Ticket Description'].fillna('')

def simple_clean(text):
    text = str(text).lower()
    # remove basic punctuation
    text = ''.join(ch for ch in text if ch.isalnum() or ch.isspace())
    return text

df['desc_clean'] = df['Ticket Description'].apply(simple_clean)

# TF-IDF vectorizer (limit features)
tfidf = TfidfVectorizer(max_features=8000, stop_words='english', ngram_range=(1,2))
# Example: fit later inside pipeline

## 7) Choose features & ColumnTransformer pipeline

In [49]:
# 1) Recreate TF-IDF and preprocessor (replace previous definitions)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# TF-IDF (use built-in english stopwords)
tfidf = TfidfVectorizer(max_features=8000, stop_words='english', ngram_range=(1,2))

# Update these lists to match your DataFrame columns (they will be filtered automatically)
text_col = 'desc_clean'
categorical_cols = ['Customer Gender', 'Product Purchased', 'Ticket Type', 'Ticket Priority', 'Ticket Channel']
numeric_cols = ['Customer Age', 'resolution_hours']

categorical_cols = [c for c in categorical_cols if c in df.columns]
numeric_cols = [c for c in numeric_cols if c in df.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('txt', tfidf, text_col),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ],
    remainder='drop',
    sparse_threshold=0.3
)

print("preprocessor created with OneHotEncoder(sparse_output=True).")
print("Categorical cols used:", categorical_cols)
print("Numeric cols used:", numeric_cols)


preprocessor created with OneHotEncoder(sparse_output=True).
Categorical cols used: ['Customer Gender', 'Product Purchased', 'Ticket Type', 'Ticket Priority', 'Ticket Channel']
Numeric cols used: ['Customer Age', 'resolution_hours']


## 8) Build pipeline & train/test

In [50]:
# quick checks - run this cell before training
print("Train target distribution:")
print(y_train.value_counts(), "\n")
print("Train ratio (neg/pos):")
print((y_train==0).sum(), "/", (y_train==1).sum(), "  -> ratio:", (y_train==0).sum() / ((y_train==1).sum()+1e-9))

print("\nExample X_train columns (head):")
display(X_train.head() if hasattr(X_train, 'head') else X_train[:3])

# Make sure X_train is a DataFrame so ColumnTransformer using column names works
import pandas as pd
if not isinstance(X_train, pd.DataFrame):
    print("\nNOTE: X_train is not a DataFrame. Convert it before pipeline by using:")
    print("X = pd.DataFrame(X)  # or rebuild X from df using column names")


Train target distribution:
satisfied
0    5905
1     870
Name: count, dtype: int64 

Train ratio (neg/pos):
5905 / 870   -> ratio: 6.787356321831279

Example X_train columns (head):


Unnamed: 0,desc_clean,Customer Gender,Product Purchased,Ticket Type,Ticket Priority,Ticket Channel,Customer Age,resolution_hours
7157,im having an issue with the productpurchased p...,Female,iPhone,Product inquiry,Medium,Phone,60,11.483333
6402,im having trouble connecting my productpurchas...,Female,LG Washing Machine,Technical issue,High,Social media,30,0.166667
5805,im having an issue with the productpurchased p...,Female,Nikon D,Billing inquiry,Low,Email,27,-6.466667
3782,im encountering a software bug in the productp...,Male,HP Pavilion,Cancellation request,Low,Social media,56,0.166667
6546,im unable to access my productpurchased accoun...,Other,Microsoft Xbox Controller,Product inquiry,Medium,Phone,50,0.166667


In [51]:
# ======= Simple class_weight approach =======
clf = RandomForestClassifier(n_estimators=200, max_depth=12, class_weight='balanced',
                             random_state=42, n_jobs=-1)

from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('pre', preprocessor), ('clf', clf)])

print("Fitting RandomForest with class_weight='balanced' ...")
pipe.fit(X_train, y_train)

# evaluate
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:,1] if hasattr(pipe, "predict_proba") else None

print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

if y_proba is not None:
    print("ROC AUC:", roc_auc_score(y_test, y_proba))
    prec, rec, _ = precision_recall_curve(y_test, y_proba)
    print("PR AUC:", auc(rec, prec))



Fitting RandomForest with class_weight='balanced' ...
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      1477
           1       0.34      0.36      0.35       217

    accuracy                           0.83      1694
   macro avg       0.62      0.63      0.63      1694
weighted avg       0.83      0.83      0.83      1694

Confusion Matrix:
 [[1326  151]
 [ 139   78]]
ROC AUC: 0.6667769079807431
PR AUC: 0.2689258727043317


## 9) Try XGBoost or Logistic with GridSearch

In [52]:
# Example grid (small)
param_grid = {
    'clf__n_estimators': [100,200],
    'clf__max_depth': [6,12]
}
gs = GridSearchCV(pipe, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)
print('Best params', gs.best_params_)
best = gs.best_estimator_
y_pred = best.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best params {'clf__max_depth': 6, 'clf__n_estimators': 100}
              precision    recall  f1-score   support

           0       0.88      0.68      0.77      1477
           1       0.14      0.36      0.20       217

    accuracy                           0.64      1694
   macro avg       0.51      0.52      0.49      1694
weighted avg       0.78      0.64      0.70      1694



## 10) Save model

In [53]:
os.makedirs('models', exist_ok=True)
joblib.dump(best, 'models/best_model.joblib')


['models/best_model.joblib']

## 11) Explainability : top textual features)


In [54]:
# to get TF-IDF features and coefficients (if using linear model)
# If using RandomForest, extract feature importances after transforming sample input