# Ticket Urgency Classification

This notebook explores, preprocesses, and models customer support ticket data to predict urgency (priority).

## 1. Imports and Setup

In [None]:
import re
import warnings
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from catboost import CatBoostClassifier
from datasets import load_dataset
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform
from sentence_transformers import SentenceTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from xgboost import XGBClassifier

from ticket_urgency_classifier.config import RAW_DATA_DIR, INTERIM_DATA_DIR

warnings.filterwarnings("ignore", category=UserWarning, message=".*use_label_encoder.*")

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

## 2. Data Loading and Initial Exploration

In [None]:
# Download the raw dataset once and cache it as a CSV under RAW_DATA_DIR.
dataset_file = RAW_DATA_DIR / "dataset.csv"
if dataset_file.exists():
    print("Dataset already exists.")
else:
    print("Downloading dataset...")
    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    dataset_file.parent.mkdir(parents=True, exist_ok=True)
    ds = load_dataset("Tobi-Bueck/customer-support-tickets")
    df_raw = ds['train'].to_pandas()
    df_raw.to_csv(dataset_file, index=False)
    print("Dataset saved.")

In [None]:
# Load the cached CSV and print a column / dtype / non-null overview.
raw_csv_path = RAW_DATA_DIR / "dataset.csv"
df = pd.read_csv(raw_csv_path)
df.info()

In [None]:
# Remove the 'version' and 'answer' columns, drop exact duplicate rows, and
# renumber the index so it is contiguous again.
df = (
    df.drop(columns=['version', 'answer'])
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"Shape after cleaning: {df.shape}")

## 3. Feature Engineering

In [None]:
# Hold out 20% of the tickets for testing, stratified on 'priority' so both
# splits keep the same class proportions.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['priority'],
    random_state=42,
)
for split_label, split_frame in (("Training", df_train), ("Test", df_test)):
    print(f"{split_label} set size: {split_frame.shape}")

In [None]:
def engineer_features(df):
    """
    Derive hand-crafted features from the raw ticket columns.

    The frame is modified in place and also returned. Columns written:
    - 'full_text': subject and body joined by a space (missing values as "").
    - 'sentiment_score': VADER compound polarity of 'full_text'.
    - 'word_count', 'exclamation_count', 'question_mark_count'.
    - 'urgency_keyword_count', 'question_keyword_count', 'bug_keyword_count':
      case-insensitive, word-bounded matches of the keyword lists below.
    - 'queue_type_interaction': "<queue>_<type>" as a string.
    """
    # Combined text used by every text-based feature below
    subject_text = df['subject'].fillna("")
    body_text = df['body'].fillna("")
    df['full_text'] = subject_text + " " + body_text
    text = df['full_text']

    # Text statistics
    vader = SentimentIntensityAnalyzer()

    def compound_sentiment(doc):
        return vader.polarity_scores(doc)['compound']

    def count_words(doc):
        return len(doc.split())

    df['sentiment_score'] = text.apply(compound_sentiment)
    df['word_count'] = text.apply(count_words)
    df['exclamation_count'] = text.str.count('!')
    df['question_mark_count'] = text.str.count(r'\?')

    # Keyword counts: one output column per keyword group, in insertion order
    keyword_groups = {
        'urgency_keyword_count': [
            'payment', 'failed', 'cannot access', 'login error',
            'outage', 'urgent', 'asap', 'critical',
        ],
        'question_keyword_count': [
            'how to', 'where is', 'can you', 'inquiry', 'question',
        ],
        'bug_keyword_count': [
            'error code', 'exception', 'not working', 'crash',
            'bug report', 'defect',
        ],
    }
    for column_name, keywords in keyword_groups.items():
        pattern = rf"\b({'|'.join(keywords)})\b"
        df[column_name] = text.str.count(pattern, flags=re.IGNORECASE)

    # Interaction feature
    queue_as_str = df['queue'].astype(str)
    type_as_str = df['type'].astype(str)
    df['queue_type_interaction'] = queue_as_str + "_" + type_as_str

    return df

# --- Helper function for tag intelligence ---
def add_tag_features(df, top_tags_list):
    """
    Add one binary indicator column per tag in ``top_tags_list``.

    Args:
        df: DataFrame containing the raw tag columns 'tag_1' ... 'tag_8'.
            It is modified in place.
        top_tags_list: Tag values to encode. Each tag yields a column named
            'tag_<tag>' (spaces replaced by underscores) holding 1 where any
            of the eight raw tag columns equals the tag, and 0 otherwise.

    Returns:
        The same DataFrame with the indicator columns added.
    """
    tag_cols = [f'tag_{i}' for i in range(1, 9)]
    # Snapshot of the raw tag columns, taken once so that indicator columns
    # added below can never influence later comparisons.
    raw_tags = df[tag_cols]

    for tag in top_tags_list:
        col_name = f'tag_{tag.replace(" ", "_")}'
        # Vectorized membership test; missing values never equal a tag.
        df[col_name] = raw_tags.eq(tag).any(axis=1).astype("int64")

    return df

In [None]:
print("Applying base feature engineering...")
# engineer_features writes columns into the frame it receives, so each split
# is copied first.
df_train = engineer_features(df_train.copy())
df_test = engineer_features(df_test.copy())

print("Implementing Tag Intelligence...")
# Names of the raw tag columns
tag_cols = [f'tag_{i}' for i in range(1, 9)]

# Rank tags by frequency using the TRAINING split only, so no information
# from the test split influences which indicator features exist.
all_tags_series = df_train[tag_cols].stack()
tag_frequencies = all_tags_series.value_counts()
top_30_tags = tag_frequencies.nlargest(30).index.tolist()

print("\nTop 30 most common tags identified from the training set:")
print(top_30_tags)

# Encode the same tag list on both splits
df_train = add_tag_features(df_train, top_30_tags)
df_test = add_tag_features(df_test, top_30_tags)

print("\nNew features have been added to df_train and df_test.")

In [None]:
# Renumber both splits from 0 so they align row-for-row with the vectorized
# text features that are concatenated along axis=1 later on.
for split_frame in (df_train, df_test):
    split_frame.reset_index(drop=True, inplace=True)

### 3.1. Text Vectorization (TF-IDF and Sentence Transformers)

Here we generate and save the vectorized text features. This step can be slow and is only run if the feature files don't exist.

In [None]:
# Sentence Transformer embeddings, cached as one parquet file per split
st_file_train = INTERIM_DATA_DIR / "sentence_embeddings_train.parquet"
st_file_test = INTERIM_DATA_DIR / "sentence_embeddings_test.parquet"

if st_file_train.exists() and st_file_test.exists():
    print("Sentence Transformer embeddings files already exist.")
else:
    print("Generating Sentence Transformer embeddings...")
    embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device=device)
    # Identical encoding options for both splits
    encode_options = dict(
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=64,
    )

    # Training split
    train_texts = df_train['full_text'].tolist()
    text_embeddings_train = embedder.encode(train_texts, **encode_options)
    embedding_dim = text_embeddings_train.shape[1]
    feature_cols = [f"emb_{i}" for i in range(embedding_dim)]
    df_embeddings_train = pd.DataFrame(text_embeddings_train, columns=feature_cols)
    df_embeddings_train.to_parquet(st_file_train, index=False)

    # Test split, using the same column names as the training split
    test_texts = df_test['full_text'].tolist()
    text_embeddings_test = embedder.encode(test_texts, **encode_options)
    df_embeddings_test = pd.DataFrame(text_embeddings_test, columns=feature_cols)
    df_embeddings_test.to_parquet(st_file_test, index=False)

    print("Embeddings saved.")

In [None]:
# TF-IDF features, cached as one parquet file per split
tfidf_file_train = INTERIM_DATA_DIR / "tfidf_train.parquet"
tfidf_file_test = INTERIM_DATA_DIR / "tfidf_test.parquet"

if tfidf_file_train.exists() and tfidf_file_test.exists():
    print("TF-IDF features files already exist.")
else:
    print("Generating TF-IDF features...")
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

    # The vocabulary and IDF weights are fitted on the training text only.
    tfidf_matrix_train = vectorizer.fit_transform(df_train['full_text'])
    train_columns = [f"tfidf_{i}" for i in range(tfidf_matrix_train.shape[1])]
    tfidf_df_train = pd.DataFrame(tfidf_matrix_train.toarray(), columns=train_columns)
    tfidf_df_train.to_parquet(tfidf_file_train, index=False)

    # The test text is only transformed with the fitted vectorizer.
    tfidf_matrix_test = vectorizer.transform(df_test['full_text'])
    test_columns = [f"tfidf_{i}" for i in range(tfidf_matrix_test.shape[1])]
    tfidf_df_test = pd.DataFrame(tfidf_matrix_test.toarray(), columns=test_columns)
    tfidf_df_test.to_parquet(tfidf_file_test, index=False)

    # Keep the fitted vectorizer next to the features for later reuse.
    joblib.dump(vectorizer, INTERIM_DATA_DIR / "tfidf_vectorizer.joblib")
    print("TF-IDF features saved.")

## 4. Model Preparation

In [None]:
def _read_interim(filename):
    """Read one cached feature table from INTERIM_DATA_DIR."""
    return pd.read_parquet(INTERIM_DATA_DIR / filename)


# Sentence Transformer embeddings
df_sentence_transformer_train = _read_interim("sentence_embeddings_train.parquet")
df_sentence_transformer_test = _read_interim("sentence_embeddings_test.parquet")

# TF-IDF features
df_tf_idf_train = _read_interim("tfidf_train.parquet")
df_tf_idf_test = _read_interim("tfidf_test.parquet")

In [None]:
# Keep only the base (non-text-vector) features: the raw text and the raw tag
# columns are removed from both splits.
tag_cols = [f'tag_{i}' for i in range(1, 9)]
cols_to_drop = ["subject", "body", "full_text", *tag_cols]
df_train, df_test = (
    split_frame.drop(columns=cols_to_drop) for split_frame in (df_train, df_test)
)

In [None]:
# Shapes of the train and test splits after dropping the raw columns
for split_frame in (df_train, df_test):
    print(split_frame.shape)

In [None]:
# Encode the target as integers. The encoder learns its classes from the
# training split only and is then applied unchanged to both splits.
target_col = 'priority'
le = LabelEncoder().fit(df_train[target_col])

df_train[target_col] = le.transform(df_train[target_col])
df_test[target_col] = le.transform(df_test[target_col])

print(f"Target classes: {le.classes_}")

# Persist the fitted encoder for future use
joblib.dump(le, INTERIM_DATA_DIR / "label_encoder.joblib")

In [None]:
def _with_text_features(base_frame, text_feature_frame):
    """Place the text-vector columns to the right of the base features."""
    return pd.concat([base_frame, text_feature_frame], axis=1)


# One train/test pair per text representation
df_st_train = _with_text_features(df_train, df_sentence_transformer_train)
df_st_test = _with_text_features(df_test, df_sentence_transformer_test)
df_tfidf_train = _with_text_features(df_train, df_tf_idf_train)
df_tfidf_test = _with_text_features(df_test, df_tf_idf_test)

In [None]:
def _split_features_target(frame):
    """Return (X, y): every column except the target, and the target column."""
    return frame.drop(columns=[target_col]), frame[target_col]


# --- TF-IDF Dataset ---
X_train_tfidf, y_train_tfidf = _split_features_target(df_tfidf_train)
X_test_tfidf, y_test_tfidf = _split_features_target(df_tfidf_test)

# --- Sentence Transformer Dataset ---
X_train_st, y_train_st = _split_features_target(df_st_train)
X_test_st, y_test_st = _split_features_target(df_st_test)

# Feature-set name -> (X_train, X_test, y_train, y_test), for easy iteration
datasets = {
    "TF-IDF": (X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf),
    "SentenceTransformer": (X_train_st, X_test_st, y_train_st, y_test_st)
}

for position, (feature_set, (X_tr, X_te, y_tr, y_te)) in enumerate(datasets.items()):
    # Every header after the first is preceded by a blank line.
    leading_newline = "" if position == 0 else "\n"
    print(f"{leading_newline}Shapes for {feature_set} dataset:")
    print(f"X_train: {X_tr.shape}, y_train: {y_tr.shape}")
    print(f"X_test: {X_te.shape}, y_test: {y_te.shape}")

## 5. Model Training and Evaluation

In [None]:
# Candidate classifiers, keyed by the short name used in the result printouts.
models = {
    "logistic_l2": LogisticRegression(
        solver='lbfgs',
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        C=0.1,
        penalty='l2',
        # `multi_class` is deliberately not passed: it is deprecated in
        # scikit-learn, and lbfgs already fits a multinomial model for
        # multiclass targets.
    ),
    "random_forest": RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        class_weight='balanced_subsample',
        random_state=42,
        n_jobs=-1,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True
    ),
    "xgboost": XGBClassifier(
        tree_method="hist",
        # Follow the device detected at the top of the notebook instead of
        # assuming a GPU is present.
        device=device.type,
        objective='multi:softprob',
        num_class=3,
        n_estimators=200,
        learning_rate=0.05,
        max_depth=6,
        eval_metric='mlogloss',
        random_state=42,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        min_child_weight=1
    ),
    "lightgbm": LGBMClassifier(
        random_state=42,
        n_estimators=200,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,  # Maximum tree leaves for base learners
        subsample=0.8,  # Subsample ratio of the training instances
        colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
        reg_alpha=0.1,  # L1 regularization
        reg_lambda=0.1,  # L2 regularization
        min_child_samples=20,  # Minimum number of data needed in a child (leaf)
        class_weight='balanced',
        n_jobs=-1
    ),
    "catboost": CatBoostClassifier(
        random_state=42,
        iterations=200,  # Number of trees
        learning_rate=0.05,
        depth=6,  # Depth of the trees
        l2_leaf_reg=3,  # L2 regularization term
        border_count=128,  # Number of splits for numerical features
        loss_function='MultiClass',  # For multiclass classification
        verbose=0,  # To suppress output
        auto_class_weights='Balanced'  # Handles class imbalance
    )
}

In [None]:
# Fit every candidate model on every feature set and report test metrics.
for d_name, (X_train, X_test, y_train, y_test) in datasets.items():
    print("="*50)
    print(f"RESULTS FOR FEATURE SET: {d_name}")
    print("="*50)

    categorical_features = ['type', 'queue', 'language', 'queue_type_interaction']
    numerical_features = [col for col in X_train.columns if col not in categorical_features]

    # One-hot encode the categorical columns and standardize everything else.
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
            ("num", StandardScaler(), numerical_features)
        ],
        remainder="passthrough"
    )

    # Balanced per-sample weights depend only on y_train, so compute them once
    # per feature set rather than once per model.
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    for model_name, model in models.items():
        clf = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("model", model)
        ])

        # Only XGBoost needs explicit sample weights. LightGBM
        # (class_weight='balanced') and CatBoost (auto_class_weights='Balanced')
        # already balance classes internally, and those weights are multiplied
        # with any sample_weight, which would apply the correction twice.
        if model_name == "xgboost":
            clf.fit(X_train, y_train, model__sample_weight=sample_weights)
        else:
            clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        print(f"\n--- Model: {model_name} ---")
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        print("Classification Report:")
        print(classification_report(y_test, y_pred, target_names=le.classes_))

        # Additional metric: one-vs-rest ROC AUC from predicted probabilities
        if hasattr(clf, 'predict_proba'):
            try:
                y_proba = clf.predict_proba(X_test)
                roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
            except ValueError as exc:
                # Report why the metric is missing instead of hiding it.
                print(f"ROC AUC unavailable: {exc}")
            else:
                print(f"ROC AUC: {roc_auc:.4f}")

## 6. Hyperparameter Tuning (XGBoost)

Based on the results above, we select the best combination (e.g., SentenceTransformer features with XGBoost) and perform a more rigorous hyperparameter search.

In [None]:
X_train_tune, X_test_tune, y_train_tune, y_test_tune = datasets['SentenceTransformer']

# Build the preprocessor from the tuning data itself rather than reusing the
# variable left over from the last iteration of the model-comparison loop.
xgb_categorical_features = ['type', 'queue', 'language', 'queue_type_interaction']
xgb_numerical_features = [
    col for col in X_train_tune.columns if col not in xgb_categorical_features
]
xgb_preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), xgb_categorical_features),
        ("num", StandardScaler(), xgb_numerical_features)
    ],
    remainder="passthrough"
)

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', xgb_preprocessor),
    ('model', XGBClassifier(
        tree_method="hist",
        # Follow the device detected at the top of the notebook instead of
        # assuming a GPU is present.
        device=device.type,
        objective='multi:softprob',
        num_class=3,
        eval_metric='mlogloss',
        random_state=42
    ))
])

# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'model__max_depth': randint(4, 8),
    'model__learning_rate': uniform(0.01, 0.2),
    'model__n_estimators': randint(100, 400),
    'model__subsample': uniform(0.5, 0.5),
    'model__colsample_bytree': uniform(0.5, 0.5)
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    n_iter=15,
    scoring='f1_weighted',
    cv=cv,
    n_jobs=1,
    verbose=2,
    random_state=42
)

print("Starting RandomizedSearchCV for XGBoost with SentenceTransformer features...")
# XGBClassifier has no class_weight option, so balance classes per sample.
sample_weights_tune = compute_sample_weight(class_weight='balanced', y=y_train_tune)
random_search.fit(X_train_tune, y_train_tune, model__sample_weight=sample_weights_tune)

In [None]:
# Summary of the XGBoost search: winning parameters and their mean CV score
best_xgb_params = random_search.best_params_
best_xgb_cv_score = random_search.best_score_
print("Best parameters found: ", best_xgb_params)
print("Best cross-validation F1-weighted score: ", best_xgb_cv_score)

### 6.1. XGBoost Final Model Evaluation

In [None]:
# Persist the tuned XGBoost pipeline, then evaluate it on the held-out split.
best_model = random_search.best_estimator_
joblib.dump(best_model, 'best_xgboost_model.joblib')

y_pred_final = best_model.predict(X_test_tune)
print(f'Accuracy score: {accuracy_score(y_test_tune, y_pred_final):.4f}')
print("Final Classification Report for the Tuned XGBoost Model:")
print(classification_report(y_test_tune, y_pred_final, target_names=le.classes_))

In [None]:
# Confusion matrix of the most recent final predictions (rows: actual, columns: predicted)
cm = confusion_matrix(y_test_tune, y_pred_final)
priority_labels = le.classes_

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=priority_labels,
    yticklabels=priority_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix for Best Tuned Model')
ax.set_ylabel('Actual Priority')
ax.set_xlabel('Predicted Priority')
plt.show()

## 6. Hyperparameter Tuning (Random Forest)


In [None]:
X_train_tune, X_test_tune, y_train_tune, y_test_tune = datasets['SentenceTransformer']

# Build the preprocessor from the tuning data itself rather than reusing the
# variable left over from the last iteration of the model-comparison loop.
rf_categorical_features = ['type', 'queue', 'language', 'queue_type_interaction']
rf_numerical_features = [
    col for col in X_train_tune.columns if col not in rf_categorical_features
]
rf_preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), rf_categorical_features),
        ("num", StandardScaler(), rf_numerical_features)
    ],
    remainder="passthrough"
)

rf_pipeline = Pipeline(steps=[
    ('preprocessor', rf_preprocessor),
    ('model', RandomForestClassifier(
        random_state=42,
        class_weight='balanced',
        # The search itself runs with n_jobs=1, so let each forest use all
        # cores, as the baseline random forest does.
        n_jobs=-1
    ))
])

param_dist = {
    'model__n_estimators': randint(100, 400),
    'model__max_depth': [10, 20, 30, None],
    'model__min_samples_split': randint(2, 11),
    'model__min_samples_leaf': randint(1, 5),
    'model__max_features': ['sqrt', 'log2']
}

# Set up Stratified K-Fold cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist,
    n_iter=15,
    scoring='f1_weighted',
    cv=cv,
    n_jobs=1,
    verbose=2,
    random_state=42
)

print("Starting RandomizedSearchCV for RandomForest with SentenceTransformer features...")

random_search.fit(X_train_tune, y_train_tune)

print("\n RandomizedSearchCV for RandomForest finished.")

In [None]:
# Summary of the Random Forest search: winning parameters and their mean CV score
best_rf_params = random_search.best_params_
best_rf_cv_score = random_search.best_score_
print(f"Best parameters found: {best_rf_params}")
print(f"Best weighted F1-score from CV: {best_rf_cv_score:.4f}")

In [None]:
# Keep a dedicated reference to the tuned Random Forest pipeline; the name
# `random_search` was also used for the earlier XGBoost search.
best_rf_model = random_search.best_estimator_

In [None]:
# Persist the tuned pipeline (preprocessor + model); the path is relative,
# so the file is written to the current working directory.
joblib.dump(best_rf_model, 'best_rf_model.joblib')

## 7. Random Forest Final Model Evaluation

In [None]:
# Evaluate the tuned Random Forest pipeline on the held-out test split.
y_pred_final = best_rf_model.predict(X_test_tune)
print(f"\nWeighted F1-score on the test set: {f1_score(y_test_tune, y_pred_final, average='weighted'):.4f}")
print(f'Accuracy score: {accuracy_score(y_test_tune, y_pred_final):.4f}')

print("Final Classification Report for the Tuned Random Forest Model:")
print(classification_report(y_test_tune, y_pred_final, target_names=le.classes_))

In [None]:
# Confusion matrix of the most recent final predictions (rows: actual, columns: predicted)
cm = confusion_matrix(y_test_tune, y_pred_final)
priority_labels = le.classes_

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=priority_labels,
    yticklabels=priority_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix for Best Tuned Model')
ax.set_ylabel('Actual Priority')
ax.set_xlabel('Predicted Priority')
plt.show()

## Threshold Tuning

In [None]:
# --- Threshold Tuning for 'low' class recall ---
# A row is predicted 'low' whenever its 'low' probability reaches the
# threshold; otherwise the most probable of the remaining classes is used.
#
# NOTE(review): the threshold is selected on the same test split that is used
# to report the score, so the reported improvement is optimistically biased.
# Consider selecting it on a validation split or out-of-fold predictions.
print("\n--- Starting Threshold Tuning for 'low' class ---")

y_proba = best_rf_model.predict_proba(X_test_tune)

# Probability columns are indexed by encoded label, i.e. by position in
# le.classes_ (expected: ['high', 'low', 'medium']).
class_names = le.classes_
known_classes = list(class_names)
if any(name not in known_classes for name in ('low', 'high', 'medium')):
    print("Error: Could not find 'low', 'high', or 'medium' class in label encoder.")
    raise IndexError("label encoder is missing one of: 'low', 'high', 'medium'")
low_class_index = known_classes.index('low')

thresholds = np.arange(0.10, 0.51, 0.01)

# Baseline: the untuned predictions. A threshold is only adopted if it
# strictly beats this score.
original_weighted_f1 = f1_score(y_test_tune, y_pred_final, average='weighted')
print(f"Original Weighted F1-Score: {original_weighted_f1:.4f}")

best_threshold = None
best_weighted_f1 = original_weighted_f1
y_pred_best_thresh = y_pred_final

# Loop-invariant pieces: the 'low' probability of each row, and the best
# non-'low' class of each row (argmax with the 'low' column zeroed out).
low_proba = y_proba[:, low_class_index]
non_low_proba = y_proba.copy()
non_low_proba[:, low_class_index] = 0
fallback_preds = np.argmax(non_low_proba, axis=1)

for thresh in thresholds:
    y_pred_thresh = np.where(low_proba >= thresh, low_class_index, fallback_preds)
    current_weighted_f1 = f1_score(y_test_tune, y_pred_thresh, average='weighted')

    if current_weighted_f1 > best_weighted_f1:
        best_weighted_f1 = current_weighted_f1
        best_threshold = thresh
        y_pred_best_thresh = y_pred_thresh

# Report results
if best_threshold is None:
    print("\nNo threshold improved on the original predictions; keeping them unchanged.")
else:
    print(f"\nBest Threshold for 'low' class: {best_threshold:.2f}")
    print(f"Best Weighted F1-Score achieved: {best_weighted_f1:.4f}")
    print(f"Improvement: {best_weighted_f1 - original_weighted_f1:.4f}")
    print("\nClassification Report for Best Threshold Tuned Model:")
    print(classification_report(y_test_tune, y_pred_best_thresh, target_names=class_names))