In [None]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier


In [None]:
# Read the support-ticket CSV (UTF-8), list its column headers, and preview
# the first rows as the cell output.
df = pd.read_csv("customer_support_tickets.csv", encoding="utf-8")
print("Columns:", list(df.columns))
df.head()


In [None]:
# Rename and validate columns
# Each entry reads  <name used in this notebook>: <header expected in the CSV>.
required_cols_mapping = {
    "Ticket Description": "Ticket Description",
    "Ticket Type": "Ticket Type",
    "Ticket Priority": "Ticket Priority",
    "gender": "Customer Gender",
}

# Fail fast when any expected CSV header is absent.
available_headers = set(df.columns)
missing = [
    csv_header
    for csv_header in required_cols_mapping.values()
    if csv_header not in available_headers
]
if missing:
    raise ValueError(f"Missing columns: {missing}")

# Invert the mapping (CSV header -> notebook name) and apply it.
csv_to_notebook_name = {
    csv_header: notebook_name
    for notebook_name, csv_header in required_cols_mapping.items()
}
df = df.rename(columns=csv_to_notebook_name)
print("✅ Renamed columns:", df.columns.tolist())


In [None]:
import re
import string

# Custom text cleaning function
def clean_text(text):
    """Normalize one raw ticket description.

    Steps, in order: lower-case, drop URL-like tokens (anything starting with
    ``http`` or ``www``), strip ASCII punctuation, collapse whitespace runs to
    a single space, trim both ends.

    Args:
        text: Raw cell value. Missing values (None / NaN / pd.NA) yield "".
            Other non-string scalars (e.g. numbers) are converted with str().

    Returns:
        str: The cleaned text, possibly empty.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # A numeric cell would otherwise crash on .lower().
        text = str(text)

    text = text.lower()
    # `http\S+` already covers https; no anchors are used, so no regex flags needed.
    text = re.sub(r"http\S+|www\S+", '', text)  # remove URLs
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    # Collapse whitespace last: removing a standalone punctuation mark
    # ("a , b") would otherwise leave a double space behind.
    text = re.sub(r"\s+", ' ', text)
    return text.strip()

# Apply cleaning to ticket descriptions
cleaned_descriptions = df["Ticket Description"].apply(clean_text)

# clean_text() returns "" for missing input, and dropna() does not treat ""
# as missing. Turn empty results back into NaN so that rows without any usable
# description are really removed below.
df["Ticket Description"] = cleaned_descriptions.mask(cleaned_descriptions == "")

# Drop rows with empty values in key columns
df = df.dropna(subset=["Ticket Description", "Ticket Type", "Ticket Priority"])
print("✅ Cleaned ticket descriptions and dropped missing rows")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Display structure
print("\n📊 Dataset Info:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

print("\n📈 Statistical Summary:")
print(df.describe(include='all'))

# Check missing values
print("\n🧼 Missing Values:")
print(df.isnull().sum())

# Value counts for categorical features
print("\n📌 Ticket Type Distribution:")
print(df["Ticket Type"].value_counts())

print("\n📌 Ticket Priority Distribution:")
print(df["Ticket Priority"].value_counts())

# Plot ticket type distribution
plt.figure(figsize=(8, 4))
sns.countplot(data=df, x="Ticket Type", palette="viridis")
plt.title("Distribution of Ticket Types")
plt.xticks(rotation=45)  # category names are long; tilt them so they do not overlap
plt.tight_layout()
plt.show()

# Plot ticket priority distribution
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x="Ticket Priority", palette="plasma")
plt.title("Distribution of Ticket Priorities")
plt.tight_layout()
plt.show()

# Ticket length analysis: number of whitespace-separated words per description.
# Note: this adds a "Text Length" column to df.
df["Text Length"] = df["Ticket Description"].apply(lambda x: len(str(x).split()))

plt.figure(figsize=(8, 4))
sns.histplot(df["Text Length"], bins=20, kde=True, color="teal")
plt.title("Distribution of Ticket Description Length")
plt.xlabel("Number of Words")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# BERT embeddings
# NOTE(review): BertTokenizer and TFBertModel are not imported in any visible
# cell. They are provided by the `transformers` package and must be imported
# before this cell can run on a fresh kernel.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def bert_embed(texts, batch_size=32):
    """Embed texts with BERT, returning the first-token ([CLS]) vector of each.

    The texts are pushed through the model in chunks of ``batch_size`` so that
    memory use is bounded by the batch size rather than by the corpus size.

    Args:
        texts: A pandas Series, list or other iterable of strings. A single
            string is treated as one text.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        numpy.ndarray with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if isinstance(texts, str):
        # Iterating a bare string would embed it character by character.
        texts = [texts]
    sentences = list(texts)

    if not sentences:
        # Nothing to embed: return an empty matrix with the model's vector width.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    chunks = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True)
        outputs = bert_model(**inputs)
        # Position 0 of the sequence axis is the first token of each text.
        chunks.append(outputs.last_hidden_state[:, 0, :].numpy())
    return np.concatenate(chunks, axis=0)

# The cleaning cell writes its result back into "Ticket Description"; no
# "cleaned_text" column is created anywhere in the visible notebook, so
# indexing df["cleaned_text"] raises KeyError.
X = bert_embed(df["Ticket Description"])

# Label encoding
from sklearn.preprocessing import LabelEncoder
le_type = LabelEncoder()
le_priority = LabelEncoder()

y_type = le_type.fit_transform(df["Ticket Type"])
y_priority = le_priority.fit_transform(df["Ticket Priority"])
# Two-column target matrix: column 0 = ticket type, column 1 = ticket priority.
y = np.vstack((y_type, y_priority)).T

In [None]:
from imblearn.over_sampling import SMOTE

# Split original
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate labels (column 0 = type, column 1 = priority)
y_type_train = y_train[:, 0]
y_priority_train = y_train[:, 1]

# Apply SMOTE only on 'type'
smote = SMOTE(random_state=42)
X_train_res, y_type_res = smote.fit_resample(X_train, y_type_train)

# SMOTE resamples (X, type) only, so every resampled row still needs a priority
# label. As an approximation, each row borrows the priority of its nearest
# original training row.
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=1).fit(X_train)
_, indices = nn.kneighbors(X_train_res)

priority_res = y_priority_train[indices.flatten()]

# The approximation is only needed for synthetic rows. When the resampled
# matrix starts with the untouched training rows (checked here rather than
# assumed), restore their true priorities: with duplicate embeddings the
# nearest-neighbour lookup can return a different row with another priority.
n_original = len(X_train)
if np.array_equal(X_train_res[:n_original], X_train):
    priority_res[:n_original] = y_priority_train

# Combine
y_train_res = np.vstack((y_type_res, priority_res)).T

# Train model: one logistic regression per target column
model = MultiOutputClassifier(LogisticRegression(max_iter=500))
model.fit(X_train_res, y_train_res)


In [None]:
from imblearn.over_sampling import SMOTE

# This cell used to repeat the split / SMOTE / training cell directly above it
# line for line. Every step there uses a fixed random_state, so the copy only
# redid the same work and rebound the same names to equivalent objects.
# It is replaced by a cheap sanity check on what the previous cell produced.
if X_train_res.shape[0] != y_train_res.shape[0]:
    raise ValueError(
        f"Resampled features ({X_train_res.shape[0]} rows) and labels "
        f"({y_train_res.shape[0]} rows) are out of sync; re-run the training cell."
    )

print("Resampled training set:", X_train_res.shape, "| test set:", X_test.shape)

# Column 0 of the label matrix is the encoded ticket type that SMOTE balanced.
type_ids, type_counts = np.unique(y_train_res[:, 0], return_counts=True)
print("Ticket type counts after SMOTE:", dict(zip(type_ids.tolist(), type_counts.tolist())))


In [None]:
# Evaluate
y_pred = model.predict(X_test)

def evaluate_model(y_true, y_pred, target_names):
    """Compute weighted precision / recall / F1 for each target column.

    Args:
        y_true: 2-D array of true labels, one column per target.
        y_pred: 2-D array of predicted labels with the same column layout.
        target_names: One name per column, used as keys of the result.

    Returns:
        dict mapping each target name to
        ``{'precision': float, 'recall': float, 'f1': float}``, rounded to
        three decimals.
    """
    metrics = {}
    for i, name in enumerate(target_names):
        p, r, f, _ = precision_recall_fscore_support(
            y_true[:, i], y_pred[:, i], average='weighted', zero_division=0
        )
        # Cast to built-in float so the dict prints as plain numbers instead of
        # numpy scalar reprs (e.g. "np.float64(0.25)").
        metrics[name] = {
            'precision': round(float(p), 3),
            'recall': round(float(r), 3),
            'f1': round(float(f), 3),
        }
    return metrics

metrics = evaluate_model(y_test, y_pred, ["type", "priority"])
print(metrics)

# Confusion Matrices (titled, otherwise the two figures are indistinguishable)
cm_display_type = ConfusionMatrixDisplay.from_predictions(y_test[:, 0], y_pred[:, 0])  # Type
cm_display_type.ax_.set_title("Confusion Matrix - Ticket Type")
cm_display_priority = ConfusionMatrixDisplay.from_predictions(y_test[:, 1], y_pred[:, 1])  # Priority
cm_display_priority.ax_.set_title("Confusion Matrix - Ticket Priority")


In [None]:
# Quick Model Evaluation (Fix for Type and Priority separately)

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def _report_target(y_true, y_hat, encoder, heading, title, cmap):
    """Print a classification report and plot a confusion matrix for one target.

    Args:
        y_true: 1-D array of encoded true labels.
        y_hat: 1-D array of encoded predicted labels.
        encoder: Fitted LabelEncoder for this target; its ``classes_`` supply
            the readable names shown in the report and on the plot axes.
        heading: Line printed above the report.
        title: Title of the confusion-matrix figure.
        cmap: Matplotlib colormap for the figure.

    Returns:
        (confusion matrix array, ConfusionMatrixDisplay) for further use.
    """
    # Pass every encoded class explicitly so that a class missing from the test
    # split does not shift or break the name <-> row alignment.
    class_ids = np.arange(len(encoder.classes_))
    class_names = [str(c) for c in encoder.classes_]

    print(heading)
    print(classification_report(
        y_true, y_hat, labels=class_ids, target_names=class_names, zero_division=0
    ))

    cm = confusion_matrix(y_true, y_hat, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=cmap, xticks_rotation=45)
    plt.title(title)
    plt.tight_layout()
    plt.show()
    return cm, disp


try:
    # Ticket Type evaluation (column 0)
    cm_type, disp_type = _report_target(
        y_test[:, 0], y_pred[:, 0], le_type,
        "🔹 Classification Report for Ticket Type:",
        "Confusion Matrix - Ticket Type",
        plt.cm.Blues,
    )

    # Ticket Priority evaluation (column 1)
    cm_priority, disp_priority = _report_target(
        y_test[:, 1], y_pred[:, 1], le_priority,
        "\n🔸 Classification Report for Ticket Priority:",
        "Confusion Matrix - Ticket Priority",
        plt.cm.Oranges,
    )

except Exception as e:
    # Best-effort display cell: report the problem instead of stopping the run.
    print(f"⚠️ Evaluation Error: {str(e)}")
    print("Make sure y_test and y_pred are NumPy arrays with two columns (type and priority).")

In [None]:
import shap
import matplotlib.pyplot as plt
import numpy as np

# Explain only the first five test rows to keep the SHAP run small.
X_sample = X_test[:5]

def predict_type(X):
    """Return class probabilities from the first fitted estimator (ticket type)."""
    type_estimator = model.estimators_[0]
    return type_estimator.predict_proba(X)

# Index of the most probable ticket-type class for the first sampled row.
type_probabilities = predict_type(X_sample)
predicted_class_type = np.argmax(type_probabilities, axis=1)[0]

# Build the explainer with the sample itself as background data, then score it.
explainer_type = shap.Explainer(predict_type, X_sample)
shap_values_type = explainer_type(X_sample, max_evals=2000)

# Plot only the slice belonging to the predicted class and write it to disk.
shap_for_predicted_type = shap_values_type[:, :, predicted_class_type]
shap.plots.beeswarm(shap_for_predicted_type, show=False)
plt.title("SHAP - Ticket Type (Top Class)")
plt.savefig("shap_type_beeswarm.png", bbox_inches='tight')
plt.close()


In [None]:
# SHAP wrapper for priority prediction
def predict_priority(X):
    """Return class probabilities from the second fitted estimator (ticket priority)."""
    return model.estimators_[1].predict_proba(X)

# Create SHAP explainer (the sample doubles as background data)
explainer_priority = shap.Explainer(predict_priority, X_sample)

# Compute SHAP values
shap_values_priority = explainer_priority(X_sample, max_evals=2000)

# Explain the class the model predicts for the first sampled row, mirroring
# the ticket-type cell. The previous slice used y_test[0][1], i.e. the TRUE
# label of test row 0: that explains a class the model may not have chosen,
# and uses a label value as a probability-column index.
predicted_class_priority = np.argmax(predict_priority(X_sample), axis=1)[0]

# Plot and save
shap.plots.beeswarm(shap_values_priority[:, :, predicted_class_priority], show=False)
plt.title("SHAP - Ticket Priority")
plt.savefig("shap_priority_beeswarm.png", bbox_inches='tight')
plt.close()


In [None]:
# LIME Explanation
# NOTE(review): LimeTextExplainer is not imported in any visible cell. It is
# provided by the `lime` package (lime.lime_text) and must be imported before
# this cell can run on a fresh kernel.
explainer = LimeTextExplainer(class_names=list(le_type.classes_))

def _predict_type_proba(texts, batch_size=32):
    """Classifier function for LIME: texts -> ticket-type class probabilities.

    LIME needs an array of shape (n_texts, n_classes) holding probabilities.
    model.predict() would instead return hard labels for both targets, so the
    ticket-type estimator's predict_proba() is used. The perturbed texts are
    embedded in chunks so thousands of samples never hit BERT in one batch.
    """
    texts = list(texts)
    type_estimator = model.estimators_[0]
    chunks = [
        type_estimator.predict_proba(bert_embed(pd.Series(texts[start:start + batch_size])))
        for start in range(0, len(texts), batch_size)
    ]
    return np.vstack(chunks)

def explain_pred(text, num_samples=5000):
    """Show a LIME explanation of the ticket-type prediction for one text.

    Args:
        text: The (cleaned) ticket description to explain.
        num_samples: Number of perturbed texts LIME generates; each one costs
            a BERT forward pass, so lower it for a quicker, rougher result.

    Returns:
        Whatever ``show_in_notebook()`` returns (the explanation is rendered
        as a side effect).
    """
    exp = explainer.explain_instance(
        text_instance=text,
        classifier_fn=_predict_type_proba,
        num_features=10,
        # Explain the most probable class; LIME's default explains label 1
        # only, which is arbitrary for a multi-class target.
        top_labels=1,
        num_samples=num_samples,
    )
    return exp.show_in_notebook()

In [None]:
# Choose any ticket message to test LIME
# The cleaned text is stored in "Ticket Description"; no "cleaned_text" column
# is created anywhere in the visible notebook, so that lookup raises KeyError.
sample_text = df["Ticket Description"].iloc[25]  # You can change the index to test other examples
explain_pred(sample_text)
