In [None]:
# Import libraries
import numpy as np
import pandas as pd


In [None]:
# Read the raw support-ticket export, list its column names, then preview the first rows
df = pd.read_csv("customer_support_tickets.csv", encoding='utf-8')
column_names = list(df.columns)
print("Columns:", column_names)
df.head()


In [None]:
# Rename and validate columns
# Maps each standardized column name (key) to the name it carries in the raw CSV (value).
required_cols_mapping = {
    "Ticket Description": "Ticket Description",
    "Ticket Type": "Ticket Type",
    "Ticket Priority": "Ticket Priority",
    "gender": "Customer Gender"
}

# Check if required columns exist. A column counts as present under either its
# raw name or its standardized name, so re-running this cell after the rename
# below no longer raises a spurious "Missing columns" error.
missing = [
    raw_name
    for std_name, raw_name in required_cols_mapping.items()
    if raw_name not in df.columns and std_name not in df.columns
]
if missing:
    raise ValueError(f"Missing columns: {missing}")

# Standardize column names (raw name -> standardized name). rename() silently
# skips raw names that are not present, which makes a second run a no-op.
df = df.rename(
    columns={raw_name: std_name for std_name, raw_name in required_cols_mapping.items()}
)
print("✅ Renamed columns:", df.columns.tolist())


In [None]:
import re
import string

# Custom text cleaning function
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a ticket description for downstream text processing.

    Steps, in order: lowercase, strip URLs, delete ASCII punctuation, collapse
    whitespace runs into single spaces, and trim both ends.

    Args:
        text: The raw description. Missing scalars (``None``, ``NaN``,
            ``pd.NA``) yield ``""``; any other non-string value is converted
            with ``str()`` first.

    Returns:
        str: The cleaned text, possibly empty.
    """
    if pd.isna(text):
        return ""
    # Coerce first so numeric or other non-string cells do not fail on .lower().
    text = str(text).lower()
    # "http\S+" already covers https links; no anchors are used, so no regex flags are needed.
    text = re.sub(r"http\S+|www\S+", '', text)  # remove URLs
    text = text.translate(_PUNCTUATION_TABLE)  # remove punctuation
    # Collapse whitespace only after punctuation is gone: deleting a standalone
    # symbol such as " - " would otherwise leave a double space behind.
    text = re.sub(r"\s+", ' ', text)  # remove extra spaces
    return text.strip()

# Apply cleaning to ticket descriptions
cleaned_descriptions = df["Ticket Description"].apply(clean_text)

# clean_text() returns "" for missing descriptions, and dropna() does not treat
# "" as missing. Turn blank results back into NaN so those rows are actually
# removed by the dropna() below.
df["Ticket Description"] = cleaned_descriptions.mask(cleaned_descriptions == "")

# Drop rows with empty values in key columns
df = df.dropna(subset=["Ticket Description", "Ticket Type", "Ticket Priority"])
print("✅ Cleaned ticket descriptions and dropped missing rows")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Display structure
print("\n📊 Dataset Info:")
# info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\n📈 Statistical Summary:")
print(df.describe(include='all'))

# Check missing values
print("\n🧼 Missing Values:")
print(df.isnull().sum())

# Value counts for categorical features
print("\n📌 Ticket Type Distribution:")
print(df["Ticket Type"].value_counts())

print("\n📌 Ticket Priority Distribution:")
print(df["Ticket Priority"].value_counts())

# Count plots for the two categorical targets, drawn from one shared recipe.
# Each entry: (column, seaborn palette, figure size, title, x-tick rotation or None)
count_plot_specs = [
    ("Ticket Type", "viridis", (8, 4), "Distribution of Ticket Types", 45),
    ("Ticket Priority", "plasma", (6, 4), "Distribution of Ticket Priorities", None),
]

for column, palette_name, fig_size, plot_title, tick_rotation in count_plot_specs:
    plt.figure(figsize=fig_size)
    sns.countplot(data=df, x=column, palette=palette_name)
    plt.title(plot_title)
    # Only the ticket-type plot tilts its x labels.
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.tight_layout()
    plt.show()

# Ticket length analysis: whitespace-separated word count per description
def _count_words(description):
    """Return how many whitespace-separated tokens ``description`` contains."""
    return len(str(description).split())


df["Text Length"] = df["Ticket Description"].map(_count_words)

# Histogram (with KDE overlay) of the per-ticket word counts
plt.figure(figsize=(8, 4))
sns.histplot(df["Text Length"], bins=20, kde=True, color="teal")
plt.xlabel("Number of Words")
plt.ylabel("Count")
plt.title("Distribution of Ticket Description Length")
plt.tight_layout()
plt.show()

In [None]:
# BERT embeddings
# No earlier visible cell imports these classes, so on a fresh kernel the two
# from_pretrained() calls below would fail with NameError without this import.
from transformers import BertTokenizer, TFBertModel

# Tokenizer and TensorFlow model are loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def bert_embed(texts, batch_size=32):
    """Return one embedding vector per input text using the module-level BERT model.

    Texts are tokenized and passed through ``bert_model`` in chunks of
    ``batch_size`` rather than in a single forward pass, so memory use is
    bounded by the batch size instead of growing with the whole dataset.

    Args:
        texts: A pandas Series, NumPy array, or any other iterable of strings.
        batch_size: Number of texts per forward pass. Must be at least 1.

    Returns:
        numpy.ndarray: 2-D array with one row per input text, in input order.
        Each row is the final hidden state at the first token position.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Series/ndarray expose tolist(); fall back to list() for plain iterables.
    text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    if not text_list:
        # Nothing to embed: return an empty matrix with the model's hidden width.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    batch_embeddings = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="tf", padding=True, truncation=True)
        outputs = bert_model(**inputs)
        # Keep only the first token position of each sequence
        # (the [CLS] token for BERT tokenizers).
        batch_embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())
    return np.concatenate(batch_embeddings, axis=0)

# Embed the cleaned ticket text. The visible cleaning cell writes its result
# back into "Ticket Description" and never creates "cleaned_text", so that
# column is used only when some other step has added it.
text_column = "cleaned_text" if "cleaned_text" in df.columns else "Ticket Description"
X = bert_embed(df[text_column])

# Label encoding
from sklearn.preprocessing import LabelEncoder

# One encoder per target so each keeps its own class-to-integer mapping
le_type, le_priority = LabelEncoder(), LabelEncoder()

y_type = le_type.fit_transform(df["Ticket Type"])
y_priority = le_priority.fit_transform(df["Ticket Priority"])

# Stack the two encoded targets as rows, then transpose so each ticket is one
# row with columns (type, priority)
label_rows = np.vstack((y_type, y_priority))
y = label_rows.T