<a href="https://colab.research.google.com/github/SalmonSung/ba_thesis/blob/main/clean_db.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Load and unzip the original dataset

In [None]:
# Standard-library helpers for archive extraction and directory traversal,
# plus the Colab helper used to reach files stored on Google Drive.
import os
import zipfile

from google.colab import drive

# Expose Google Drive under /content/drive so the archive below is reachable.
drive.mount('/content/drive')

# Location of the zipped ticket dataset on Drive and the local directory
# the archive is unpacked into.
zip_path = '/content/drive/MyDrive/ba_thesis/IT_support_tickets.zip'
extract_dir = '/content/unzipped_contents'

# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(extract_dir, exist_ok=True)

# Unpack every member of the archive into extract_dir; the context manager
# closes the archive afterwards.
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_dir)

# Print each extracted file as a path relative to extract_dir.
for current_dir, _subdirs, filenames in os.walk(extract_dir):
    for filename in filenames:
        full_path = os.path.join(current_dir, filename)
        print(os.path.relpath(full_path, extract_dir))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
dataset-tickets-german_normalized.csv
dataset-tickets-multi-lang3-4k.csv
dataset-tickets-german_normalized_50_5_2.csv
dataset-tickets-multi-lang-4-20k.csv
aa_dataset-tickets-multi-lang-5-2-50-version.csv


# 2. Clean the dataset

In [None]:
import pandas as pd

# Load the multi-language ticket dataset from the extracted archive.
df = pd.read_csv('/content/unzipped_contents/aa_dataset-tickets-multi-lang-5-2-50-version.csv')
# Tip: df['queue'].value_counts() lists the available queues and their sizes.

# The five queues kept for the rest of the notebook.
queues = ["Technical Support", "Product Support", "Customer Service", "IT Support", "Billing and Payments"]

# Row masks: tickets written in English, and tickets routed to a kept queue.
is_english = df["language"] == "en"
in_selected_queue = df["queue"].isin(queues)

# Keep only rows satisfying both conditions.
filtered_df = df[is_english & in_selected_queue]

# Show just the text columns and the label.
display(filtered_df[["subject", "body", "queue"]])

Unnamed: 0,subject,body,queue
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...",Technical Support
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",Billing and Payments
5,Feature Query,"Dear Customer Support,\n\nI hope this message ...",Technical Support
7,Connectivity Problems with Printer on MacBook Pro,"Dear Support Team,\n\nI am reporting a recurri...",Technical Support
10,VPN Access Issue,"Customer Support,\n\nWe are encountering a dis...",Product Support
...,...,...,...
28578,Problem with Billing Adjustment,An unexpected billing discrepancy has been not...,Billing and Payments
28580,Urgent: Incident Involving Data Breach in Medi...,"A data breach has occurred, which might be rel...",Product Support
28582,Performance Problem with Data Analytics Tool,The data analytics tool experiences sluggish p...,Technical Support
28585,Update Request for SaaS Platform Integration F...,Requesting an update on the integration featur...,IT Support


# 3. Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Restrict the filtered tickets to the text columns and the label column.
cols = ["subject", "body", "queue"]
df_subset = filtered_df[cols]

# 75/25 split, stratified on the queue label so that both parts keep the
# same class proportions; the fixed seed makes the split repeatable.
split_options = dict(
    test_size=0.25,
    stratify=df_subset['queue'],
    random_state=42,
    shuffle=True,
)
train_df, test_df = train_test_split(df_subset, **split_options)

# Relative frequency of each queue in the two parts.
train_queue_share = train_df['queue'].value_counts(normalize=True)
test_queue_share = test_df['queue'].value_counts(normalize=True)

# Report sizes and class balance.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Queue distribution in train set:")
print(train_queue_share)
print("\nQueue distribution in test set:")
print(test_queue_share)

# Persist both parts as CSV files in the working directory, without the index.
for split_frame, out_name in ((train_df, 'train_set.csv'), (test_df, 'test_set.csv')):
    split_frame.to_csv(out_name, index=False)

Train shape: (10317, 3)
Test shape: (3440, 3)
Queue distribution in train set:
queue
Technical Support       0.344383
Product Support         0.223418
Customer Service        0.175148
IT Support              0.141126
Billing and Payments    0.115925
Name: proportion, dtype: float64

Queue distribution in test set:
queue
Technical Support       0.344186
Product Support         0.223256
Customer Service        0.175291
IT Support              0.141279
Billing and Payments    0.115988
Name: proportion, dtype: float64
