####Loading the Data from Kaggle

In [1]:
import pandas as pd
import pickle
import numpy as np
import tensorflow as tf

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("tobiasbueck/multilingual-customer-support-tickets")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tobiasbueck/multilingual-customer-support-tickets?dataset_version_number=14...


100%|██████████| 16.1M/16.1M [00:00<00:00, 16.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/tobiasbueck/multilingual-customer-support-tickets/versions/14


In [3]:
import os

path_to_dataset = "/root/.cache/kagglehub/datasets/tobiasbueck/multilingual-customer-support-tickets/versions/14"

os.listdir(path_to_dataset)

['dataset-tickets-german_normalized.csv',
 'dataset-tickets-german_normalized_50_5_2.csv',
 'aa_dataset-tickets-multi-lang-5-2-50-version.csv',
 'dataset-tickets-multi-lang-4-20k.csv',
 'dataset-tickets-multi-lang3-4k.csv']

In [4]:
BASE_DIR = "/root/.cache/kagglehub/datasets/tobiasbueck/multilingual-customer-support-tickets/versions/14/dataset-tickets-multi-lang-4-20k.csv"

df = pd.read_csv(BASE_DIR)

print(df.head(20))

                                              subject  \
0   Unvorhergesehener Absturz der Datenanalyse-Pla...   
1                            Customer Support Inquiry   
2                       Data Analytics for Investment   
3                  Krankenhaus-Dienstleistung-Problem   
4                                            Security   
5   Concerns About Securing Medical Data on 2-in-1...   
6   Ratung für Sicherung medizinischer Daten in Hu...   
7                            Problem with Integration   
8                                  Assistance Request   
9                                     Support Request   
10  Issue with Data Analytics Platform - Insuffici...   
11  Reported Issue with Project Sync Resulting in ...   
12                     Strategies for Brand Expansion   
13             Issue with Website Analytics Dashboard   
14  Urgent: Data Breach Identified in Hospital Net...   
15       Request for Assistance with Data Integration   
16  Request for Updating Integr

####Creating the data format for the tokenization

In [5]:
df["subject"] = df["subject"].fillna("")
df["body"] = df["body"].fillna("")

df["Final_Text"] = df["subject"] + " " + df["body"]

In [6]:
df = df.dropna(subset=["queue"])

df = df[df["Final_Text"].str.strip() != ""]

In [7]:
print("Subject: \n", df["subject"], "\n")
print("Body \n", df["body"], "\n")
print("Final Text \n", df["Final_Text"], "\n")

Subject: 
 0        Unvorhergesehener Absturz der Datenanalyse-Pla...
1                                 Customer Support Inquiry
2                            Data Analytics for Investment
3                       Krankenhaus-Dienstleistung-Problem
4                                                 Security
                               ...                        
19995       Assistance Needed for IFTTT Docker Integration
19996          Bitten um Unterstützung bei der Integration
19997                                                     
19998              Hilfe bei digitalen Strategie-Problemen
19999    Optimierung Ihrer Datenanalyse-Plattform erlei...
Name: subject, Length: 20000, dtype: object 

Body 
 0        Die Datenanalyse-Plattform brach unerwartet ab...
1        Seeking information on digital strategies that...
2        I am contacting you to request information on ...
3        Ein Medien-Daten-Sperrverhalten trat aufgrund ...
4        Dear Customer Support, I am reaching out t

####Tokenizer

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 15000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df["Final_Text"])

####Padding

In [9]:
X = pad_sequences(tokenizer.texts_to_sequences(df["Final_Text"]), maxlen=max_len)

In [10]:
X

array([[    0,     0,     0, ...,   223,    11,   346],
       [    0,     0,     0, ...,    87,     5,   127],
       [    0,     0,     0, ...,     1,  1860,  2057],
       ...,
       [    0,     0,     0, ...,     6,    12,    35],
       [    0,     0,     0, ...,    16,    11,   252],
       [    0,     0,     0, ...,    22,  1020, 11081]], dtype=int32)

####Lable encoding

In [11]:
dummy_ys = pd.get_dummies(df["queue"])
queue_names = dummy_ys.columns.tolist()
y = dummy_ys.values

print(y, "\n")
print(dummy_ys.head())

[[False False  True ... False False False]
 [False  True False ... False False False]
 [False  True False ... False False False]
 ...
 [ True False False ... False False False]
 [False False False ... False False False]
 [False False False ...  True False False]] 

   Billing and Payments  Customer Service  General Inquiry  Human Resources  \
0                 False             False             True            False   
1                 False              True            False            False   
2                 False              True            False            False   
3                 False              True            False            False   
4                 False              True            False            False   

   IT Support  Product Support  Returns and Exchanges  Sales and Pre-Sales  \
0       False            False                  False                False   
1       False            False                  False                False   
2       False            

In [12]:
#Train test splitting
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
print("X_train shape: ", X_train.shape)
print("X_val shape: ", X_val.shape)
print("y_train shape: ", y_train.shape)
print("y_val shape: ", y_val.shape)

X_train shape:  (16000, 200)
X_val shape:  (4000, 200)
y_train shape:  (16000, 10)
y_val shape:  (4000, 10)


In [14]:
print("X_train: \n", X_train, "\n")
print("y_train: \n", y_train, "\n")

X_train: 
 [[  0   0   0 ... 264 359 127]
 [  0   0   0 ... 181  11 856]
 [  0   0   0 ...  33  68 304]
 ...
 [  0   0   0 ...  18  14 376]
 [  0   0   0 ...  18  15 101]
 [  0   0   0 ... 540  39 256]] 

y_train: 
 [[False False False ... False  True False]
 [False False False ... False False  True]
 [ True False False ... False False False]
 ...
 [ True False False ... False False False]
 [False False False ... False False False]
 [ True False False ... False False False]] 



In [16]:
print(len(queue_names), "have been identified", "\n")
print("The queue names are: ", queue_names)

10 have been identified 

The queue names are:  ['Billing and Payments', 'Customer Service', 'General Inquiry', 'Human Resources', 'IT Support', 'Product Support', 'Returns and Exchanges', 'Sales and Pre-Sales', 'Service Outages and Maintenance', 'Technical Support']


####Model

In [18]:
model = tf.keras.Sequential([

    tf.keras.layers.Input(shape=(max_len,)),

    #embedding layers
    tf.keras.layers.Embedding(max_words, 128, input_length=max_len),
    tf.keras.layers.SpatialDropout1D(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),

    #Dense layers
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(queue_names), activation="softmax")
])



In [19]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [20]:
model.summary()