In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, f1_score
import mlflow
import mlflow.tensorflow
import re

In [2]:
DATA_PATH = "../data/raw/tickets.csv"
df = pd.read_csv(DATA_PATH)

df.shape

(28587, 16)

In [3]:
# selecting only english
df = df[df["language"] == "en"].reset_index(drop=True)
df.shape

(16338, 16)

In [4]:
df = df[["subject", "body", "type"]]
df.isnull().sum()

subject    2607
body          0
type          0
dtype: int64

In [6]:
df[["subject","body","type"]].head()

Unnamed: 0,subject,body,type
0,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...",Incident
1,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Request
2,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",Request
3,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Problem
4,Feature Query,"Dear Customer Support,\n\nI hope this message ...",Request


In [8]:
def clean_text(text):
    text = str(text).lower()
    text = text.replace("\\n"," ")
    text = re.sub(r"<.?>"," ",text)
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [9]:
df['text'] = (
    df["subject"].fillna("") + " " + df["body"].fillna("")
)

df["text"] = df["text"].apply(clean_text)
df.head()

Unnamed: 0,subject,body,type,text
0,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...",Incident,account disruption dear customer support team ...
1,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Request,query about smart home system integration feat...
2,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",Request,inquiry regarding invoice details dear custome...
3,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Problem,question about marketing agency software compa...
4,Feature Query,"Dear Customer Support,\n\nI hope this message ...",Request,feature query dear customer support i hope thi...


In [10]:
df_clean = df.drop(columns=['subject','body']).copy()

In [11]:
df_clean.head()

Unnamed: 0,type,text
0,Incident,account disruption dear customer support team ...
1,Request,query about smart home system integration feat...
2,Request,inquiry regarding invoice details dear custome...
3,Problem,question about marketing agency software compa...
4,Request,feature query dear customer support i hope thi...


In [13]:
label_encoder = LabelEncoder()
df_clean["label"] = label_encoder.fit_transform(df_clean["type"])

num_classes = df_clean["label"].nunique()
num_classes

4

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    df_clean["text"],
    df_clean["label"],
    test_size=0.2,
    random_state=42,
    stratify=df_clean["label"]
)

In [20]:
MAX_WORDS = 20000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")

In [22]:
EMBEDDING_DIM = 128

model = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM),
    LSTM(128),
    Dropout(0.3),
    Dense(num_classes, activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.build(input_shape=(None, MAX_LEN))
model.summary()

In [23]:
mlflow.set_experiment("SmartSupport-LSTM-Research")

with mlflow.start_run(run_name="lstm_english_baseline"):

    mlflow.log_param("model_type", "LSTM")
    mlflow.log_param("language", "en")
    mlflow.log_param("max_words", MAX_WORDS)
    mlflow.log_param("max_len", MAX_LEN)
    mlflow.log_param("embedding_dim", EMBEDDING_DIM)
    mlflow.log_param("epochs", 5)
    mlflow.log_param("batch_size", 64)

    history = model.fit(
        X_train_pad,
        y_train,
        validation_split=0.1,
        epochs=5,
        batch_size=64,
        verbose=1
    )

    y_pred = np.argmax(model.predict(X_test_pad), axis=1)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("macro_f1", f1)

    mlflow.tensorflow.log_model(model, artifact_path="lstm_model")

2025/12/23 11:10:00 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/23 11:10:00 INFO mlflow.store.db.utils: Updating database tables
2025/12/23 11:10:00 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/23 11:10:00 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/23 11:10:00 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/23 11:10:00 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/23 11:10:00 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/23 11:10:00 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/23 11:10:00 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/23 11:10:00 INFO alembic.runtime.migration: Running 

Epoch 1/5
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 106ms/step - accuracy: 0.5529 - loss: 1.0526 - val_accuracy: 0.6526 - val_loss: 0.8764
Epoch 2/5
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 103ms/step - accuracy: 0.4476 - loss: 1.1778 - val_accuracy: 0.4208 - val_loss: 1.2656
Epoch 3/5
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 106ms/step - accuracy: 0.3927 - loss: 1.2719 - val_accuracy: 0.4239 - val_loss: 1.2675
Epoch 4/5
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 104ms/step - accuracy: 0.4160 - loss: 1.2582 - val_accuracy: 0.6886 - val_loss: 0.9946
Epoch 5/5
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 105ms/step - accuracy: 0.4729 - loss: 1.1831 - val_accuracy: 0.6526 - val_loss: 0.8326
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step


