In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score
import joblib
import re

In [2]:
df = pd.read_csv("../tickets_200.csv")
df.head()

Unnamed: 0,ticket,category
0,User cannot login to Active Directory,Active Directory Issue
1,Password reset required for user,Active Directory Issue
2,Account locked out from AD,Active Directory Issue
3,AD group membership not updating,Active Directory Issue
4,User unable to access shared drive,Active Directory Issue


In [3]:
def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean_ticket"] = df["ticket"].astype(str).apply(clean_text)
df.head()

Unnamed: 0,ticket,category,clean_ticket
0,User cannot login to Active Directory,Active Directory Issue,user cannot login to active directory
1,Password reset required for user,Active Directory Issue,password reset required for user
2,Account locked out from AD,Active Directory Issue,account locked out from ad
3,AD group membership not updating,Active Directory Issue,ad group membership not updating
4,User unable to access shared drive,Active Directory Issue,user unable to access shared drive


In [4]:
X = df["clean_ticket"]
y = df["category"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

len(X_train), len(X_test)

(160, 40)

In [5]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_vec.shape, X_test_vec.shape

((160, 243), (40, 243))

In [6]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=300, n_jobs=-1),
    "LinearSVC": LinearSVC(),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
}

results = []

for name, clf in models.items():
    clf.fit(X_train_vec, y_train)
    preds = clf.predict(X_test_vec)
    acc = accuracy_score(y_test, preds)

    print("=" * 60)
    print(name)
    print("Accuracy:", acc)
    print(classification_report(y_test, preds))

    results.append((name, acc, clf))

results

LogisticRegression
Accuracy: 1.0
                        precision    recall  f1-score   support

Active Directory Issue       1.00      1.00      1.00         5
           Email Issue       1.00      1.00      1.00         4
        Firewall Issue       1.00      1.00      1.00         4
        Hardware Issue       1.00      1.00      1.00         5
             MDM Issue       1.00      1.00      1.00         4
         Network Issue       1.00      1.00      1.00         5
         Printer Issue       1.00      1.00      1.00         4
        Security Issue       1.00      1.00      1.00         4
          Server Issue       1.00      1.00      1.00         5

              accuracy                           1.00        40
             macro avg       1.00      1.00      1.00        40
          weighted avg       1.00      1.00      1.00        40

LinearSVC
Accuracy: 1.0
                        precision    recall  f1-score   support

Active Directory Issue       1.00      1.00

[('LogisticRegression', 1.0, LogisticRegression(max_iter=300, n_jobs=-1)),
 ('LinearSVC', 1.0, LinearSVC()),
 ('RandomForest',
  1.0,
  RandomForestClassifier(n_estimators=200, random_state=42))]

In [7]:
best_name, best_acc, best_model = sorted(results, key=lambda x: x[1], reverse=True)[0]
print("Best model:", best_name, "with accuracy:", best_acc)

Best model: LogisticRegression with accuracy: 1.0


In [8]:
# Refit vectorizer on all data
X_all = df["clean_ticket"]
y_all = df["category"]

vectorizer_full = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2
)

X_all_vec = vectorizer_full.fit_transform(X_all)

best_model.fit(X_all_vec, y_all)

# Save trained model to ROOT folder
joblib.dump(best_model, "../model.pkl")
joblib.dump(vectorizer_full, "../vectorizer.pkl")

print("Saved best model:", best_name)


Saved best model: LogisticRegression
