In [1]:
import kagglehub

# Fetch the latest version of the multilingual support-ticket dataset
DATASET_HANDLE = "tobiasbueck/multilingual-customer-support-tickets"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tobiasbueck/multilingual-customer-support-tickets?dataset_version_number=12...


100%|██████████| 16.1M/16.1M [00:00<00:00, 122MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/tobiasbueck/multilingual-customer-support-tickets/versions/12


In [2]:
import pandas as pd

# Read the ticket CSV located inside the downloaded dataset directory
csv_path = f"{path}/aa_dataset-tickets-multi-lang-5-2-50-version.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,subject,body,answer,type,queue,priority,language,version,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8
0,Wesentlicher Sicherheitsvorfall,"Sehr geehrtes Support-Team,\n\nich möchte eine...",Vielen Dank für die Meldung des kritischen Sic...,Incident,Technical Support,high,de,51,Security,Outage,Disruption,Data Breach,,,,
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...","Thank you for reaching out, <name>. We are awa...",Incident,Technical Support,high,en,51,Account,Disruption,Outage,IT,Tech Support,,,
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Thank you for your inquiry. Our products suppo...,Request,Returns and Exchanges,medium,en,51,Product,Feature,Tech Support,,,,,
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",We appreciate you reaching out with your billi...,Request,Billing and Payments,low,en,51,Billing,Payment,Account,Documentation,Feedback,,,
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Thank you for your inquiry. Our product suppor...,Problem,Sales and Pre-Sales,medium,en,51,Product,Feature,Feedback,Tech Support,,,,


In [3]:
# Keep English tickets only, restricted to the columns used downstream, and
# drop rows without a subject. Building the frame in one chain that ends in
# .copy() yields an independent DataFrame, so the column assignment below
# does not operate on a filtered slice (which triggers SettingWithCopyWarning).
df = (
    df.loc[df['language'] == 'en', ['subject', 'body', 'priority']]
    .dropna(subset=['subject'])
    .copy()
)

# NOTE(review): 'full_text' mirrors 'body' only; 'subject' stays in the frame
# but is not part of the text that gets preprocessed - confirm this is intended.
df['full_text'] = df['body']

### Text Preprocessing

In [5]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd

# --- NLTK Data Downloads ---
# Each package is requested individually. nltk.download() signals a failed
# download through a False return value rather than an exception, so the
# return value is checked; otherwise the success message would be printed
# even when a resource is missing.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)

print("Downloading NLTK resources...")
failed_resources = []
for resource in NLTK_RESOURCES:
    try:
        if not nltk.download(resource, quiet=True):
            failed_resources.append(resource)
    except Exception as e:
        # Best-effort: report the problem and keep trying the remaining packages
        print(f"An error occurred during NLTK download of '{resource}': {e}")
        failed_resources.append(resource)

if failed_resources:
    print(f"Could not download NLTK resources: {', '.join(failed_resources)}")
else:
    print("All necessary NLTK resources are downloaded.")


# --- Initialization ---
lemmatizer = WordNetLemmatizer()

# --- Stop Words Strategy ---
# Start from NLTK's English stop list, add ticket boilerplate words, and take
# the negation words back out so they are not filtered as stop words.
negations = {'no', 'not', 'nor', 'neither', "don't", "isn't", "wasn't", "shouldn't", "wouldn't", "couldn't"}
# All entries are lowercase: perfect_preprocess_text lowercases the text
# before filtering, so mixed-case entries such as 'NaN' could never match.
useless_words = {'dear', 'customer', 'support', 'team', 'nan', 'null', 'na', 'issue'}
stop_words = (set(stopwords.words('english')) | useless_words) - negations

# --- Helper Function for POS-aware Lemmatization ---
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags beginning with J, V, N or R map to adjective, verb, noun and adverb
    respectively. An empty/missing tag or any other prefix falls back to noun.
    """
    if treebank_tag:
        prefix_table = (
            ('J', wordnet.ADJ),
            ('V', wordnet.VERB),
            ('N', wordnet.NOUN),
            ('R', wordnet.ADV),
        )
        for prefix, wordnet_pos in prefix_table:
            if treebank_tag.startswith(prefix):
                return wordnet_pos
    # No tag, or a tag outside the four handled families
    return wordnet.NOUN

# --- The Main Preprocessing Function ---
def perfect_preprocess_text(text):
    """Clean, tokenize, lemmatize and filter one ticket text.

    Args:
        text: Raw ticket text. Any non-string value (e.g. a NaN cell) yields "".

    Returns:
        A single space-joined string of lemmatized tokens that are not in the
        module-level ``stop_words`` set. Tokens of one or two characters are
        dropped unless they are in the module-level ``negations`` set.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # URLs, www addresses, e-mail-like tokens and drive-letter paths
    text = re.sub(r'(https?|ftp)://[^\s/$.?#].[^\s]*|www\.\S+|(\S+@\S+)|([a-z]:\\[^\s:]+)', ' ', text)
    # Angle-bracket tags / placeholders such as <name>
    text = re.sub(r'<.*?>', ' ', text)
    # Runs of 7-13 digits, optionally separated by spaces or hyphens
    text = re.sub(r'\b(?:\d[ -]?){6,12}\d\b', ' ', text)
    # Alphanumeric tokens of 20+ characters
    text = re.sub(r'\b[a-z0-9]{20,}\b', ' ', text)
    # Expand "n't" contractions before punctuation is stripped. Otherwise
    # "don't" becomes the fragments "don" + "t", which are both filtered out
    # below, and the negation disappears from the text.
    text = re.sub(r"n['’]t\b", ' not', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
    processed_tokens = [
        word for word in lemmatized_tokens
        # Negations are exempt from the length filter so that 'no' survives
        if word not in stop_words and (len(word) > 2 or word in negations)
    ]
    return " ".join(processed_tokens).strip()

# Run the preprocessing function over every ticket text
df['processed_text'] = df['full_text'].map(perfect_preprocess_text)

# Preview raw and processed text side by side
df.loc[:, ['full_text', 'processed_text']].head()

Downloading NLTK resources...
All necessary NLTK resources are downloaded.


Unnamed: 0,full_text,processed_text
1,"Dear Customer Support Team,\n\nI am writing to...",write report significant problem centralized a...
2,"Dear Customer Support Team,\n\nI hope this mes...",hope message reach well reach request detailed...
3,"Dear Customer Support Team,\n\nI hope this mes...",hope message find well reach request clarifica...
4,"Dear Support Team,\n\nI hope this message reac...",hope message reach well reach ask compatibilit...
5,"Dear Customer Support,\n\nI hope this message ...",hope message reach good health eager learn fea...


### Splitting the Data

In [6]:
from sklearn.model_selection import train_test_split

# 80/20 split of processed text vs. priority label, seeded and stratified on
# the priority column.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': df['priority'],
}
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], df['priority'], **split_options
)

### Advanced Vectorization, Hyperparameter Tuning, and Training

In [None]:
#!/usr/bin/env python3
# Updated training script that saves reports in the "old" JSON structure.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import json
import pickle
import os

# Make sure X_train, y_train, X_test, y_test are already defined in your environment
# e.g., X_train, X_test are lists/arrays of text; y_train, y_test are label arrays (strings or ints)

# Output directories for pickled models and JSON reports
for output_dir in ('models', 'reports'):
    os.makedirs(output_dir, exist_ok=True)

# Vectorizers to compare. 'bow' and 'ngram' are both created as a default
# CountVectorizer here.
vectorizers = dict(
    bow=CountVectorizer(),
    ngram=CountVectorizer(),
    tfidf=TfidfVectorizer(),
)

# Seed shared by every stochastic estimator so repeated runs give the same
# models and scores (same value as the train/test split seed).
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name used in report/model file names.
# LinearSVC and SGDClassifier are wrapped in CalibratedClassifierCV so that
# they expose predict_proba.
classifiers = {
    'CalibratedSVC': CalibratedClassifierCV(LinearSVC(dual=False, max_iter=10000)),
    'CalibratedSGD': CalibratedClassifierCV(
        SGDClassifier(max_iter=1000, tol=1e-3, random_state=RANDOM_STATE)
    ),
    'LightGBM': lgb.LGBMClassifier(random_state=RANDOM_STATE),
    'ExtraTrees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'MultinomialNB': MultinomialNB(),
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'XGBoost': xgb.XGBClassifier(
        use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE
    )
}

def _ngram_vectorizer_grid():
    """Search space shared by the 'ngram' and 'tfidf' vectorizers."""
    return {'vect__ngram_range': [(1, 1), (1, 2)], 'vect__max_df': [0.75, 1.0]}


def _tree_count_grid():
    """Search space for the bagged tree ensembles (number of trees only)."""
    return {'clf__n_estimators': [50, 100]}


def _boosting_grid():
    """Search space for the gradient-boosting models."""
    return {'clf__n_estimators': [50, 100], 'clf__learning_rate': [0.1, 0.2]}


# Hyperparameter search spaces keyed by vectorizer name or classifier name.
# 'vect__' / 'clf__' prefixes address the pipeline steps of the same name.
param_grids = dict(
    bow={'vect__max_df': [0.75, 1.0], 'vect__min_df': [1, 5]},
    ngram=_ngram_vectorizer_grid(),
    tfidf=_ngram_vectorizer_grid(),
    CalibratedSVC={'clf__estimator__C': [0.1, 1]},
    CalibratedSGD={'clf__estimator__alpha': [1e-4, 1e-3]},
    LightGBM=_boosting_grid(),
    ExtraTrees=_tree_count_grid(),
    LogisticRegression={'clf__C': [0.1, 1]},
    MultinomialNB={'clf__alpha': [0.5, 1.0]},
    RandomForest=_tree_count_grid(),
    XGBoost=_boosting_grid(),
)

# Helper functions for JSON-friendly conversion
def to_json_friendly(obj):
    """Recursively convert a value into something ``json.dump`` can serialize.

    Args:
        obj: Any value, typically ``GridSearchCV.best_params_`` or a metric.

    Returns:
        * dict -> dict with converted keys and values; a key that converts to
          a non-scalar (e.g. a tuple key) is replaced by its JSON text, since
          lists cannot be dictionary keys
        * list / tuple / set / frozenset -> list of converted items
        * numpy array -> nested list
        * numpy bool / integer / floating scalar -> bool / int / float
        * None -> None
        * anything else -> returned unchanged if ``json.dumps`` accepts it,
          otherwise its ``repr()`` string
    """
    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            clean_key = to_json_friendly(key)
            # JSON object keys must be scalars; a converted tuple is a list,
            # which is unhashable, so store its JSON text instead.
            if not isinstance(clean_key, (str, int, float, bool, type(None))):
                clean_key = json.dumps(clean_key)
            converted[clean_key] = to_json_friendly(value)
        return converted
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [to_json_friendly(x) for x in obj]
    if isinstance(obj, np.ndarray):
        return to_json_friendly(obj.tolist())
    # np.bool_ is not JSON serializable and is not an np.integer subclass;
    # without this branch it would fall through to repr() and become a string.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if obj is None:
        return None
    # Best effort for everything else (e.g. estimator objects stored in params)
    try:
        json.dumps(obj)
        return obj
    except (TypeError, OverflowError):
        return repr(obj)

def normalize_support(support_value):
    """Return ``support_value`` as a rounded int, or None when that fails.

    Accepts float-like inputs (e.g. 1061.0) and numpy numbers. None, and any
    value that cannot be converted through ``float`` -> ``round`` -> ``int``,
    yields None.
    """
    if support_value is not None:
        try:
            as_float = float(support_value)
            return int(round(as_float))
        except Exception:
            # Unconvertible value: fall through to the None result
            pass
    return None

# Named function used in pipeline when converting to float32
def to_float32(x):
    """Return ``x`` cast to float32 via its ``astype`` method.

    Defined as a named module-level function for use inside a
    FunctionTransformer pipeline step.
    """
    target_dtype = np.float32
    return x.astype(target_dtype)

# Encode the target variable for models that need integer labels (XGBoost/LightGBM)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Persist the encoder next to the models: the saved XGBoost/LightGBM pipelines
# are trained on encoded labels and predict integers, so the encoder is needed
# to map their predictions back to the original priority labels after loading.
os.makedirs('models', exist_ok=True)
with open('models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Classifiers that are trained on the integer-encoded labels
ENCODED_LABEL_MODELS = ('XGBoost', 'LightGBM')


def _build_pipeline(vec_name, vectorizer, clf_name, classifier):
    """Assemble vectorizer -> (optional float32 cast) -> classifier."""
    steps = [('vect', vectorizer)]
    # CountVectorizer-based features are cast to float32 for the boosted models
    if clf_name in ENCODED_LABEL_MODELS and vec_name in ('bow', 'ngram'):
        steps.append(('to_float', FunctionTransformer(to_float32, validate=False)))
    steps.append(('clf', classifier))
    return Pipeline(steps)


def _format_classification_report(raw_cr, precision, recall, f1):
    """Reshape sklearn's classification_report dict into the "old" JSON layout.

    The 'accuracy' entry is left out (accuracy is stored under "Test metrics"),
    metric values are plain floats, and support values are ints (or None).
    'macro avg' and 'weighted avg' are always present in the result.
    """
    formatted = {}
    for label, stats in raw_cr.items():
        if label == "accuracy":
            continue
        if not isinstance(stats, dict):
            # unexpected shape -> copy as-is (but attempt conversion)
            formatted[label] = to_json_friendly(stats)
            continue
        entry = {}
        for metric in ("precision", "recall", "f1-score"):
            if metric in stats:
                entry[metric] = float(stats[metric]) if stats[metric] is not None else None
        # A missing support resolves to None via normalize_support(None)
        entry["support"] = normalize_support(stats.get("support"))
        formatted[label] = entry

    # Fallbacks in case the averages are absent from the raw report
    formatted.setdefault("macro avg", {
        "precision": precision,
        "recall": recall,
        "f1-score": f1,
        "support": None
    })
    formatted.setdefault("weighted avg", {
        "precision": None,
        "recall": None,
        "f1-score": None,
        "support": None
    })
    return formatted


for vec_name, vectorizer in vectorizers.items():
    for clf_name, classifier in classifiers.items():
        print(f'Training {clf_name} with {vec_name}...')

        pipeline = _build_pipeline(vec_name, vectorizer, clf_name, classifier)

        # Merge parameter grids (vectorizer params + classifier params)
        grid_params = {
            **param_grids.get(vec_name, {}),
            **param_grids.get(clf_name, {}),
        }

        grid_search = GridSearchCV(pipeline, grid_params, cv=3, n_jobs=-1, verbose=1, scoring='f1_macro')

        # Train using encoded labels for XGBoost/LightGBM and original labels for others
        if clf_name in ENCODED_LABEL_MODELS:
            grid_search.fit(X_train, y_train_encoded)
            # Predictions are encoded ints -> force an integer dtype and decode
            y_pred_encoded = np.asarray(grid_search.best_estimator_.predict(X_test), dtype=int)
            y_pred = label_encoder.inverse_transform(y_pred_encoded)
        else:
            grid_search.fit(X_train, y_train)
            y_pred = grid_search.best_estimator_.predict(X_test)

        best_model = grid_search.best_estimator_

        # Compute metrics (y_test and y_pred both hold original labels here)
        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

        raw_cr = classification_report(y_test, y_pred, output_dict=True)
        classification_report_old = _format_classification_report(raw_cr, precision, recall, f1)

        report = {
            "Best CV score": float(grid_search.best_score_) if grid_search.best_score_ is not None else None,
            # Tuples -> lists, numpy scalars -> native Python types
            "Best params": to_json_friendly(grid_search.best_params_),
            "Test metrics": {
                "accuracy": float(accuracy),
                "precision_macro": float(precision),
                "recall_macro": float(recall),
                "f1_macro": float(f1)
            },
            "Classification report": classification_report_old,
            # Kept as JSON null to match the prior report structure
            "Confusion matrix": None,
            "Supports predict_proba": bool(hasattr(best_model, "predict_proba"))
        }

        # Save report (pretty printed)
        report_filename = f'reports/{vec_name}_{clf_name}_report.json'
        with open(report_filename, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=4, ensure_ascii=False)

        # Save model
        model_filename = f'models/{vec_name}_{clf_name}_model.pkl'
        with open(model_filename, 'wb') as f:
            pickle.dump(best_model, f)

        print(f'Report saved to {report_filename}')
        print(f'Model saved to {model_filename}\n')


In [None]:
# Shell command: recursively pack the models directory into models.zip
!zip -r models.zip models

## Summary:

### Model Training and Evaluation

* The notebook now employs a comprehensive pipeline for vectorizing text data using **BOW, N-grams, and TF-IDF** techniques, followed by hyperparameter tuning and training of multiple classifiers including **CalibratedSVC, CalibratedSGD, LightGBM, ExtraTrees, Logistic Regression, MultinomialNB, RandomForest, and XGBoost**.
* **GridSearchCV** is utilized for systematic hyperparameter tuning to find the best parameters for each model and vectorizer combination, optimizing for the 'f1_macro' score.
* After training, each model is evaluated on the test set, and a detailed report is generated including the best cross-validation score, best parameters, test metrics (accuracy and macro-averaged precision, recall, and F1-score), a classification report, and a confusion-matrix field (currently written as `null`, since the matrix itself is not computed).
* The trained models are saved with `pickle` for persistence, and the evaluation reports are saved as JSON files, so the results can be reloaded and inspected later for further analysis.

### Next Steps

* The saved JSON reports can now be analyzed to compare the performance of all model-vectorization combinations to identify the top-performing approach for this specific text classification task.
* The saved models can be loaded for inference on new, unseen data, or for deployment in a production environment.