In [30]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score


In [37]:
# Load the cleaned customer-support ticket data from the project's data folder.
tickets_csv = '../data/customer_support_tickets_cleaned.csv'
df = pd.read_csv(tickets_csv)

In [38]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions approximately equal in both splits, and the fixed seed
# makes the split repeatable.
ticket_text = df['text_cleaned']
ticket_label = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    ticket_text,
    ticket_label,
    test_size=0.2,
    random_state=42,
    stratify=ticket_label,
)

print(f'Training set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')


Training set size: 396
Test set size: 99


In [39]:
# Majority-class baseline: always predicts the most frequent training label.
# It sets the floor that any real model has to beat.
dummy = DummyClassifier(strategy='most_frequent')
y_pred = dummy.fit(X_train, y_train).predict(X_test)

# Score the baseline on the held-out split.
dummy_accuracy = accuracy_score(y_test, y_pred)
dummy_f1 = f1_score(y_test, y_pred, average='weighted')
dummy_cm = confusion_matrix(y_test, y_pred)

print("Dummy Classifier Performance:")
print(f"Accuracy: {dummy_accuracy}")
print(f"F1 Score: {dummy_f1}")
print("Confusion Matrix:")
print(dummy_cm)

Dummy Classifier Performance:
Accuracy: 0.26262626262626265
F1 Score: 0.10925252525252524
Confusion Matrix:
[[ 0  0 24  0]
 [ 0  0 24  0]
 [ 0  0 26  0]
 [ 0  0 25  0]]


In [32]:
# Turn the ticket text into TF-IDF features. The vocabulary is fitted on the
# training split only and then applied unchanged to the test split.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train logistic regression on the TF-IDF features and predict the test split.
lr = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred_lr = lr.predict(X_test_vec)

# Metrics on the held-out split; acc_lr, f1_lr and cm_lr are reused by the
# cell that writes the metrics JSON.
acc_lr = accuracy_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr, average='weighted')
cm_lr = confusion_matrix(y_test, y_pred_lr)
report_lr = classification_report(y_test, y_pred_lr)

print("TF-IDF + vectorizer + Logistic Regression model trained.")
print("Logistic Regression Performance:")
print(f"Accuracy: {acc_lr}")
print(f"F1 Score: {f1_lr}")
print(f"Confusion Matrix:\n {cm_lr}")

print(f"Classification Report:\n {report_lr}")



TF-IDF + vectorizer + Logistic Regression model trained.
Logistic Regression Performance:
Accuracy: 0.9494949494949495
F1 Score: 0.9499376443752823
Confusion Matrix:
 [[20  4  0  0]
 [ 0 24  0  0]
 [ 0  0 26  0]
 [ 0  1  0 24]]
Classification Report:
               precision    recall  f1-score   support

     account       1.00      0.83      0.91        24
     billing       0.83      1.00      0.91        24
       other       1.00      1.00      1.00        26
   technical       1.00      0.96      0.98        25

    accuracy                           0.95        99
   macro avg       0.96      0.95      0.95        99
weighted avg       0.96      0.95      0.95        99



In [33]:
# Side-by-side summary of both baselines: one row per model, scored on the
# same held-out labels.
model_predictions = {
    'Dummy Classifier': y_pred,
    'Logistic Regression': y_pred_lr,
}

baseline_results = pd.DataFrame({
    'Model': list(model_predictions),
    'Accuracy': [
        accuracy_score(y_test, preds) for preds in model_predictions.values()
    ],
    'F1 Score': [
        f1_score(y_test, preds, average='weighted')
        for preds in model_predictions.values()
    ],
})

print("\nBaseline Model Comparison:")
print(baseline_results)


Baseline Model Comparison:
                 Model  Accuracy  F1 Score
0     Dummy Classifier  0.262626  0.109253
1  Logistic Regression  0.949495  0.949938


In [34]:
# Persist the baseline metrics as JSON.
# The path is defined once so the file that is written and the confirmation
# message cannot drift apart.
metrics_path = '../src/baseline_model_metrics.json'

# Scalars are cast to built-in floats and confusion matrices to nested lists,
# so the payload contains only types the json module can serialize.
metrics = {
    'Dummy Classifier': {
        'Accuracy': float(accuracy_score(y_test, y_pred)),
        'F1 Score': float(f1_score(y_test, y_pred, average='weighted')),
        'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
    },
    'tfidf + logreg': {
        'Accuracy': float(acc_lr),
        'F1 Score': float(f1_lr),
        'confusion_matrix': cm_lr.tolist(),
    },
}

# Explicit encoding keeps the written file independent of the platform default.
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=4)

# Report success only after the with-block has flushed and closed the file.
print(f"Baseline model metrics saved to '{metrics_path}'")


Baseline model metrics saved to '../src/baseline_model_metrics.json'


In [35]:
# Per-class precision, recall, F1 and support for the logistic-regression
# predictions on the held-out split.
print("Classification Report for Logistic Regression:")
lr_report_text = classification_report(y_test, y_pred_lr)
print(lr_report_text)

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

     account       1.00      0.83      0.91        24
     billing       0.83      1.00      0.91        24
       other       1.00      1.00      1.00        26
   technical       1.00      0.96      0.98        25

    accuracy                           0.95        99
   macro avg       0.96      0.95      0.95        99
weighted avg       0.96      0.95      0.95        99

