In [4]:
# Execute config.py; %run loads the names that script defines into the interactive namespace.
# NOTE(review): presumably this is where pd, warnings and the scikit-learn classes used
# in the cells below are imported — confirm against config.py.
%run config.py

In [5]:
# Load the support-ticket CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
data = pd.read_csv(
    filepath_or_buffer='Data/customer_support_tickets.csv',
)

In [8]:
# Random-forest classifier for 'Ticket Type': preprocessing, grid search over
# forest hyper-parameters with 5-fold CV, then evaluation on a 20% hold-out split.
warnings.filterwarnings("ignore", message="is_sparse is deprecated and will be removed in a future version.*")

# Feature columns grouped by type. Both the ColumnTransformer and the selection
# of X further down are driven by these two lists.
categorical_features = ['Customer Gender', 'Product Purchased']
numerical_features = ['Customer Age']

# Preprocessing for numerical data: fill gaps with the median, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: fill gaps with the literal 'missing',
# then one-hot encode; categories unseen during fit are ignored at predict time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the base model
model = RandomForestClassifier(random_state=42)

# Create a pipeline so preprocessing is re-fitted inside every CV fold
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])

# Define the parameter grid to search (3 * 4 * 3 * 3 = 108 candidates);
# the 'model__' prefix routes each parameter to the 'model' pipeline step
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object.
# verbose=1 still prints the "Fitting ... folds" summary but no longer prints a
# "[CV] END ..." line for each of the 108 * 5 = 540 fits, which flooded the cell output.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Splitting dataset into training and testing sets.
# X is selected through the feature lists (same columns, same order as before) so the
# columns handed to the pipeline cannot drift from the ones the preprocessor expects.
X = data[numerical_features + categorical_features]
y = data['Ticket Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameter set found on development set
print("Best parameters found: ", grid_search.best_params_)

# Get the best estimator
best_clf = grid_search.best_estimator_

# Make predictions with the best estimator
preds = best_clf.predict(X_test)

# Evaluate the model on the hold-out split.
# NOTE(review): the recorded run reports ~0.22 accuracy over five classes, which is
# close to uniform guessing — these three features may carry little signal; confirm
# before investing in further tuning.
print(classification_report(y_test, preds))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters found:  {'model__max_depth': 10, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10, 'model__n_estimators': 300}
                      precision    recall  f1-score   support

     Billing inquiry       0.27      0.11      0.15       348
Cancellation request       0.22      0.25      0.24       321
     Product inquiry       0.23      0.12      0.15       342
      Refund request       0.20      0.28      0.23       337
     Technical issue       0.22      0.35      0.27       346

            accuracy                           0.22      1694
           macro avg       0.23      0.22      0.21      1694
        weighted avg       0.23      0.22      0.21      1694

[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time=   2.7s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators

[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=   1.4s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time=   4.0s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=10, model__n_estimators=100; total time=   0.6s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=10, model__n_estimators=200; total time=   1.1s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=10, model__n_estimators=300; total time=   1.9s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time=   2.3s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time=   2.2s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_sp

[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time=   2.7s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time=   0.8s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time=   1.6s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time=   2.4s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time=   0.9s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time=   1.6s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time=   0.7s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split

[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time=   2.6s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time=   0.8s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time=   1.7s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=300; total time=   2.3s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time=   0.8s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=200; total time=   1.5s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=100; total time=   0.7s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split

[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=   1.4s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=300; total time=   3.9s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=10, model__n_estimators=100; total time=   0.6s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=10, model__n_estimators=200; total time=   1.2s
[CV] END model__max_depth=None, model__min_samples_leaf=1, model__min_samples_split=10, model__n_estimators=300; total time=   1.8s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=300; total time=   2.4s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=300; total time=   2.2s
[CV] END model__max_depth=None, model__min_samples_leaf=2, model__min_samples_sp

In [30]:
# Gradient boosting on the same preprocessing and the same train/test split
gb_model = GradientBoostingClassifier(
    random_state=42,
    learning_rate=0.1,
    n_estimators=100,
)

# Same two-stage layout as the random-forest pipeline: preprocess, then classify
gb_steps = [
    ('preprocessor', preprocessor),
    ('model', gb_model),
]
clf_gb = Pipeline(steps=gb_steps)

# Pipeline.fit returns the fitted pipeline, so training and prediction are chained
preds_gb = clf_gb.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, preds_gb))


                      precision    recall  f1-score   support

     Billing inquiry       0.25      0.15      0.19       348
Cancellation request       0.19      0.19      0.19       321
     Product inquiry       0.20      0.12      0.15       342
      Refund request       0.20      0.26      0.23       337
     Technical issue       0.24      0.35      0.28       346

            accuracy                           0.21      1694
           macro avg       0.21      0.21      0.21      1694
        weighted avg       0.21      0.21      0.21      1694

