In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# Same train-test split as RMarkdown
TARGET = 'food_class'

# Every column (features and target) is used as a categorical variable, so
# convert each frame in one pass; categories are inferred column by column.
train = pd.read_csv('train.csv').astype('category')
test = pd.read_csv('test.csv').astype('category')

# Separate features from the target by NAME instead of by position, so a
# change in CSV column order cannot silently leak the target into X.
# NOTE(review): the previous positional slice (iloc[:, 1:]) implies the target
# is the first column of both CSVs -- confirm against the files.
X_train = train.drop(columns=TARGET)
X_test = test.drop(columns=TARGET)
y_train = train[TARGET]
y_test = test[TARGET]

# The model is fitted on X_train and applied to X_test, so both must expose
# the same feature columns in the same order.
assert list(X_train.columns) == list(X_test.columns), 'train/test feature columns differ'

In [4]:
%%capture
# Hide output

# List categorical features
cat_features = ['food_class_l1', 'food_class_l2', 'food_class_l3', 'food_class_l4',
                'meal', 'semester', 'week_of_sem', 'dow', 'breakfast_or_not', 'gym_or_not']

# iterations: Number of trees (iterations) during training
# learning_rate: Step size where the gradient boosting algorithm converges to the optimal solution
# depth: Maximum depth of the trees
# MultiClass: Softmax function
# eval_metric: Evaluation metric
cboost = CatBoostClassifier(iterations=300,
                            learning_rate=0.01,
                            depth=10,
                            loss_function='MultiClass',
                            eval_metric='Accuracy',
                            random_seed=42)
cboost.fit(X_train, y_train, cat_features=cat_features)

cboost_pred = cboost.predict(X_test)

In [5]:
# Fraction of test rows whose predicted class equals the true class
accuracy_score(y_true=y_test, y_pred=cboost_pred)

0.7582417582417582

In [6]:
# scikit-learn layout: rows are the true classes, columns the predicted ones.
# This is the transpose of the table caret prints.
confusion_matrix(y_true=y_test, y_pred=cboost_pred)

array([[17,  5,  0],
       [ 3, 47,  0],
       [ 8,  6,  5]], dtype=int64)

In [None]:
# Bayesian optimization of the CatBoost hyperparameters, scored by 5-fold
# cross-validated accuracy on the training split.
# skopt is imported in this cell only, so the cells above do not require it.
from skopt import BayesSearchCV

# Search ranges given as (low, high) bounds for each hyperparameter.
param_space = {
    'iterations': (100, 1000),           # Number of trees
    'learning_rate': (0.01, 0.3),        # Learning rate
    'depth': (3, 10),                    # Depth of trees
    'l2_leaf_reg': (1, 10),              # L2 regularization
    'random_strength': (0.1, 1.0)        # Random strength
}

# Configure the tuned estimator like the baseline model: same multiclass
# objective and a fixed seed so the search is reproducible. verbose=False
# stops every fit from printing its per-iteration log.
catboost_model = CatBoostClassifier(loss_function='MultiClass',
                                    random_seed=42,
                                    verbose=False)

# NOTE(review): n_jobs=-1 runs the CV fits in parallel while CatBoost may also
# use several threads per fit -- consider setting thread_count if the machine
# ends up oversubscribed.
bayes_search = BayesSearchCV(
    estimator=catboost_model,
    search_spaces=param_space,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    n_iter=20,  # Number of hyperparameter settings to evaluate
    random_state=42
)

# cat_features is the list defined in the baseline training cell; it is
# forwarded to every CatBoost fit performed by the search.
bayes_search.fit(X_train, y_train, cat_features=cat_features)

bayes_search.best_params_