In [8]:
import math
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random

%matplotlib inline
plt.rcParams.update({
        'font.size': 14,
        'axes.titlesize': 15,
        'axes.labelsize': 15,
        'xtick.labelsize': 15,
        'ytick.labelsize': 15,
        'font.size': 15,
        'figure.figsize': (10, 7),
        'axes.grid': True,
        'grid.linestyle': '-',
        'grid.alpha': 0.3,
        'lines.markersize': 5.0,
        'xtick.minor.visible': True,
        'xtick.direction': 'in',
        'xtick.major.size': 20.0,
        'xtick.minor.size': 10.0,
        'xtick.top': False,
        'xtick.bottom': True,
        'ytick.minor.visible': True,
        'ytick.direction': 'in',
        'ytick.major.size': 12.0,
        'ytick.minor.size': 6.0,
        'ytick.right': True,
        'errorbar.capsize': 0.0,
    })

# https://www.kaggle.com/competitions/practical-ml-chocolate/data
# https://www.kaggle.com/datasets/rtatman/chocolate-bar-ratings/

In [29]:
#!pip install catboost
#!pip install optuna

In [9]:
!git branch

  main[m
* [32mmodels[m


# 1 Get data

In [10]:
df_train = pd.read_csv('chocolate_train.csv')
df_test = pd.read_csv('chocolate_test_new.csv')

In [11]:
def preprocessing_step1(data):
    data_new = data.copy()
    data_new['Cocoa Percent'] = data_new['Cocoa Percent'].apply(lambda x: float(x.strip('%')))
    data_new = data_new.fillna("Unknown")
    data_new['Broad Bean Origin'] = data_new['Broad Bean Origin'].replace('\xa0', "Unknown")
    data_new['Bean Type'] = data_new['Bean Type'].replace('\xa0', "Unknown")
    data_new.replace(
    {
        'Eucador': 'Ecuador',
        'Domincan Republic': 'Dominican Republic',
        'Niacragua': 'Nicaragua'
    },
    inplace=True)
    return data_new

In [12]:
df_train = preprocessing_step1(df_train)
df_test = preprocessing_step1(df_test)

In [13]:
df_train.head()

Unnamed: 0,Company,Specific Bean Origin,REF,Review,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,Willie's Cacao,Rio Caribe,457,2009,72.0,U.K.,3.25,Trinitario,Venezuela
1,Beschle (Felchlin),"Ocumare, Premier Cru, Quizas No. 2",508,2010,72.0,Switzerland,3.5,Unknown,Venezuela
2,Dark Forest,Tanzania,1554,2015,70.0,U.S.A.,3.0,Unknown,Tanzania
3,Brasstown aka It's Chocolate,Cooproagro,1125,2013,72.0,U.S.A.,3.0,Trinitario,Dominican Republic
4,Pralus,"Java, Indonesie",32,2006,75.0,France,3.5,Criollo,Indonesia


In [14]:
(df_train == "Unknown").sum()

Company                   0
Specific Bean Origin      0
REF                       0
Review                    0
Cocoa Percent             0
Company Location          0
Rating                    0
Bean Type               629
Broad Bean Origin        56
dtype: int64

# 2 Models

In [30]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import optuna

## 2.1 Broad Bean Origin

### 2.1.1  Split to train and test

In [15]:
# Split data to train (with no "Unknown" cells) and test (with "Unknown" cells)

df_train_encoder_train = df_train[(df_train['Broad Bean Origin'] != 'Unknown')]\
    .drop(['Bean Type', 'Rating'], axis=1)
df_train_encoder_test = df_train[(df_train['Broad Bean Origin'] == 'Unknown')]\
    .drop(['Bean Type', 'Rating'], axis=1)
df_test_encoder_train = df_test[(df_test['Broad Bean Origin'] != 'Unknown')]\
    .drop(['Bean Type'], axis=1)
df_test_encoder_test = df_test[(df_test['Broad Bean Origin'] == 'Unknown')]\
    .drop(['Bean Type'], axis=1)

print(df_train_encoder_train.shape)
print(df_train_encoder_test.shape)
print(df_test_encoder_train.shape)
print(df_test_encoder_test.shape)

X = pd.concat([
    df_train_encoder_train.drop('Broad Bean Origin', axis=1),
    df_test_encoder_train.drop('Broad Bean Origin', axis=1)
])

y = pd.concat([
    df_train_encoder_train['Broad Bean Origin'],
    df_test_encoder_train['Broad Bean Origin']
])

(1199, 7)
(56, 7)
(522, 7)
(18, 7)


In [17]:
# Train on top 11 countries
y.value_counts()[:11]

Venezuela             214
Ecuador               193
Dominican Republic    166
Peru                  165
Madagascar            145
Nicaragua              60
Brazil                 58
Bolivia                57
Belize                 49
Papua New Guinea       42
Colombia               40
Name: Broad Bean Origin, dtype: int64

In [20]:
# Most probable bean origins
top_bean_origins = y.value_counts()[:11].index
top_bean_origins_mask = y.apply(lambda x: x in top_bean_origins)

# train on top 10 Broad Bean Origins only
X_top = X[top_bean_origins_mask]
y_top = y[top_bean_origins_mask]

X_train, X_val, y_train, y_val = train_test_split(X_top, y_top, test_size=0.2)

categorical_features_indices = np.where(X_top.dtypes == object)[0]

In [24]:
def label_accuracy(y_true, y_pred):
    df_bbo = pd.DataFrame({'True BBO': y_true.values, 'Predicted BBO': y_pred[:, 0]})
    return (df_bbo['True BBO'] == df_bbo['Predicted BBO']).sum() / df_bbo.shape[0]

### 2.1.2 Broad Bean Origin: train CatBoost model

In [26]:
# CatBoost with default hyperparameters
model = CatBoostClassifier()

model.fit(X_train, y_train,
          cat_features=categorical_features_indices,
          logging_level='Silent')

pred = model.predict(X_val)

label_accuracy(y_val, pred)

0.6638655462184874

### 2.1.3 Optuna

In [31]:
def objective(trial):

    param = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, step=0.001)
    }

    estimator = CatBoostClassifier(**param,
                                   cat_features=categorical_features_indices,
                                   verbose=False)

    estimator.fit(X_train, y_train)
    pred = estimator.predict(X_val)

    return label_accuracy(y_val, pred)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20, n_jobs=-1)
print(study.best_trial)

[I 2023-08-07 12:55:49,457] A new study created in memory with name: no-name-ce7614a1-97ba-4f08-8511-c959f4b94dc8
[I 2023-08-07 12:55:58,817] Trial 1 finished with value: 0.6596638655462185 and parameters: {'n_estimators': 302, 'max_depth': 2, 'learning_rate': 0.081}. Best is trial 1 with value: 0.6596638655462185.
[I 2023-08-07 12:56:08,274] Trial 9 finished with value: 0.6470588235294118 and parameters: {'n_estimators': 394, 'max_depth': 3, 'learning_rate': 0.022000000000000002}. Best is trial 1 with value: 0.6596638655462185.
[I 2023-08-07 12:56:16,857] Trial 14 finished with value: 0.6722689075630253 and parameters: {'n_estimators': 174, 'max_depth': 8, 'learning_rate': 0.031}. Best is trial 14 with value: 0.6722689075630253.
[I 2023-08-07 12:56:20,923] Trial 15 finished with value: 0.680672268907563 and parameters: {'n_estimators': 432, 'max_depth': 4, 'learning_rate': 0.055}. Best is trial 15 with value: 0.680672268907563.
[I 2023-08-07 12:56:23,389] Trial 7 finished with value: 

FrozenTrial(number=6, state=1, values=[0.6932773109243697], datetime_start=datetime.datetime(2023, 8, 7, 12, 55, 49, 512994), datetime_complete=datetime.datetime(2023, 8, 7, 12, 57, 2, 831546), params={'n_estimators': 382, 'max_depth': 7, 'learning_rate': 0.058}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=1000, log=False, low=10, step=1), 'max_depth': IntDistribution(high=12, log=False, low=1, step=1), 'learning_rate': FloatDistribution(high=0.1, log=False, low=0.001, step=0.001)}, trial_id=6, value=None)


In [32]:
print(study.best_trial.params)

{'n_estimators': 382, 'max_depth': 7, 'learning_rate': 0.058}


In [33]:
model = CatBoostClassifier(**study.best_params,
                           cat_features=categorical_features_indices,
                           logging_level='Silent')

model.fit(X_train, y_train)
pred = model.predict(X_val)

label_accuracy(y_val, pred)

0.6932773109243697

In [34]:
pd.DataFrame({'Feature': model.feature_names_, "Importance": model.feature_importances_})\
    .sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
1,Specific Bean Origin,35.094427
5,Company Location,28.495665
0,Company,17.359681
2,REF,10.306868
4,Cocoa Percent,6.784059
3,Review,1.9593


In [35]:
# Fit on data with top 11 countries
model.fit(X_top, y_top)

pd.DataFrame({'Feature': model.feature_names_, "Importance": model.feature_importances_})\
    .sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
1,Specific Bean Origin,38.032739
5,Company Location,29.345539
0,Company,17.128539
2,REF,8.460626
4,Cocoa Percent,5.1167
3,Review,1.915857


In [36]:
# Apply to dataframes with "Unknown" Broad Bean Origin cells

pred_bbo_train = model.predict(df_train_encoder_test.drop(['Broad Bean Origin'], axis=1))
pred_bbo_test = model.predict(df_test_encoder_test.drop(['Broad Bean Origin'], axis=1))

In [37]:
# Indices of rows with "Unknown" value from intiinal datasets
index_train = df_train[(df_train['Broad Bean Origin'] == 'Unknown')].index
index_test = df_test[(df_test['Broad Bean Origin'] == 'Unknown')].index

# Get Broad Bean Origin column from initial datasets
df_train['Broad Bean Origin'] = pd.read_csv('chocolate_train.csv')['Broad Bean Origin']
df_test['Broad Bean Origin'] = pd.read_csv('chocolate_test_new.csv')['Broad Bean Origin']

# Replace values in BBO column with model output
df_train.loc[index_train, ['Broad Bean Origin']] = pred_bbo_train[:, 0]
df_test.loc[index_test, ['Broad Bean Origin']] = pred_bbo_test[:, 0]

In [38]:
# Save these new dataframes
df_train.to_csv("train_bbo.csv", index=False)
df_test.to_csv("test_bbo.csv", index=False)

In [40]:
import pickle

# Save the trained model as a pickle string.
with open('BBO_model_v1.pickle', 'wb') as f:
    pickle.dump(model, f)
    
model.save_model("BOO_catboost_v1")

## 2.2 Bean Type

### 2.2.1 split to train and test

In [44]:
# Split data to train (with no "Unknown" cells) and test (with "Unknown" cells)

df_train_encoder_train = df_train[(df_train['Bean Type'] != 'Unknown')]\
    .drop(['Rating'], axis=1)
df_train_encoder_test = df_train[(df_train['Bean Type'] == 'Unknown')]\
    .drop(['Rating'], axis=1)
df_test_encoder_train = df_test[(df_test['Bean Type'] != 'Unknown')]
df_test_encoder_test = df_test[(df_test['Bean Type'] == 'Unknown')]

print(df_train_encoder_train.shape)
print(df_train_encoder_test.shape)
print(df_test_encoder_train.shape)
print(df_test_encoder_test.shape)

X = pd.concat([
    df_train_encoder_train.drop('Bean Type', axis=1),
    df_test_encoder_train.drop('Bean Type', axis=1)
])

y = pd.concat([
    df_train_encoder_train['Bean Type'],
    df_test_encoder_train['Bean Type']
])

(626, 8)
(629, 8)
(281, 8)
(259, 8)


In [45]:
# Train using only 3-7 most popular types
y.value_counts()[:7]

Trinitario              419
Criollo                 153
Forastero                87
Forastero (Nacional)     52
Blend                    41
Criollo, Trinitario      39
Forastero (Arriba)       37
Name: Bean Type, dtype: int64

In [46]:
# Most probable bean origins
top_bean_types = y.value_counts()[:7].index
top_bean_types_mask = y.apply(lambda x: x in top_bean_types)

# train on top 10 Broad Bean Origins only
X_top = X[top_bean_types_mask]
y_top = y[top_bean_types_mask]

X_train, X_val, y_train, y_val = train_test_split(X_top, y_top, test_size=0.2)

categorical_features_indices = np.where(X_top.dtypes == object)[0]

### 2.2.2 Catboost model

In [47]:
# CatBoost with default hyperparameters
model = CatBoostClassifier()

model.fit(X_train, y_train,
          cat_features=categorical_features_indices,
          logging_level='Silent')

pred = model.predict(X_val)

label_accuracy(y_val, pred)

0.7771084337349398

### 2.2.3 Optuna

In [48]:
def objective(trial):

    param = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, step=0.001)
    }

    estimator = CatBoostClassifier(**param,
                                   cat_features=categorical_features_indices,
                                   verbose=False)

    estimator.fit(X_train, y_train)
    pred = estimator.predict(X_val)

    return label_accuracy(y_val, pred)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20, n_jobs=-1)
print(study.best_trial)

[I 2023-08-07 13:26:27,677] A new study created in memory with name: no-name-b0af3246-8249-4601-b6ac-a43a289119fa
[I 2023-08-07 13:26:29,387] Trial 10 finished with value: 0.6566265060240963 and parameters: {'n_estimators': 65, 'max_depth': 3, 'learning_rate': 0.055}. Best is trial 10 with value: 0.6566265060240963.
[I 2023-08-07 13:26:31,244] Trial 4 finished with value: 0.7530120481927711 and parameters: {'n_estimators': 137, 'max_depth': 4, 'learning_rate': 0.029}. Best is trial 4 with value: 0.7530120481927711.
[I 2023-08-07 13:26:32,141] Trial 14 finished with value: 0.6927710843373494 and parameters: {'n_estimators': 95, 'max_depth': 7, 'learning_rate': 0.024}. Best is trial 4 with value: 0.7530120481927711.
[I 2023-08-07 13:26:32,176] Trial 0 finished with value: 0.7710843373493976 and parameters: {'n_estimators': 348, 'max_depth': 2, 'learning_rate': 0.08700000000000001}. Best is trial 0 with value: 0.7710843373493976.
[I 2023-08-07 13:26:32,478] Trial 18 finished with value: 0

FrozenTrial(number=0, state=1, values=[0.7710843373493976], datetime_start=datetime.datetime(2023, 8, 7, 13, 26, 27, 679573), datetime_complete=datetime.datetime(2023, 8, 7, 13, 26, 32, 176562), params={'n_estimators': 348, 'max_depth': 2, 'learning_rate': 0.08700000000000001}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=1000, log=False, low=10, step=1), 'max_depth': IntDistribution(high=10, log=False, low=1, step=1), 'learning_rate': FloatDistribution(high=0.1, log=False, low=0.001, step=0.001)}, trial_id=0, value=None)


In [49]:
print(study.best_trial.params)

{'n_estimators': 348, 'max_depth': 2, 'learning_rate': 0.08700000000000001}


In [50]:
model = CatBoostClassifier(**study.best_params,
                           cat_features=categorical_features_indices,
                           logging_level='Silent')

model.fit(X_train, y_train)
pred = model.predict(X_val)

label_accuracy(y_val, pred)

0.7710843373493976

In [51]:
pd.DataFrame({'Feature': model.feature_names_, "Importance": model.feature_importances_})\
    .sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
6,Broad Bean Origin,66.207098
5,Company Location,9.674067
2,REF,9.188031
0,Company,6.179399
1,Specific Bean Origin,4.835454
3,Review,3.239825
4,Cocoa Percent,0.676126


In [52]:
model.fit(X_top, y_top)

pd.DataFrame({'Feature': model.feature_names_, "Importance": model.feature_importances_})\
    .sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
6,Broad Bean Origin,63.770696
5,Company Location,11.776458
2,REF,7.216055
1,Specific Bean Origin,6.794736
0,Company,5.993364
3,Review,4.232767
4,Cocoa Percent,0.215924


In [53]:
# Apply model to data with empty cells
pred_bt_train = model.predict(df_train_encoder_test.drop(['Bean Type'], axis=1))
pred_bt_test = model.predict(df_test_encoder_test.drop(['Bean Type'], axis=1))

In [54]:
# Replace values in Bean Type column with model output
df_train.loc[df_train_encoder_test.index, ['Bean Type']] = pred_bt_train[:, 0]
df_test.loc[df_test_encoder_test.index, ['Bean Type']] = pred_bt_test[:, 0]

# Save files
df_train.to_csv("train_bt.csv", index=False)
df_test.to_csv("test_bt.csv", index=False)

# Save model
model.save_model("BT_catboost_v1")

In [55]:
# Save files
df_train.to_csv("train_no_empty_cells.csv", index=False)
df_test.to_csv("test_no_empty_cells.csv", index=False)

# 3 Check new csv files