In [61]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from catboost import Pool
import yaml

In [62]:
SEED: int = 666
TEST_SIZE: float = 0.2
DATA: str = 'data.csv'
ITERATIONS: int = 100
EARLY_STOPPING_ROUNDS: int = 200
L2_LEAF_REG: int = 50
DEPTH: int = 6
LEARNING_RATE: float = 0.05

In [63]:
df = pd.read_csv(DATA, index_col=0)
df

Unnamed: 0,target,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
...,...,...,...,...,...
145,virginica,6.7,3.0,5.2,2.3
146,virginica,6.3,2.5,5.0,1.9
147,virginica,6.5,3.0,5.2,2.0
148,virginica,6.2,3.4,5.4,2.3


In [64]:
target = 'target'
features = list(df.columns)
features.remove(target)
X = df[features]
y = df[target]

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X.values, y.values.reshape(-1), test_size=TEST_SIZE, random_state=SEED)

train_pool = Pool(
    data = X_train,
    label = y_train,
)

test_pool = Pool(
    data = X_test,
    label = y_test,
)

In [66]:
cb = CatBoostClassifier(
                            iterations=ITERATIONS,
                            loss_function='MultiClass',
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                            l2_leaf_reg=L2_LEAF_REG,
                            depth=DEPTH,
                            learning_rate=LEARNING_RATE
                          )

In [67]:
cb.fit(train_pool,
          eval_set=test_pool,
          verbose=True,
          plot=False
          )

0:	learn: 1.0873284	test: 1.0867296	best: 1.0867296 (0)	total: 1.77ms	remaining: 176ms
1:	learn: 1.0765592	test: 1.0758048	best: 1.0758048 (1)	total: 3.06ms	remaining: 150ms
2:	learn: 1.0667290	test: 1.0672705	best: 1.0672705 (2)	total: 4.29ms	remaining: 139ms
3:	learn: 1.0558478	test: 1.0555548	best: 1.0555548 (3)	total: 5.54ms	remaining: 133ms
4:	learn: 1.0457024	test: 1.0442916	best: 1.0442916 (4)	total: 6.58ms	remaining: 125ms
5:	learn: 1.0316673	test: 1.0299373	best: 1.0299373 (5)	total: 7.77ms	remaining: 122ms
6:	learn: 1.0233519	test: 1.0218627	best: 1.0218627 (6)	total: 8.79ms	remaining: 117ms
7:	learn: 1.0140671	test: 1.0129761	best: 1.0129761 (7)	total: 9.79ms	remaining: 113ms
8:	learn: 1.0022755	test: 1.0009647	best: 1.0009647 (8)	total: 10.9ms	remaining: 110ms
9:	learn: 0.9925370	test: 0.9897978	best: 0.9897978 (9)	total: 11.9ms	remaining: 107ms
10:	learn: 0.9844693	test: 0.9811421	best: 0.9811421 (10)	total: 13.2ms	remaining: 107ms
11:	learn: 0.9770065	test: 0.9736141	best

<catboost.core.CatBoostClassifier at 0x172a66de190>

In [68]:
preds = cb.predict(X_test)
preds

array([['versicolor'],
       ['virginica'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['setosa'],
       ['setosa'],
       ['setosa'],
       ['virginica'],
       ['versicolor'],
       ['setosa'],
       ['virginica'],
       ['virginica'],
       ['virginica'],
       ['versicolor'],
       ['setosa'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['setosa'],
       ['versicolor'],
       ['virginica'],
       ['virginica']], dtype=object)

In [69]:
y_test

array(['versicolor', 'virginica', 'versicolor', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'setosa', 'setosa',
       'virginica', 'versicolor', 'setosa', 'virginica', 'virginica',
       'virginica', 'versicolor', 'setosa', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'setosa', 'versicolor', 'virginica',
       'virginica'], dtype=object)

In [70]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='micro')
recall = recall_score(y_test, preds, average='micro')
f1 = f1_score(y_test, preds, average='micro')

In [71]:
metrics = {}

metrics['accuracy cb'] = float(accuracy)
metrics['precision cb'] = float(precision)
metrics['recall cb'] = float(recall)
metrics['f1 cb'] = float(f1)

with open('./outputs/metrics_cb.yaml', 'w') as file:
    yaml.dump(metrics, file, default_flow_style=False)

In [72]:
metrics

{'accuracy cb': 1.0, 'precision cb': 1.0, 'recall cb': 1.0, 'f1 cb': 1.0}

In [73]:
preds = [i[0] for i in preds]
preds

['versicolor',
 'virginica',
 'versicolor',
 'virginica',
 'setosa',
 'versicolor',
 'versicolor',
 'virginica',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'setosa',
 'setosa',
 'virginica',
 'versicolor',
 'setosa',
 'virginica',
 'virginica',
 'virginica',
 'versicolor',
 'setosa',
 'virginica',
 'setosa',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'virginica',
 'virginica']

In [74]:
test_classes = pd.DataFrame()

test_classes['actual_class'] = y_test
test_classes['predicted_class'] = preds
test_classes

Unnamed: 0,actual_class,predicted_class
0,versicolor,versicolor
1,virginica,virginica
2,versicolor,versicolor
3,virginica,virginica
4,setosa,setosa
5,versicolor,versicolor
6,versicolor,versicolor
7,virginica,virginica
8,versicolor,versicolor
9,versicolor,versicolor


In [75]:
train_classes = pd.DataFrame()

train_classes['actual_class'] = y_train
train_classes['predicted_class'] = [i[0] for i in cb.predict(X_train)]
train_classes

Unnamed: 0,actual_class,predicted_class
0,setosa,setosa
1,setosa,setosa
2,versicolor,versicolor
3,virginica,virginica
4,versicolor,versicolor
...,...,...
115,versicolor,versicolor
116,versicolor,versicolor
117,versicolor,virginica
118,versicolor,versicolor


In [76]:
test_classes.to_csv('outputs/test_classes_cb.csv', index=False)
train_classes.to_csv('outputs/train_classes_cb.csv', index=False)

In [77]:
import pickle

with open('./outputs/model_cb.pckl', 'wb') as file:
    pickle.dump(cb, file)