In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from catboost import Pool
import yaml

In [2]:
SEED: int = 666
TEST_SIZE: float = 0.2
DATA: str = 'data.csv'
ITERATIONS: int = 100
EARLY_STOPPING_ROUNDS: int = 200
L2_LEAF_REG: int = 50
DEPTH: int = 6
LEARNING_RATE: float = 0.05

In [3]:
# Parameters
SEED = 666
DATA = "data.csv"
TEST_SIZE = 0.93
ITERATIONS = 1000
LEARNING_RATE = 0.05
L2_LEAF_REG = 50
EARLY_STOPPING_ROUNDS = 200
DEPTH = 5


In [4]:
df = pd.read_csv(DATA, index_col=0)
df

Unnamed: 0,target,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
...,...,...,...,...,...
145,virginica,6.7,3.0,5.2,2.3
146,virginica,6.3,2.5,5.0,1.9
147,virginica,6.5,3.0,5.2,2.0
148,virginica,6.2,3.4,5.4,2.3


In [5]:
target = 'target'
features = list(df.columns)
features.remove(target)
X = df[features]
y = df[target]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X.values, y.values.reshape(-1), test_size=TEST_SIZE, random_state=SEED)

train_pool = Pool(
    data = X_train,
    label = y_train,
)

test_pool = Pool(
    data = X_test,
    label = y_test,
)

In [7]:
cb = CatBoostClassifier(
                            iterations=ITERATIONS,
                            loss_function='MultiClass',
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                            l2_leaf_reg=L2_LEAF_REG,
                            depth=DEPTH,
                            learning_rate=LEARNING_RATE
                          )

In [8]:
cb.fit(train_pool,
          eval_set=test_pool,
          verbose=True,
          plot=False
          )

0:	learn: 1.0972408	test: 1.0976520	best: 1.0976520 (0)	total: 137ms	remaining: 2m 16s
1:	learn: 1.0954189	test: 1.0969024	best: 1.0969024 (1)	total: 138ms	remaining: 1m 8s
2:	learn: 1.0936010	test: 1.0959393	best: 1.0959393 (2)	total: 139ms	remaining: 46.1s
3:	learn: 1.0919359	test: 1.0951232	best: 1.0951232 (3)	total: 139ms	remaining: 34.7s
4:	learn: 1.0901912	test: 1.0939874	best: 1.0939874 (4)	total: 140ms	remaining: 27.9s
5:	learn: 1.0892224	test: 1.0934362	best: 1.0934362 (5)	total: 140ms	remaining: 23.3s
6:	learn: 1.0877951	test: 1.0926675	best: 1.0926675 (6)	total: 141ms	remaining: 20s
7:	learn: 1.0848707	test: 1.0914687	best: 1.0914687 (7)	total: 141ms	remaining: 17.5s
8:	learn: 1.0828959	test: 1.0907410	best: 1.0907410 (8)	total: 142ms	remaining: 15.6s
9:	learn: 1.0819971	test: 1.0900306	best: 1.0900306 (9)	total: 142ms	remaining: 14.1s
10:	learn: 1.0802051	test: 1.0888008	best: 1.0888008 (10)	total: 143ms	remaining: 12.8s
11:	learn: 1.0786708	test: 1.0878020	best: 1.0878020 

333:	learn: 0.6231115	test: 0.8637191	best: 0.8637191 (333)	total: 305ms	remaining: 608ms
334:	learn: 0.6220657	test: 0.8630606	best: 0.8630606 (334)	total: 305ms	remaining: 606ms
335:	learn: 0.6210316	test: 0.8622918	best: 0.8622918 (335)	total: 306ms	remaining: 604ms
336:	learn: 0.6197351	test: 0.8618438	best: 0.8618438 (336)	total: 306ms	remaining: 602ms
337:	learn: 0.6184432	test: 0.8615063	best: 0.8615063 (337)	total: 306ms	remaining: 600ms
338:	learn: 0.6178205	test: 0.8612326	best: 0.8612326 (338)	total: 307ms	remaining: 599ms
339:	learn: 0.6169906	test: 0.8610740	best: 0.8610740 (339)	total: 308ms	remaining: 597ms
340:	learn: 0.6161132	test: 0.8605247	best: 0.8605247 (340)	total: 308ms	remaining: 595ms
341:	learn: 0.6149642	test: 0.8601706	best: 0.8601706 (341)	total: 308ms	remaining: 593ms
342:	learn: 0.6136886	test: 0.8593652	best: 0.8593652 (342)	total: 309ms	remaining: 591ms
343:	learn: 0.6124176	test: 0.8592902	best: 0.8592902 (343)	total: 309ms	remaining: 589ms
344:	learn

656:	learn: 0.4054693	test: 0.7623379	best: 0.7623379 (656)	total: 460ms	remaining: 240ms
657:	learn: 0.4050529	test: 0.7620985	best: 0.7620985 (657)	total: 461ms	remaining: 239ms
658:	learn: 0.4047143	test: 0.7619605	best: 0.7619605 (658)	total: 461ms	remaining: 239ms
659:	learn: 0.4042989	test: 0.7617705	best: 0.7617705 (659)	total: 462ms	remaining: 238ms
660:	learn: 0.4038631	test: 0.7613461	best: 0.7613461 (660)	total: 462ms	remaining: 237ms
661:	learn: 0.4035446	test: 0.7612674	best: 0.7612674 (661)	total: 463ms	remaining: 236ms
662:	learn: 0.4031313	test: 0.7610789	best: 0.7610789 (662)	total: 463ms	remaining: 235ms
663:	learn: 0.4028147	test: 0.7609443	best: 0.7609443 (663)	total: 464ms	remaining: 235ms
664:	learn: 0.4023865	test: 0.7606401	best: 0.7606401 (664)	total: 464ms	remaining: 234ms
665:	learn: 0.4019755	test: 0.7606358	best: 0.7606358 (665)	total: 465ms	remaining: 233ms
666:	learn: 0.4015447	test: 0.7605898	best: 0.7605898 (666)	total: 465ms	remaining: 232ms
667:	learn

999:	learn: 0.2911717	test: 0.7428693	best: 0.7415586 (899)	total: 628ms	remaining: 0us

bestTest = 0.7415586456
bestIteration = 899

Shrink model to first 900 iterations.


<catboost.core.CatBoostClassifier at 0x1b360c08510>

In [9]:
preds = cb.predict(X_test)
preds

array([['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['setosa'],
       ['setosa'],
       ['virginica'],
       ['versicolor'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['setosa'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['setosa'],
       ['seto

In [10]:
y_test

array(['versicolor', 'virginica', 'versicolor', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'setosa', 'setosa',
       'virginica', 'versicolor', 'setosa', 'virginica', 'virginica',
       'virginica', 'versicolor', 'setosa', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'setosa', 'versicolor', 'virginica',
       'virginica', 'setosa', 'setosa', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'virginica', 'virginica', 'setosa',
       'versicolor', 'virginica', 'virginica', 'versicolor', 'versicolor',
       'setosa', 'setosa', 'setosa', 'setosa', 'versicolor', 'setosa',
       'setosa', 'virginica', 'versicolor', 'versicolor', 'versicolor',
       'setosa', 'setosa', 'virginica', 'virginica', 'versicolor',
       'virginica', 'setosa', 'versicolor', 'versicolor', 'setosa',
       'versicolor', 'setosa', 'setosa', 'virginica', 'virginica',
       'virginica', 'versicolor', '

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='micro')
recall = recall_score(y_test, preds, average='micro')
f1 = f1_score(y_test, preds, average='micro')

In [12]:
metrics = {}

metrics['accuracy cb'] = float(accuracy)
metrics['precision cb'] = float(precision)
metrics['recall cb'] = float(recall)
metrics['f1 cb'] = float(f1)

with open('./outputs/metrics_cb.yaml', 'w') as file:
    yaml.dump(metrics, file, default_flow_style=False)

In [13]:
metrics

{'accuracy cb': 0.7, 'precision cb': 0.7, 'recall cb': 0.7, 'f1 cb': 0.7}

In [14]:
preds = [i[0] for i in preds]
preds

['versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'setosa',
 'virginica',
 'versicolor',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'virginica',
 'setosa',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'versicolor',
 'virginica',
 'setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'virginica',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor

In [15]:
test_classes = pd.DataFrame()

test_classes['actual_class'] = y_test
test_classes['predicted_class'] = preds
test_classes

Unnamed: 0,actual_class,predicted_class
0,versicolor,versicolor
1,virginica,versicolor
2,versicolor,versicolor
3,virginica,versicolor
4,setosa,setosa
...,...,...
135,setosa,setosa
136,setosa,setosa
137,virginica,virginica
138,virginica,versicolor


In [16]:
train_classes = pd.DataFrame()

train_classes['actual_class'] = y_train
train_classes['predicted_class'] = [i[0] for i in cb.predict(X_train)]
train_classes

Unnamed: 0,actual_class,predicted_class
0,setosa,setosa
1,setosa,setosa
2,versicolor,versicolor
3,setosa,setosa
4,versicolor,versicolor
5,versicolor,versicolor
6,versicolor,versicolor
7,versicolor,versicolor
8,versicolor,versicolor
9,virginica,virginica


In [17]:
test_classes.to_csv('outputs/test_classes_cb.csv', index=False)
train_classes.to_csv('outputs/train_classes_cb.csv', index=False)

In [18]:
import pickle

with open('./outputs/model_cb.pckl', 'wb') as file:
    pickle.dump(cb, file)