In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from catboost import Pool
import yaml

In [2]:
SEED: int = 666
TEST_SIZE: float = 0.2
DATA: str = 'data.csv'
ITERATIONS: int = 100
EARLY_STOPPING_ROUNDS: int = 200
L2_LEAF_REG: int = 50
DEPTH: int = 6
LEARNING_RATE: float = 0.05

In [3]:
# Parameters
SEED = 666
DATA = "data.csv"
TEST_SIZE = 0.95
ITERATIONS = 100
LEARNING_RATE = 0.05
L2_LEAF_REG = 50
EARLY_STOPPING_ROUNDS = 200
DEPTH = 6


In [4]:
df = pd.read_csv(DATA, index_col=0)
df

Unnamed: 0,target,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
...,...,...,...,...,...
145,virginica,6.7,3.0,5.2,2.3
146,virginica,6.3,2.5,5.0,1.9
147,virginica,6.5,3.0,5.2,2.0
148,virginica,6.2,3.4,5.4,2.3


In [5]:
target = 'target'
features = list(df.columns)
features.remove(target)
X = df[features]
y = df[target]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X.values, y.values.reshape(-1), test_size=TEST_SIZE, random_state=SEED)

train_pool = Pool(
    data = X_train,
    label = y_train,
)

test_pool = Pool(
    data = X_test,
    label = y_test,
)

In [7]:
cb = CatBoostClassifier(
                            iterations=ITERATIONS,
                            loss_function='MultiClass',
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                            l2_leaf_reg=L2_LEAF_REG,
                            depth=DEPTH,
                            learning_rate=LEARNING_RATE
                          )

In [8]:
cb.fit(train_pool,
          eval_set=test_pool,
          verbose=True,
          plot=False
          )

0:	learn: 1.0973211	test: 1.0983629	best: 1.0983629 (0)	total: 143ms	remaining: 14.2s
1:	learn: 1.0961050	test: 1.0977806	best: 1.0977806 (1)	total: 145ms	remaining: 7.08s
2:	learn: 1.0952584	test: 1.0974686	best: 1.0974686 (2)	total: 147ms	remaining: 4.75s
3:	learn: 1.0939730	test: 1.0971795	best: 1.0971795 (3)	total: 147ms	remaining: 3.54s
4:	learn: 1.0928561	test: 1.0968559	best: 1.0968559 (4)	total: 149ms	remaining: 2.82s
5:	learn: 1.0921978	test: 1.0966878	best: 1.0966878 (5)	total: 150ms	remaining: 2.36s
6:	learn: 1.0913549	test: 1.0960306	best: 1.0960306 (6)	total: 151ms	remaining: 2.01s
7:	learn: 1.0905128	test: 1.0954512	best: 1.0954512 (7)	total: 153ms	remaining: 1.76s
8:	learn: 1.0896715	test: 1.0950353	best: 1.0950353 (8)	total: 154ms	remaining: 1.56s
9:	learn: 1.0888309	test: 1.0946309	best: 1.0946309 (9)	total: 156ms	remaining: 1.4s
10:	learn: 1.0876252	test: 1.0942174	best: 1.0942174 (10)	total: 158ms	remaining: 1.27s
11:	learn: 1.0858810	test: 1.0935239	best: 1.0935239 

<catboost.core.CatBoostClassifier at 0x2d754f9d0d0>

In [9]:
preds = cb.predict(X_test)
preds

array([['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['virginica'],
       ['setosa'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor'],
       ['versicolor

In [10]:
y_test

array(['versicolor', 'virginica', 'versicolor', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'setosa', 'setosa',
       'virginica', 'versicolor', 'setosa', 'virginica', 'virginica',
       'virginica', 'versicolor', 'setosa', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'setosa', 'versicolor', 'virginica',
       'virginica', 'setosa', 'setosa', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'virginica', 'virginica', 'setosa',
       'versicolor', 'virginica', 'virginica', 'versicolor', 'versicolor',
       'setosa', 'setosa', 'setosa', 'setosa', 'versicolor', 'setosa',
       'setosa', 'virginica', 'versicolor', 'versicolor', 'versicolor',
       'setosa', 'setosa', 'virginica', 'virginica', 'versicolor',
       'virginica', 'setosa', 'versicolor', 'versicolor', 'setosa',
       'versicolor', 'setosa', 'setosa', 'virginica', 'virginica',
       'virginica', 'versicolor', '

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, average='micro')
recall = recall_score(y_test, preds, average='micro')
f1 = f1_score(y_test, preds, average='micro')

In [12]:
metrics = {}

metrics['accuracy cb'] = float(accuracy)
metrics['precision cb'] = float(precision)
metrics['recall cb'] = float(recall)
metrics['f1 cb'] = float(f1)

with open('./outputs/metrics_cb.yaml', 'w') as file:
    yaml.dump(metrics, file, default_flow_style=False)

In [13]:
metrics

{'accuracy cb': 0.46153846153846156,
 'precision cb': 0.46153846153846156,
 'recall cb': 0.46153846153846156,
 'f1 cb': 0.46153846153846156}

In [14]:
preds = [i[0] for i in preds]
preds

['versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'virginica',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'virginica',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'virginica',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'virginica',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 've

In [15]:
test_classes = pd.DataFrame()

test_classes['actual_class'] = y_test
test_classes['predicted_class'] = preds
test_classes

Unnamed: 0,actual_class,predicted_class
0,versicolor,versicolor
1,virginica,versicolor
2,versicolor,versicolor
3,virginica,versicolor
4,setosa,versicolor
...,...,...
138,virginica,versicolor
139,versicolor,versicolor
140,setosa,versicolor
141,setosa,setosa


In [16]:
train_classes = pd.DataFrame()

train_classes['actual_class'] = y_train
train_classes['predicted_class'] = [i[0] for i in cb.predict(X_train)]
train_classes

Unnamed: 0,actual_class,predicted_class
0,setosa,setosa
1,versicolor,versicolor
2,versicolor,versicolor
3,versicolor,versicolor
4,versicolor,versicolor
5,versicolor,versicolor
6,virginica,virginica


In [17]:
test_classes.to_csv('outputs/test_classes_cb.csv', index=False)
train_classes.to_csv('outputs/train_classes_cb.csv', index=False)

In [18]:
import pickle

with open('./outputs/model_cb.pckl', 'wb') as file:
    pickle.dump(cb, file)