###  Многоклассовая классификация. Обучение на куске данных из dataset_labelencoder.csv
### Резюме

In [1]:
import pandas as pd
from catboost import Pool, CatBoostClassifier

from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
def fit_catboost(X_train, X_test, y_train, y_test, catboost_params=None,
                 verbose=100):
    """Fit a CatBoostClassifier on a single text feature and return the model.

    Both feature sets are wrapped in a ``Pool`` whose only feature is named
    ``"text"`` and is declared as a text feature.

    Args:
        X_train: Training features passed to ``Pool``.
        X_test: Evaluation features. They are passed as ``eval_set`` to
            ``fit``; the defaults below enable ``use_best_model`` and early
            stopping, both of which are driven by this set.
        y_train: Training labels.
        y_test: Evaluation labels.
        catboost_params: Optional mapping of CatBoost parameters that override
            the defaults defined below. ``None`` (the default) or an empty
            mapping keeps the defaults unchanged.
        verbose: Forwarded to ``model.fit``.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    learn_pool = Pool(
        X_train,
        y_train,
        text_features=["text"],
        feature_names=["text"]
    )
    test_pool = Pool(
        X_test,
        y_test,
        text_features=["text"],
        feature_names=["text"]
    )
    params = {
        'iterations': 15000,
        'learning_rate': 0.015,
        'eval_metric': 'F1',
        'task_type': 'GPU',
        'use_best_model': True,
        # Left disabled, as in the original:
        # "l2_leaf_reg": 515,
        # "random_strength":15,
        "early_stopping_rounds": 2000
    }
    # A None default replaces the shared mutable `{}` default; caller-supplied
    # values still win over the defaults above.
    if catboost_params:
        params.update(catboost_params)

    model = CatBoostClassifier(**params)
    model.fit(learn_pool, eval_set=test_pool, verbose=verbose)
    return model



In [3]:
# label_0
# Input CSV directory and output model directory (absolute, machine-specific paths).
path_data = "/home/ruslan515/PycharmProjects/bft/mod_ml/data/dataset_labelencoder_cv_label_"
path_model= "/home/ruslan515/PycharmProjects/bft/mod_ml/model/ml_model//"
i = 0
label = "label_{}".format(i)
print(f"train {label}")
# The file name previously had no `{}` placeholder, so `.format(i)` was a no-op
# and changing `i` would still have loaded the label_0 file. With the
# placeholder the name follows `i` (and is unchanged for i = 0).
file_data = os.path.join(path_data,
                         "dataset_labelencoder_cv_label_{}_DATA.csv".format(i))
# Load only the text column and the target, capped at the first 600,000 rows.
df_train = pd.read_csv(file_data, sep=";", usecols=["text", label],
                       nrows=6 * 10**5)

train label_0


In [4]:
# Class balance of label_0 in the loaded frame.
df_train.label_0.value_counts()

0    350084
1    249916
Name: label_0, dtype: int64

In [5]:
# (rows, columns) of the loaded frame.
df_train.shape

(600000, 2)

In [6]:
# Preview the first rows of the loaded frame.
df_train.head()

Unnamed: 0,label_0,text
0,1,"Дополнительные навыки: Добрая, отзывчивая, без..."
1,1,"Дополнительные навыки: стрессо устойчивость, у..."
2,1,"Дополнительные навыки: данные отсутствуют, Тип..."
3,1,"Дополнительные навыки: данные отсутствуют, Тип..."
4,1,"Дополнительные навыки: данные отсутствуют, Тип..."


In [7]:
# Class balance of label_0 (same call as in an earlier cell).
df_train.label_0.value_counts()

0    350084
1    249916
Name: label_0, dtype: int64

In [8]:

# Hold out 15% of the loaded rows as X_valid / y_valid, stratified on the label;
# the remaining 85% is kept as X / y. The seed is fixed for reproducibility.
X, X_valid, y, y_valid = train_test_split(df_train[["text"]],
                                          df_train[label],
                                          test_size=0.15,
                                          stratify=df_train[label],
                                          random_state=42)


In [9]:
# Shape of the 85% portion kept after the hold-out split.
X.shape

(510000, 1)

In [10]:
# Shape of the 15% hold-out features.
X_valid.shape

(90000, 1)

In [11]:
# Preview of X; the row labels are the original df_train index values.
X.head()

Unnamed: 0,text
254148,"Дополнительные навыки: данные отсутствуют, Тип..."
129739,"Дополнительные навыки: Активна, позитивна и жи..."
368978,"Дополнительные навыки: данные отсутствуют, Тип..."
114063,"Дополнительные навыки: Легко обучаемость, вним..."
286514,"Дополнительные навыки: данные отсутствуют, Тип..."


In [12]:
# Number of labels matching X.
y.shape

(510000,)

In [13]:
# Class balance in the 85% portion.
y.value_counts()

0    297571
1    212429
Name: label_0, dtype: int64

In [14]:
# Class balance in the 15% hold-out.
y_valid.value_counts()

0    52513
1    37487
Name: label_0, dtype: int64

In [15]:
# Split the 85% portion again: 80% for fitting (X_train / y_train) and 20%
# (X_test / y_test), stratified on y with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=52)

In [16]:
# Shape of the features used for fitting.
X_train.shape

(408000, 1)

In [17]:
# Number of labels used for fitting.
y_train.shape

(408000,)

In [18]:
# Class balance in the fitting split.
y_train.value_counts()

0    238057
1    169943
Name: label_0, dtype: int64

In [19]:
# Class balance in the 20% split that is passed to fit_catboost as X_test / y_test.
y_test.value_counts()

0    59514
1    42486
Name: label_0, dtype: int64

In [20]:
# Train with fit_catboost's default parameters; (X_test, y_test) is the
# evaluation set used during fitting.
cat_boost_model = fit_catboost(X_train, X_test, y_train, y_test)
# Predict on the 15% hold-out, which was not passed to fit_catboost.
y_pred = cat_boost_model.predict(X_valid)

0:	learn: 0.5838524	test: 0.5835841	best: 0.5835841 (0)	total: 25.2ms	remaining: 6m 18s
100:	learn: 0.6117848	test: 0.6115814	best: 0.6144102 (88)	total: 1.83s	remaining: 4m 29s
200:	learn: 0.6275541	test: 0.6275686	best: 0.6275686 (200)	total: 3.47s	remaining: 4m 15s
300:	learn: 0.6462844	test: 0.6473510	best: 0.6473510 (300)	total: 5.02s	remaining: 4m 4s
400:	learn: 0.6643174	test: 0.6649710	best: 0.6649710 (400)	total: 6.58s	remaining: 3m 59s
500:	learn: 0.6727954	test: 0.6739106	best: 0.6739106 (500)	total: 8.13s	remaining: 3m 55s
600:	learn: 0.6803363	test: 0.6813901	best: 0.6813901 (600)	total: 9.69s	remaining: 3m 52s
700:	learn: 0.6858617	test: 0.6873970	best: 0.6873970 (700)	total: 11.3s	remaining: 3m 49s
800:	learn: 0.6909429	test: 0.6918794	best: 0.6918794 (800)	total: 12.8s	remaining: 3m 47s
900:	learn: 0.6958504	test: 0.6959445	best: 0.6959815 (899)	total: 14.5s	remaining: 3m 46s
1000:	learn: 0.6997891	test: 0.6996576	best: 0.6996899 (998)	total: 16.1s	remaining: 3m 44s
110

8900:	learn: 0.7538169	test: 0.7422472	best: 0.7423166 (8893)	total: 2m 22s	remaining: 1m 37s
9000:	learn: 0.7540869	test: 0.7423352	best: 0.7424224 (8961)	total: 2m 23s	remaining: 1m 35s
9100:	learn: 0.7544135	test: 0.7422115	best: 0.7424224 (8961)	total: 2m 25s	remaining: 1m 34s
9200:	learn: 0.7546081	test: 0.7424100	best: 0.7424224 (8961)	total: 2m 27s	remaining: 1m 32s
9300:	learn: 0.7548382	test: 0.7424980	best: 0.7425248 (9278)	total: 2m 28s	remaining: 1m 31s
9400:	learn: 0.7550149	test: 0.7426958	best: 0.7426958 (9400)	total: 2m 30s	remaining: 1m 29s
9500:	learn: 0.7552902	test: 0.7428942	best: 0.7429299 (9485)	total: 2m 31s	remaining: 1m 27s
9600:	learn: 0.7554733	test: 0.7432101	best: 0.7432252 (9599)	total: 2m 33s	remaining: 1m 26s
9700:	learn: 0.7557716	test: 0.7432684	best: 0.7432890 (9678)	total: 2m 35s	remaining: 1m 24s
9800:	learn: 0.7559950	test: 0.7433528	best: 0.7434215 (9765)	total: 2m 36s	remaining: 1m 23s
9900:	learn: 0.7562940	test: 0.7433288	best: 0.7434215 (9765

In [21]:

# Report F1 on the hold-out predictions; f1_score is called with its default arguments.
print("===============================================\n")
print("\t\t\tf1 = ", f1_score(y_valid, y_pred))
print("===============================================\n")


			f1 =  0.7511349117894909



In [22]:
# Build the output path for this label's model inside path_model;
# the bare name on the last line displays it.
file_model = os.path.join(path_model, "model_label_cv_data_{}.dump".format(i))
file_model

'/home/ruslan515/PycharmProjects/bft/mod_ml/model/ml_model//model_label_cv_data_0.dump'

In [23]:
# Write the trained model to file_model.
cat_boost_model.save_model(file_model)

In [23]:
# Preview df_train again.
df_train.head()

Unnamed: 0,text,label_0
0,"Ученая степень: отсутствует, Дополнительные на...",1
1,"Ученая степень: отсутствует, Дополнительные на...",1
2,"Ученая степень: отсутствует, Дополнительные на...",1
3,"Ученая степень: отсутствует, Дополнительные на...",1
4,"Ученая степень: отсутствует, Дополнительные на...",1


In [24]:
# Re-read the same CSV with every column and no row cap. This rebinds df_train,
# so it no longer matches the frame the splits above were made from.
df_train = pd.read_csv(file_data, sep = ";")    

In [25]:
# Preview the reloaded frame with all of its columns.
df_train.head()

Unnamed: 0,id_candidate,id,text,label_0
0,9f701120-cb52-11ea-959f-7bf9d8e248ac,b3cd0b90-d59e-11ea-8e1c-736ab11edb0c,"Ученая степень: отсутствует, Дополнительные на...",1
1,dd379f80-6a53-11e8-9d34-e37b4be0b9ed,821380f0-6a59-11e8-9d34-e37b4be0b9ed,"Ученая степень: отсутствует, Дополнительные на...",1
2,a4588800-aa5c-11ea-902b-1fdc17069750,c78e3e90-aa5d-11ea-80fb-69632329477c,"Ученая степень: отсутствует, Дополнительные на...",1
3,a4588800-aa5c-11ea-902b-1fdc17069750,17415060-c004-11ea-935a-e37b4be0b9ed,"Ученая степень: отсутствует, Дополнительные на...",1
4,56b814a0-83c2-11ea-9710-ab5d2eb93a75,1af869a0-8427-11ea-89b0-037acc02728d,"Ученая степень: отсутствует, Дополнительные на...",1


In [26]:
# Select the row(s) belonging to one specific candidate id.
df_train[df_train.id_candidate == "469b0700-448b-11eb-83b1-25bddbcbae91"]

Unnamed: 0,id_candidate,id,text,label_0
250000,469b0700-448b-11eb-83b1-25bddbcbae91,d1380db0-451a-11eb-b3fe-ef76bd2a03c1,"Ученая степень: отсутствует, Дополнительные на...",0


In [27]:
# Wrap the text of the row with index label 250000 in a one-row frame that has
# the single "text" column the model's pools are built on.
X_validd = pd.DataFrame({"text": [df_train.loc[250000, "text"]]})
X_validd

Unnamed: 0,text
0,"Ученая степень: отсутствует, Дополнительные на..."


In [29]:
# Display the hold-out labels.
# NOTE(review): the saved output of this cell reports Length 75000, while the
# saved X_valid.shape output shows 90000 rows. The outputs appear to come from
# different runs; re-run on a fresh kernel to confirm.
y_valid

481100    0
454470    0
453502    0
336796    0
459926    0
         ..
130802    1
476762    0
328467    0
111447    1
432642    0
Name: label_0, Length: 75000, dtype: int64

In [30]:
# Predicted class for the single-row frame.
yy = cat_boost_model.predict(X_validd)
yy

array([0])

In [31]:
# Class probabilities for the same single-row frame.
z = cat_boost_model.predict_proba(X_validd)
z

array([[0.94674333, 0.05325667]])

In [33]:
# Index row 0 of z with every value in y_valid. The result has one entry per
# hold-out label, each being this single sample's probability for that class.
# NOTE(review): this looks unintended, since z holds one sample and y_valid
# holds the labels of the whole hold-out set. Presumably the goal was the
# probability of this sample's own true class; confirm and index with that label.
z[0,y_valid]

array([0.94674333, 0.94674333, 0.94674333, ..., 0.94674333, 0.05325667,
       0.94674333])

In [5]:
# label_1 - label_4
# For each label index: load its CSV, make the two stratified splits, train via
# fit_catboost, print F1 on the 15% hold-out and save the model.
path_data = "/home/ruslan515/PycharmProjects/bft/mod_ml/data/dataset_labelencoder_cv_label_"
path_model = "/home/ruslan515/PycharmProjects/bft/mod_ml/model/catboost_model/"
for i in range(1, 5):
    label = "label_{}".format(i)
    print(f"train {label}")
    file_data = os.path.join(
        path_data, "dataset_labelencoder_cv_label_{}.csv".format(i)
    )
    # Only the text column and this label's target are read.
    df_train = pd.read_csv(file_data, sep=";", usecols=["text", label])

    # 15% hold-out (X_valid / y_valid), stratified on the target.
    X, X_valid, y, y_valid = train_test_split(
        df_train[["text"]],
        df_train[label],
        test_size=0.15,
        stratify=df_train[label],
        random_state=42,
    )
    # The remaining rows are split 80/20 into fitting and evaluation sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=52,
    )

    cat_boost_model = fit_catboost(X_train, X_test, y_train, y_test)
    y_pred = cat_boost_model.predict(X_valid)

    print("===============================================\n")
    print("\t\t\tf1 = ", f1_score(y_valid, y_pred))
    print("===============================================\n")

    file_model = os.path.join(path_model, "model_label_cv_{}.dump".format(i))
    cat_boost_model.save_model(file_model)

train label_1
0:	learn: 0.7550919	test: 0.7624649	best: 0.7624649 (0)	total: 21.9ms	remaining: 5m 28s
100:	learn: 0.7604184	test: 0.7687228	best: 0.7687228 (100)	total: 1.78s	remaining: 4m 22s
200:	learn: 0.7693927	test: 0.7774066	best: 0.7774066 (200)	total: 3.41s	remaining: 4m 11s
300:	learn: 0.7750522	test: 0.7835091	best: 0.7835091 (300)	total: 4.94s	remaining: 4m 1s
400:	learn: 0.7797606	test: 0.7876200	best: 0.7876200 (400)	total: 6.41s	remaining: 3m 53s
500:	learn: 0.7840687	test: 0.7909409	best: 0.7909409 (500)	total: 7.89s	remaining: 3m 48s
600:	learn: 0.7872938	test: 0.7940502	best: 0.7941070 (599)	total: 9.36s	remaining: 3m 44s
700:	learn: 0.7894520	test: 0.7964824	best: 0.7965767 (699)	total: 10.9s	remaining: 3m 41s
800:	learn: 0.7915095	test: 0.7984978	best: 0.7984978 (799)	total: 12.3s	remaining: 3m 38s
900:	learn: 0.7932846	test: 0.8003439	best: 0.8003439 (900)	total: 13.8s	remaining: 3m 35s
1000:	learn: 0.7951483	test: 0.8018304	best: 0.8018678 (999)	total: 15.3s	remain

8900:	learn: 0.8256148	test: 0.8237874	best: 0.8238307 (8897)	total: 2m 10s	remaining: 1m 29s
9000:	learn: 0.8257316	test: 0.8237802	best: 0.8238307 (8897)	total: 2m 12s	remaining: 1m 28s
9100:	learn: 0.8259079	test: 0.8238631	best: 0.8238703 (9085)	total: 2m 13s	remaining: 1m 26s
9200:	learn: 0.8260659	test: 0.8238270	best: 0.8239371 (9155)	total: 2m 14s	remaining: 1m 25s
9300:	learn: 0.8262087	test: 0.8239371	best: 0.8239551 (9277)	total: 2m 16s	remaining: 1m 23s
9400:	learn: 0.8263966	test: 0.8240435	best: 0.8240724 (9311)	total: 2m 18s	remaining: 1m 22s
9500:	learn: 0.8265159	test: 0.8240850	best: 0.8241084 (9499)	total: 2m 20s	remaining: 1m 21s
9600:	learn: 0.8266632	test: 0.8241789	best: 0.8242294 (9546)	total: 2m 21s	remaining: 1m 19s
9700:	learn: 0.8268295	test: 0.8241662	best: 0.8242294 (9546)	total: 2m 23s	remaining: 1m 18s
9800:	learn: 0.8269363	test: 0.8242203	best: 0.8243069 (9766)	total: 2m 24s	remaining: 1m 16s
9900:	learn: 0.8271345	test: 0.8242257	best: 0.8243069 (9766

2300:	learn: 0.6145237	test: 0.6227151	best: 0.6228686 (2291)	total: 26.7s	remaining: 2m 27s
2400:	learn: 0.6154086	test: 0.6229947	best: 0.6231480 (2398)	total: 27.8s	remaining: 2m 26s
2500:	learn: 0.6158757	test: 0.6239911	best: 0.6241024 (2492)	total: 29s	remaining: 2m 24s
2600:	learn: 0.6167156	test: 0.6251529	best: 0.6252293 (2599)	total: 30.1s	remaining: 2m 23s
2700:	learn: 0.6172762	test: 0.6248818	best: 0.6253058 (2621)	total: 31.2s	remaining: 2m 21s
2800:	learn: 0.6181664	test: 0.6250834	best: 0.6253058 (2621)	total: 32.9s	remaining: 2m 23s
2900:	learn: 0.6191461	test: 0.6260483	best: 0.6260899 (2895)	total: 34.6s	remaining: 2m 24s
3000:	learn: 0.6198635	test: 0.6268110	best: 0.6268806 (2989)	total: 36.7s	remaining: 2m 26s
3100:	learn: 0.6207365	test: 0.6271440	best: 0.6272964 (3088)	total: 38.5s	remaining: 2m 27s
3200:	learn: 0.6213680	test: 0.6280579	best: 0.6280579 (3199)	total: 39.6s	remaining: 2m 25s
3300:	learn: 0.6226044	test: 0.6280992	best: 0.6283907 (3240)	total: 40.

11100:	learn: 0.6593961	test: 0.6457571	best: 0.6458595 (11012)	total: 2m 8s	remaining: 45.3s
11200:	learn: 0.6597950	test: 0.6460530	best: 0.6460883 (11157)	total: 2m 10s	remaining: 44.1s
11300:	learn: 0.6601565	test: 0.6463808	best: 0.6463808 (11295)	total: 2m 11s	remaining: 43s
11400:	learn: 0.6608705	test: 0.6466059	best: 0.6466379 (11391)	total: 2m 12s	remaining: 41.8s
11500:	learn: 0.6612499	test: 0.6466732	best: 0.6466765 (11410)	total: 2m 13s	remaining: 40.6s
11600:	learn: 0.6615361	test: 0.6470395	best: 0.6471841 (11567)	total: 2m 14s	remaining: 39.4s
11700:	learn: 0.6620264	test: 0.6474797	best: 0.6476627 (11679)	total: 2m 15s	remaining: 38.3s
11800:	learn: 0.6623964	test: 0.6475566	best: 0.6476658 (11773)	total: 2m 17s	remaining: 37.1s
11900:	learn: 0.6626364	test: 0.6474089	best: 0.6477043 (11805)	total: 2m 18s	remaining: 36s
12000:	learn: 0.6629599	test: 0.6474828	best: 0.6477043 (11805)	total: 2m 19s	remaining: 34.8s
12100:	learn: 0.6632185	test: 0.6475566	best: 0.6477043

4600:	learn: 0.6484473	test: 0.6424336	best: 0.6424336 (4598)	total: 57.1s	remaining: 2m 8s
4700:	learn: 0.6489650	test: 0.6426022	best: 0.6427884 (4681)	total: 58.3s	remaining: 2m 7s
4800:	learn: 0.6494562	test: 0.6434113	best: 0.6434291 (4789)	total: 59.4s	remaining: 2m 6s
4900:	learn: 0.6499849	test: 0.6437281	best: 0.6438543 (4885)	total: 1m	remaining: 2m 4s
5000:	learn: 0.6505530	test: 0.6443831	best: 0.6443831 (4999)	total: 1m 1s	remaining: 2m 3s
5100:	learn: 0.6510057	test: 0.6448033	best: 0.6448453 (5095)	total: 1m 3s	remaining: 2m 2s
5200:	learn: 0.6515855	test: 0.6451793	best: 0.6452233 (5181)	total: 1m 4s	remaining: 2m
5300:	learn: 0.6520336	test: 0.6452712	best: 0.6454751 (5223)	total: 1m 5s	remaining: 1m 59s
5400:	learn: 0.6523989	test: 0.6455410	best: 0.6456907 (5383)	total: 1m 6s	remaining: 1m 58s
5500:	learn: 0.6530153	test: 0.6460900	best: 0.6461319 (5496)	total: 1m 7s	remaining: 1m 57s
5600:	learn: 0.6534362	test: 0.6464271	best: 0.6465691 (5576)	total: 1m 8s	remainin

13400:	learn: 0.6812502	test: 0.6602926	best: 0.6605577 (13343)	total: 2m 40s	remaining: 19.2s
13500:	learn: 0.6814634	test: 0.6604361	best: 0.6606205 (13459)	total: 2m 42s	remaining: 18s
13600:	learn: 0.6818203	test: 0.6604378	best: 0.6606205 (13459)	total: 2m 43s	remaining: 16.8s
13700:	learn: 0.6820597	test: 0.6605992	best: 0.6606205 (13459)	total: 2m 44s	remaining: 15.6s
13800:	learn: 0.6823817	test: 0.6605796	best: 0.6606619 (13782)	total: 2m 45s	remaining: 14.4s
13900:	learn: 0.6826904	test: 0.6606227	best: 0.6608059 (13868)	total: 2m 46s	remaining: 13.2s
14000:	learn: 0.6830936	test: 0.6606843	best: 0.6608059 (13868)	total: 2m 48s	remaining: 12s
14100:	learn: 0.6834178	test: 0.6607671	best: 0.6608276 (14059)	total: 2m 49s	remaining: 10.8s
14200:	learn: 0.6835343	test: 0.6606467	best: 0.6608276 (14059)	total: 2m 50s	remaining: 9.59s
14300:	learn: 0.6838488	test: 0.6606266	best: 0.6608276 (14059)	total: 2m 51s	remaining: 8.39s
14400:	learn: 0.6840387	test: 0.6606064	best: 0.660827

6900:	learn: 0.6386548	test: 0.6390628	best: 0.6394179 (6717)	total: 1m 19s	remaining: 1m 33s
7000:	learn: 0.6388969	test: 0.6391488	best: 0.6394179 (6717)	total: 1m 20s	remaining: 1m 32s
7100:	learn: 0.6394620	test: 0.6395510	best: 0.6395510 (7098)	total: 1m 21s	remaining: 1m 31s
7200:	learn: 0.6398962	test: 0.6398418	best: 0.6399145 (7196)	total: 1m 22s	remaining: 1m 29s
7300:	learn: 0.6401701	test: 0.6399487	best: 0.6400214 (7279)	total: 1m 24s	remaining: 1m 28s
7400:	learn: 0.6405698	test: 0.6404230	best: 0.6404230 (7396)	total: 1m 25s	remaining: 1m 27s
7500:	learn: 0.6410653	test: 0.6402993	best: 0.6404230 (7396)	total: 1m 26s	remaining: 1m 26s
7600:	learn: 0.6413808	test: 0.6400171	best: 0.6404230 (7396)	total: 1m 27s	remaining: 1m 25s
7700:	learn: 0.6417103	test: 0.6400983	best: 0.6404230 (7396)	total: 1m 28s	remaining: 1m 24s
7800:	learn: 0.6421598	test: 0.6400641	best: 0.6404230 (7396)	total: 1m 30s	remaining: 1m 23s
7900:	learn: 0.6426346	test: 0.6399615	best: 0.6404230 (7396

In [3]:
# label_5
# Load the label's CSV, make the two stratified splits, train via fit_catboost,
# print F1 on the 15% hold-out and save the model.
path_data = "/home/ruslan515/PycharmProjects/bft/mod_ml/data/dataset_labelencoder_cv_label_"
path_model = "/home/ruslan515/PycharmProjects/bft/mod_ml/model/catboost_model/"
for i in range(5, 6):
    label = "label_{}".format(i)
    print(f"train {label}")
    file_data = os.path.join(
        path_data, "dataset_labelencoder_cv_label_{}.csv".format(i)
    )
    # Only the text column and this label's target are read.
    df_train = pd.read_csv(file_data, sep=";", usecols=["text", label])

    # 15% hold-out (X_valid / y_valid), stratified on the target.
    X, X_valid, y, y_valid = train_test_split(
        df_train[["text"]],
        df_train[label],
        test_size=0.15,
        stratify=df_train[label],
        random_state=42,
    )
    # The remaining rows are split 80/20 into fitting and evaluation sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=52,
    )

    cat_boost_model = fit_catboost(X_train, X_test, y_train, y_test)
    y_pred = cat_boost_model.predict(X_valid)

    print("===============================================\n")
    print("\t\t\tf1 = ", f1_score(y_valid, y_pred))
    print("===============================================\n")

    file_model = os.path.join(path_model, "model_label_cv_{}.dump".format(i))
    cat_boost_model.save_model(file_model)

train label_5
0:	learn: 0.7337862	test: 0.7366171	best: 0.7366171 (0)	total: 27.1ms	remaining: 6m 47s
100:	learn: 0.7541456	test: 0.7571228	best: 0.7571228 (100)	total: 1.89s	remaining: 4m 38s
200:	learn: 0.7655334	test: 0.7683086	best: 0.7683086 (200)	total: 3.61s	remaining: 4m 26s
300:	learn: 0.7721874	test: 0.7752542	best: 0.7752857 (299)	total: 5.23s	remaining: 4m 15s
400:	learn: 0.7768399	test: 0.7800909	best: 0.7800909 (400)	total: 6.81s	remaining: 4m 7s
500:	learn: 0.7803565	test: 0.7840080	best: 0.7840080 (500)	total: 8.37s	remaining: 4m 2s
600:	learn: 0.7831121	test: 0.7868951	best: 0.7868951 (600)	total: 9.9s	remaining: 3m 57s
700:	learn: 0.7853363	test: 0.7891970	best: 0.7891970 (700)	total: 11.5s	remaining: 3m 53s
800:	learn: 0.7873260	test: 0.7911372	best: 0.7911372 (800)	total: 13s	remaining: 3m 49s
900:	learn: 0.7889884	test: 0.7927440	best: 0.7927440 (900)	total: 14.5s	remaining: 3m 46s
1000:	learn: 0.7908111	test: 0.7946392	best: 0.7946392 (1000)	total: 16s	remaining: 

8900:	learn: 0.8262930	test: 0.8242196	best: 0.8242707 (8870)	total: 2m 18s	remaining: 1m 34s
9000:	learn: 0.8265157	test: 0.8245527	best: 0.8245646 (8999)	total: 2m 20s	remaining: 1m 33s
9100:	learn: 0.8267039	test: 0.8246468	best: 0.8247133 (9060)	total: 2m 21s	remaining: 1m 31s
9200:	learn: 0.8269138	test: 0.8246801	best: 0.8247133 (9060)	total: 2m 23s	remaining: 1m 30s
9300:	learn: 0.8271260	test: 0.8247026	best: 0.8247527 (9218)	total: 2m 24s	remaining: 1m 28s
9400:	learn: 0.8273454	test: 0.8249239	best: 0.8249406 (9398)	total: 2m 26s	remaining: 1m 27s
9500:	learn: 0.8274999	test: 0.8249381	best: 0.8250048 (9476)	total: 2m 28s	remaining: 1m 25s
9600:	learn: 0.8276825	test: 0.8250498	best: 0.8250581 (9599)	total: 2m 29s	remaining: 1m 24s
9700:	learn: 0.8279347	test: 0.8250642	best: 0.8251044 (9635)	total: 2m 31s	remaining: 1m 22s
9800:	learn: 0.8281063	test: 0.8252283	best: 0.8252485 (9792)	total: 2m 33s	remaining: 1m 21s
9900:	learn: 0.8282517	test: 0.8253352	best: 0.8253708 (9888

In [3]:
# label_6 - label_8
# For each label index: load its CSV, make the two stratified splits, train via
# fit_catboost, print F1 on the 15% hold-out and save the model.
path_data = "/home/ruslan515/PycharmProjects/bft/mod_ml/data/dataset_labelencoder_cv_label_"
path_model = "/home/ruslan515/PycharmProjects/bft/mod_ml/model/catboost_model/"
for i in range(6, 9):
    label = "label_{}".format(i)
    print(f"train {label}")
    file_data = os.path.join(
        path_data, "dataset_labelencoder_cv_label_{}.csv".format(i)
    )
    # Only the text column and this label's target are read.
    df_train = pd.read_csv(file_data, sep=";", usecols=["text", label])

    # 15% hold-out (X_valid / y_valid), stratified on the target.
    X, X_valid, y, y_valid = train_test_split(
        df_train[["text"]],
        df_train[label],
        test_size=0.15,
        stratify=df_train[label],
        random_state=42,
    )
    # The remaining rows are split 80/20 into fitting and evaluation sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=52,
    )

    cat_boost_model = fit_catboost(X_train, X_test, y_train, y_test)
    y_pred = cat_boost_model.predict(X_valid)

    print("===============================================\n")
    print("\t\t\tf1 = ", f1_score(y_valid, y_pred))
    print("===============================================\n")

    file_model = os.path.join(path_model, "model_label_cv_{}.dump".format(i))
    cat_boost_model.save_model(file_model)

train label_6
0:	learn: 0.7031108	test: 0.6996953	best: 0.6996953 (0)	total: 37.3ms	remaining: 9m 18s
100:	learn: 0.7087838	test: 0.7065319	best: 0.7067035 (98)	total: 1.88s	remaining: 4m 37s
200:	learn: 0.7173289	test: 0.7161397	best: 0.7161397 (200)	total: 3.54s	remaining: 4m 20s
300:	learn: 0.7243652	test: 0.7225138	best: 0.7225138 (300)	total: 5.12s	remaining: 4m 9s
400:	learn: 0.7303501	test: 0.7281981	best: 0.7281981 (400)	total: 6.66s	remaining: 4m 2s
500:	learn: 0.7341887	test: 0.7322604	best: 0.7322604 (500)	total: 8.15s	remaining: 3m 55s
600:	learn: 0.7373728	test: 0.7356533	best: 0.7356533 (600)	total: 9.76s	remaining: 3m 53s
700:	learn: 0.7401443	test: 0.7387725	best: 0.7388787 (698)	total: 11.2s	remaining: 3m 48s
800:	learn: 0.7421099	test: 0.7413103	best: 0.7413505 (799)	total: 12.8s	remaining: 3m 46s
900:	learn: 0.7440445	test: 0.7430327	best: 0.7430463 (898)	total: 14.2s	remaining: 3m 42s
1000:	learn: 0.7455587	test: 0.7445551	best: 0.7446178 (998)	total: 15.6s	remainin

8900:	learn: 0.7788693	test: 0.7687863	best: 0.7689119 (8864)	total: 2m 7s	remaining: 1m 27s
9000:	learn: 0.7790648	test: 0.7689298	best: 0.7689358 (8975)	total: 2m 9s	remaining: 1m 26s
9100:	learn: 0.7792669	test: 0.7688959	best: 0.7689976 (9032)	total: 2m 10s	remaining: 1m 24s
9200:	learn: 0.7794602	test: 0.7689537	best: 0.7690335 (9155)	total: 2m 12s	remaining: 1m 23s
9300:	learn: 0.7796691	test: 0.7691371	best: 0.7692427 (9276)	total: 2m 13s	remaining: 1m 21s
9400:	learn: 0.7798419	test: 0.7692009	best: 0.7692427 (9276)	total: 2m 15s	remaining: 1m 20s
9500:	learn: 0.7799912	test: 0.7693284	best: 0.7693882 (9448)	total: 2m 16s	remaining: 1m 18s
9600:	learn: 0.7802222	test: 0.7694958	best: 0.7695496 (9589)	total: 2m 17s	remaining: 1m 17s
9700:	learn: 0.7804837	test: 0.7697010	best: 0.7697010 (9700)	total: 2m 19s	remaining: 1m 16s
9800:	learn: 0.7806904	test: 0.7696930	best: 0.7697787 (9713)	total: 2m 20s	remaining: 1m 14s
9900:	learn: 0.7808418	test: 0.7697507	best: 0.7697886 (9885)	

2400:	learn: 0.7911983	test: 0.7965271	best: 0.7966334 (2384)	total: 31.4s	remaining: 2m 44s
2500:	learn: 0.7916408	test: 0.7967649	best: 0.7967888 (2427)	total: 32.7s	remaining: 2m 43s
2600:	learn: 0.7921655	test: 0.7972802	best: 0.7973376 (2589)	total: 34s	remaining: 2m 41s
2700:	learn: 0.7926508	test: 0.7977136	best: 0.7977301 (2643)	total: 35.2s	remaining: 2m 40s
2800:	learn: 0.7931350	test: 0.7977957	best: 0.7978367 (2776)	total: 36.5s	remaining: 2m 38s
2900:	learn: 0.7935027	test: 0.7980655	best: 0.7981880 (2875)	total: 37.8s	remaining: 2m 37s
3000:	learn: 0.7938153	test: 0.7984174	best: 0.7984499 (2998)	total: 39s	remaining: 2m 36s
3100:	learn: 0.7942894	test: 0.7986216	best: 0.7987766 (3073)	total: 40.3s	remaining: 2m 34s
3200:	learn: 0.7947450	test: 0.7992254	best: 0.7992335 (3173)	total: 41.5s	remaining: 2m 33s
3300:	learn: 0.7951185	test: 0.7995435	best: 0.7995680 (3299)	total: 42.8s	remaining: 2m 31s
3400:	learn: 0.7953510	test: 0.7997881	best: 0.7998615 (3368)	total: 44.1s

11200:	learn: 0.8118115	test: 0.8068196	best: 0.8068437 (11190)	total: 2m 22s	remaining: 48.2s
11300:	learn: 0.8119739	test: 0.8067876	best: 0.8069005 (11213)	total: 2m 23s	remaining: 47s
11400:	learn: 0.8122142	test: 0.8070459	best: 0.8070537 (11389)	total: 2m 24s	remaining: 45.7s
11500:	learn: 0.8123531	test: 0.8070381	best: 0.8070537 (11389)	total: 2m 26s	remaining: 44.4s
11600:	learn: 0.8125203	test: 0.8071353	best: 0.8072326 (11594)	total: 2m 27s	remaining: 43.2s
11700:	learn: 0.8126326	test: 0.8072404	best: 0.8072482 (11690)	total: 2m 28s	remaining: 41.9s
11800:	learn: 0.8128225	test: 0.8072886	best: 0.8073610 (11730)	total: 2m 29s	remaining: 40.6s
11900:	learn: 0.8129171	test: 0.8074170	best: 0.8074170 (11900)	total: 2m 31s	remaining: 39.3s
12000:	learn: 0.8130939	test: 0.8073354	best: 0.8074326 (11958)	total: 2m 32s	remaining: 38.1s
12100:	learn: 0.8132891	test: 0.8073673	best: 0.8074886 (12055)	total: 2m 33s	remaining: 36.8s
12200:	learn: 0.8134336	test: 0.8073992	best: 0.8074

4700:	learn: 0.6319256	test: 0.6420393	best: 0.6421087 (4679)	total: 52.3s	remaining: 1m 54s
4800:	learn: 0.6325624	test: 0.6424485	best: 0.6428803 (4770)	total: 53.4s	remaining: 1m 53s
4900:	learn: 0.6329865	test: 0.6425525	best: 0.6428803 (4770)	total: 54.5s	remaining: 1m 52s
5000:	learn: 0.6335325	test: 0.6428147	best: 0.6429227 (4988)	total: 55.6s	remaining: 1m 51s
5100:	learn: 0.6341286	test: 0.6430074	best: 0.6432616 (5048)	total: 56.7s	remaining: 1m 49s
5200:	learn: 0.6346697	test: 0.6429727	best: 0.6433348 (5136)	total: 57.8s	remaining: 1m 48s
5300:	learn: 0.6350436	test: 0.6433732	best: 0.6435542 (5297)	total: 58.8s	remaining: 1m 47s
5400:	learn: 0.6356518	test: 0.6429034	best: 0.6435542 (5297)	total: 59.9s	remaining: 1m 46s
5500:	learn: 0.6360294	test: 0.6430112	best: 0.6435542 (5297)	total: 1m	remaining: 1m 45s
5600:	learn: 0.6365987	test: 0.6434079	best: 0.6435542 (5297)	total: 1m 2s	remaining: 1m 44s
5700:	learn: 0.6370813	test: 0.6433808	best: 0.6435542 (5297)	total: 1m 3

13500:	learn: 0.6661072	test: 0.6542901	best: 0.6545648 (13354)	total: 2m 29s	remaining: 16.6s
13600:	learn: 0.6663549	test: 0.6543985	best: 0.6545648 (13354)	total: 2m 31s	remaining: 15.5s
13700:	learn: 0.6666942	test: 0.6544681	best: 0.6545648 (13354)	total: 2m 32s	remaining: 14.4s
13800:	learn: 0.6672074	test: 0.6546112	best: 0.6547543 (13755)	total: 2m 33s	remaining: 13.3s
13900:	learn: 0.6675283	test: 0.6545029	best: 0.6548240 (13841)	total: 2m 34s	remaining: 12.2s
14000:	learn: 0.6680229	test: 0.6548974	best: 0.6548974 (13999)	total: 2m 35s	remaining: 11.1s
14100:	learn: 0.6682610	test: 0.6550037	best: 0.6550037 (14090)	total: 2m 36s	remaining: 9.99s
14200:	learn: 0.6685815	test: 0.6552916	best: 0.6553282 (14194)	total: 2m 37s	remaining: 8.89s
14300:	learn: 0.6688742	test: 0.6553997	best: 0.6554693 (14224)	total: 2m 39s	remaining: 7.78s
14400:	learn: 0.6691759	test: 0.6553648	best: 0.6555426 (14324)	total: 2m 40s	remaining: 6.67s
14500:	learn: 0.6694959	test: 0.6552585	best: 0.65