In [43]:
import pandas as pd 
from sklearn.metrics import classification_report, f1_score
from catboost import CatBoostClassifier, Pool
from itertools import product

In [49]:
# Read only the text and label columns from both CSV files.
# The file data/valid.csv is the held-out set that this notebook calls `test`.
train, test = (
    pd.read_csv(csv_path, usecols=['text', 'sentiment'])
    for csv_path in ('data/train.csv', 'data/valid.csv')
)

In [50]:
# Build CatBoost Pool objects for working with text.
# Column 0 (the only column passed in) is declared as a text feature.
# The evaluation pool is built without labels.
train_pool = Pool(
    data=train['text'],
    label=train['sentiment'],
    text_features=[0],
)
test_pool = Pool(
    data=test['text'],
    text_features=[0],
)

In [52]:
# Candidate values for the hyperparameter search, keyed by CatBoost parameter name.
params = dict(
    learning_rate=[0.01, 0.05, 0.1],
    iterations=[300, 500, 1000],
    depth=[5, 7, 9],
    l2_leaf_reg=[1, 3, 5],
)

In [54]:
# GridSearch does not support Pool directly, so the hyperparameters are
# searched with an explicit loop over every combination in `params`.
# NOTE(review): the held-out set (`test`) is used here to pick the
# hyperparameters and is also the set the final score is reported on, so that
# score is optimistic -- consider a separate validation split.

# Start below any reachable F1 so the first evaluated combination is always
# recorded (with 0 as the start, an all-zero run would leave best_params None).
best_score = float('-inf')
best_params = None

# The loop variable is `n_iter`, not `iter`: notebook loop variables stay in the
# kernel namespace after the cell finishes and would shadow the builtin iter().
for n_iter, lr, depth, l2 in product(
    params['iterations'],
    params['learning_rate'],
    params['depth'],
    params['l2_leaf_reg']
):
    model = CatBoostClassifier(
        iterations=n_iter,
        learning_rate=lr,
        depth=depth,
        l2_leaf_reg=l2,
        text_features=[0],
        loss_function='MultiClass',
        verbose=0
    )

    model.fit(train_pool)
    preds = model.predict(test_pool)
    score = f1_score(test['sentiment'], preds, average='weighted')

    print(f'Score={score:.4f} for iterations={n_iter}, learning rate={lr}, depth={depth}, l2 reg={l2}')

    if score > best_score:
        best_score = score
        # Keys use the CatBoostClassifier keyword names so the dict can be
        # passed straight back as CatBoostClassifier(**best_params, ...).
        best_params = {'iterations': n_iter, 'learning_rate': lr, 'depth': depth, 'l2_leaf_reg': l2}

print('Best params:', best_params)

Score=0.7141 for iterations=300, learning rate=0.01, depth=5, l2 reg=1
Score=0.7147 for iterations=300, learning rate=0.01, depth=5, l2 reg=3
Score=0.7144 for iterations=300, learning rate=0.01, depth=5, l2 reg=5
Score=0.7200 for iterations=300, learning rate=0.01, depth=7, l2 reg=1
Score=0.7198 for iterations=300, learning rate=0.01, depth=7, l2 reg=3
Score=0.7203 for iterations=300, learning rate=0.01, depth=7, l2 reg=5
Score=0.7241 for iterations=300, learning rate=0.01, depth=9, l2 reg=1
Score=0.7245 for iterations=300, learning rate=0.01, depth=9, l2 reg=3
Score=0.7241 for iterations=300, learning rate=0.01, depth=9, l2 reg=5
Score=0.7321 for iterations=300, learning rate=0.05, depth=5, l2 reg=1
Score=0.7333 for iterations=300, learning rate=0.05, depth=5, l2 reg=3
Score=0.7325 for iterations=300, learning rate=0.05, depth=5, l2 reg=5
Score=0.7372 for iterations=300, learning rate=0.05, depth=7, l2 reg=1
Score=0.7367 for iterations=300, learning rate=0.05, depth=7, l2 reg=3
Score=

In [55]:
# Train the final classifier with fixed hyperparameters on the training pool.
final_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 9,
    'l2_leaf_reg': 3,
    'text_features': [0],
    'loss_function': 'MultiClass',
    'verbose': 0,
}
cat_boost_model = CatBoostClassifier(**final_params)

# fit() stays the last expression of the cell, so the notebook echoes its result.
cat_boost_model.fit(train_pool)

<catboost.core.CatBoostClassifier at 0x29e4699cb00>

In [57]:
# Score the final model on the held-out set with the weighted F1 metric.
pred = cat_boost_model.predict(test_pool)
# The score is computed outside the f-string: reusing single quotes inside a
# single-quoted f-string only parses on Python 3.12+ (PEP 701) and is a
# SyntaxError on earlier versions.
final_f1 = f1_score(test['sentiment'], pred, average='weighted')
print(f'f1-score: {final_f1}')

f1-score: 0.7573653851652008


In [58]:
# Persist the final, evaluated classifier. The bare name `model` refers to the
# last candidate left over from the grid-search loop, which was trained with
# different hyperparameters than the model that was scored.
cat_boost_model.save_model("model.cbm")