In [10]:
import pandas as pd
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from catboost import CatBoostClassifier, Pool, metrics


# Data preparation

In [5]:
config_file = '../references/config.json'
with open(config_file, "r") as f:
    config = json.load(f)

# Extract values from the configuration file
texts_data_path = config["data_paths"]["texts"]
embeddings_data_path = config["data_paths"]["embeddings"]
full_df_data_path = config["data_paths"]["full_df"]

In [7]:
df = pd.read_csv(full_df_data_path, index_col=0)
features = df.iloc[:, :-1].values
labels = df.iloc[:, -1].values
features.shape, labels.shape

((2237, 1068), (2237,))

# CatBoost pipeline

In [16]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)

In [22]:
params = {
    'task_type': 'CPU',
    'n_estimators': 1000,
    'learning_rate': 1e-2,
    'depth': 3,
    'l2_leaf_reg': 5,
    'leaf_estimation_method': 'Newton',

    'eval_metric': metrics.Accuracy(),
    'loss_function': 'MultiClass',
    'random_seed': 42,
    'use_best_model': False
}

train_pool = Pool(X_train, y_train)
validate_pool = Pool(X_test, y_test)

In [23]:
model = CatBoostClassifier(**params)
model.fit(
    train_pool, 
    eval_set=validate_pool,
    verbose=100
    )

0:	learn: 0.5290735	test: 0.5223214	best: 0.5223214 (0)	total: 40.4ms	remaining: 40.3s
100:	learn: 0.6153355	test: 0.5907738	best: 0.5907738 (96)	total: 3.68s	remaining: 32.7s
200:	learn: 0.6402556	test: 0.5967262	best: 0.6026786 (178)	total: 7.38s	remaining: 29.3s
300:	learn: 0.6645367	test: 0.6145833	best: 0.6175595 (286)	total: 11s	remaining: 25.4s
400:	learn: 0.6734824	test: 0.6130952	best: 0.6175595 (286)	total: 14.6s	remaining: 21.8s
500:	learn: 0.6869010	test: 0.6235119	best: 0.6235119 (492)	total: 18.1s	remaining: 18s
600:	learn: 0.6971246	test: 0.6220238	best: 0.6264881 (579)	total: 21.7s	remaining: 14.4s
700:	learn: 0.7099042	test: 0.6264881	best: 0.6309524 (676)	total: 25.2s	remaining: 10.8s
800:	learn: 0.7303514	test: 0.6369048	best: 0.6383929 (789)	total: 29s	remaining: 7.2s
900:	learn: 0.7405751	test: 0.6398810	best: 0.6413690 (878)	total: 32.7s	remaining: 3.59s
999:	learn: 0.7488818	test: 0.6443452	best: 0.6443452 (944)	total: 36.4s	remaining: 0us

bestTest = 0.644345238

<catboost.core.CatBoostClassifier at 0x7faab5c11410>

In [31]:
target_names = ['negative', 'positive', 'neutral']  # name of each class
print('-'*15, 'Train classification_report', '-'*15, '\n')
print(classification_report(y_train, model.predict(X_train), target_names=target_names))
print(' ')
print('-'*15, 'Test classification_report', '-'*15, '\n')
print(classification_report(y_test, model.predict(X_test), target_names=target_names))

--------------- Train classification_report --------------- 

              precision    recall  f1-score   support

    negative       0.74      0.79      0.76       475
    positive       0.74      0.89      0.81       668
     neutral       0.80      0.49      0.61       422

    accuracy                           0.75      1565
   macro avg       0.76      0.72      0.72      1565
weighted avg       0.76      0.75      0.74      1565

 
--------------- Test classification_report --------------- 

              precision    recall  f1-score   support

    negative       0.69      0.67      0.68       219
    positive       0.65      0.82      0.72       278
     neutral       0.54      0.34      0.42       175

    accuracy                           0.64       672
   macro avg       0.63      0.61      0.61       672
weighted avg       0.63      0.64      0.63       672

