In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
# loading the dataset from the csv file
dataset = pd.read_csv("chunks.csv",header=None)
y = dataset.iloc[:,-1]
X = dataset.drop(dataset.columns[[-1]], axis=1)

In [3]:
# splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train_positive = X_train[y_train>0]
X_train_negative = X_train[y_train==0][:X_train_positive.shape[0]]
y_train_positive = y_train[y_train>0]
y_train_negative = y_train[y_train==0][:X_train_positive.shape[0]]

X_train = np.vstack((X_train_positive,X_train_negative))
y_train = np.hstack((y_train_positive, y_train_negative))

# parameters for the GridSearch
parameters = {'depth'         : [6,7,8,9, 10],
              'learning_rate' : [0.01,0.02,0.03,0.04],
              'iterations'    : [10, 20,30,40,50,60,70,80,90, 100]
              }

catboostclass = CatBoostClassifier()

# training the model
grid_catboost = GridSearchCV(estimator=catboostclass, param_grid = parameters, scoring=accuracy_score)
# Fit model
grid_catboost.fit(X_train, y_train)

print(f"Best score: {grid_catboost.best_score_}")

best_params = grid_catboost.best_params_
print(best_params)

# retrive best model
catboost = grid_catboost.best_estimator_

# Test the model
y_pred = catboost.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)

print("Precision")
print(precision)

print("Recall")
print(recall)

print(f"Accuracy of chunk selection: {round(accuracy,2)*100}%")



0:	learn: 0.6916051	total: 193ms	remaining: 1.73s
1:	learn: 0.6907099	total: 206ms	remaining: 825ms
2:	learn: 0.6896092	total: 221ms	remaining: 516ms
3:	learn: 0.6882906	total: 236ms	remaining: 353ms
4:	learn: 0.6875912	total: 251ms	remaining: 251ms
5:	learn: 0.6860816	total: 267ms	remaining: 178ms
6:	learn: 0.6849809	total: 281ms	remaining: 121ms
7:	learn: 0.6834732	total: 297ms	remaining: 74.2ms
8:	learn: 0.6823475	total: 310ms	remaining: 34.4ms
9:	learn: 0.6814392	total: 326ms	remaining: 0us


Traceback (most recent call last):
  File "c:\Users\boezi\VisualStudioProjects\PodcastSummarization\env\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
TypeError: accuracy_score() takes 2 positional arguments but 3 were given



0:	learn: 0.6917796	total: 18.4ms	remaining: 166ms
1:	learn: 0.6909318	total: 32.1ms	remaining: 128ms
2:	learn: 0.6899321	total: 51.4ms	remaining: 120ms
3:	learn: 0.6889035	total: 67.2ms	remaining: 101ms
4:	learn: 0.6879413	total: 80.6ms	remaining: 80.6ms
5:	learn: 0.6869963	total: 96.8ms	remaining: 64.5ms
6:	learn: 0.6860589	total: 117ms	remaining: 50ms
7:	learn: 0.6850828	total: 135ms	remaining: 33.7ms
8:	learn: 0.6842531	total: 154ms	remaining: 17.1ms
9:	learn: 0.6832025	total: 170ms	remaining: 0us
0:	learn: 0.6917757	total: 18.9ms	remaining: 170ms
1:	learn: 0.6907182	total: 33.3ms	remaining: 133ms
2:	learn: 0.6895901	total: 48.8ms	remaining: 114ms
3:	learn: 0.6888132	total: 62.6ms	remaining: 93.9ms
4:	learn: 0.6877170	total: 77.4ms	remaining: 77.4ms
5:	learn: 0.6867539	total: 89.3ms	remaining: 59.6ms
6:	learn: 0.6859194	total: 104ms	remaining: 44.8ms
7:	learn: 0.6843589	total: 119ms	remaining: 29.7ms
8:	learn: 0.6833807	total: 133ms	remaining: 14.8ms
9:	learn: 0.6822726	total: 146m

KeyboardInterrupt: 