In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Wczytanie danych
data = pd.read_csv('transformed_data.csv')
data_class = data.copy()

# Przekształcanie 'LogReturn' na zmienną binarną
data_class['LogReturn'] = (data['LogReturn'] > 0).astype(int)

# Konwersja kolumny 'date' na daty
data_class['date'] = pd.to_datetime(data_class['date'])

# Ograniczenie danych do lat < 2019
data_class = data_class.loc[data_class['date'].dt.year < 2019]

# Przygotowanie zmiennych X i Y
X = data_class.drop(['LogReturn', 'date'], axis=1)
Y = data_class['LogReturn']

# Podział na zestaw treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Inicjalizacja modelu CatBoost
model = CatBoostClassifier(iterations=1000, 
                           learning_rate=0.1, 
                           depth=6, 
                           verbose=200)

# Trenowanie modelu
model.fit(X_train, y_train)

# Predykcja
y_pred = model.predict(X_test)

# Ocena dokładności
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


0:	learn: 0.6510616	total: 49.2ms	remaining: 49.2s
200:	learn: 0.1153915	total: 598ms	remaining: 2.38s
400:	learn: 0.0404678	total: 1.16s	remaining: 1.73s
600:	learn: 0.0200569	total: 1.7s	remaining: 1.13s
800:	learn: 0.0118731	total: 2.22s	remaining: 551ms
999:	learn: 0.0084850	total: 2.71s	remaining: 0us
Accuracy: 80.50%


In [2]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Wczytanie danych
data = pd.read_csv('transformed_data.csv')
data_class = data.copy()

# Obliczanie kwartylów
q1 = data['LogReturn'].quantile(0.25)  # 25. percentyl
q2 = data['LogReturn'].quantile(0.50)  # 50. percentyl (mediana)
q3 = data['LogReturn'].quantile(0.75)  # 75. percentyl

# Przekształcanie 'LogReturn' na 4 klasy na podstawie kwartylów
data_class['LogReturn'] = pd.cut(data['LogReturn'], bins=[-float('inf'), q1, q2, q3, float('inf')],
                                  labels=[0, 1, 2, 3])

# Konwersja kolumny 'date' na daty
data_class['date'] = pd.to_datetime(data_class['date'])

# Ograniczenie danych do lat < 2019
data_class = data_class.loc[data_class['date'].dt.year < 2019]

# Przygotowanie zmiennych X i Y
X = data_class.drop(['LogReturn', 'date'], axis=1)
Y = data_class['LogReturn']

# Podział na zestaw treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Inicjalizacja modelu CatBoost
model = CatBoostClassifier(iterations=1000, 
                           learning_rate=0.1, 
                           depth=6, 
                           verbose=200)

# Trenowanie modelu
model.fit(X_train, y_train)

# Predykcja
y_pred = model.predict(X_test)

# Ocena dokładności
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


0:	learn: 1.3359525	total: 10.6ms	remaining: 10.6s
200:	learn: 0.5210547	total: 1.36s	remaining: 5.4s
400:	learn: 0.3041352	total: 2.67s	remaining: 3.99s
600:	learn: 0.1943586	total: 4.06s	remaining: 2.69s
800:	learn: 0.1324197	total: 5.41s	remaining: 1.34s
999:	learn: 0.0968196	total: 6.76s	remaining: 0us
Accuracy: 59.41%
