In [3]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Wczytanie danych
data = pd.read_csv('transformed_data.csv')
data_class = data.copy()

# Przekształcanie 'LogReturn' na zmienną binarną
data_class['LogReturn'] = (data['LogReturn'] > 0).astype(int)

# Konwersja kolumny 'date' na daty
data_class['date'] = pd.to_datetime(data_class['date'])

# Ograniczenie danych do lat < 2019
data_class = data_class.loc[data_class['date'].dt.year < 2019]

# Przygotowanie zmiennych X i Y
X = data_class.drop(['LogReturn', 'date'], axis=1)
Y = data_class['LogReturn']

# Podział na zestaw treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Inicjalizacja modelu CatBoost
model = CatBoostClassifier(iterations=1000, 
                           learning_rate=0.1, 
                           depth=6, 
                           verbose=200)

# Trenowanie modelu
model.fit(X_train, y_train)

# Predykcja
y_pred = model.predict(X_test)

# Ocena dokładności
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


0:	learn: 0.6442913	total: 6.02ms	remaining: 6.02s
200:	learn: 0.1007418	total: 713ms	remaining: 2.83s
400:	learn: 0.0322701	total: 1.39s	remaining: 2.07s
600:	learn: 0.0153415	total: 2.08s	remaining: 1.38s
800:	learn: 0.0090979	total: 2.81s	remaining: 698ms
999:	learn: 0.0073243	total: 3.5s	remaining: 0us
Accuracy: 80.00%


In [4]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Wczytanie danych
data = pd.read_csv('transformed_data.csv')
data_class = data.copy()

# Obliczanie kwartylów
q1 = data['LogReturn'].quantile(0.25)  # 25. percentyl
q2 = data['LogReturn'].quantile(0.50)  # 50. percentyl (mediana)
q3 = data['LogReturn'].quantile(0.75)  # 75. percentyl

# Przekształcanie 'LogReturn' na 4 klasy na podstawie kwartylów
data_class['LogReturn'] = pd.cut(data['LogReturn'], bins=[-float('inf'), q1, q2, q3, float('inf')],
                                  labels=[0, 1, 2, 3])

# Konwersja kolumny 'date' na daty
data_class['date'] = pd.to_datetime(data_class['date'])

# Ograniczenie danych do lat < 2019
data_class = data_class.loc[data_class['date'].dt.year < 2019]

# Przygotowanie zmiennych X i Y
X = data_class.drop(['LogReturn', 'date'], axis=1)
Y = data_class['LogReturn']

# Podział na zestaw treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Inicjalizacja modelu CatBoost
model = CatBoostClassifier(iterations=1000, 
                           learning_rate=0.1, 
                           depth=6, 
                           verbose=200)

# Trenowanie modelu
model.fit(X_train, y_train)

# Predykcja
y_pred = model.predict(X_test)

# Ocena dokładności
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


0:	learn: 1.3270827	total: 22ms	remaining: 22s
200:	learn: 0.4889346	total: 1.97s	remaining: 7.83s
400:	learn: 0.2817987	total: 3.88s	remaining: 5.79s
600:	learn: 0.1750214	total: 5.92s	remaining: 3.93s
800:	learn: 0.1183106	total: 7.85s	remaining: 1.95s
999:	learn: 0.0867750	total: 9.77s	remaining: 0us
Accuracy: 57.73%
