# Analysis of Machine Learning Effectiveness in Decentralized Financial Markets
**Author:** Paweł Winnik  
**Project:** Bachelor's Thesis Implementation  
**Data Range:** 2015-01-01 to 2023-12-31  

## Project Overview
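The application compares several ML models (logistic regression, random forest, SVM, MLP and LSTM) at predicting the next-day direction of the Bitcoin price from technical indicators, and backtests each model's trading signals against a Buy & Hold benchmark.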

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import talib
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix)
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

warnings.filterwarnings('ignore')

# ==========================================
# 1. CONFIGURATION (CORRECTED)
# ==========================================
# Global run settings read by the data, backtest and reporting steps.
CONFIG = dict(
    ticker='BTC-USD',
    period='max',
    commission=0.0015,       # 0.15% commission
    initial_capital=10000,   # starting capital (e.g. 10,000 USD)
    test_size=0.2,           # 20% of the data is held out for testing
)

# Look-back windows for the technical indicators.
PARAMS_DAILY = dict(
    sma_windows=[5, 10, 20, 50],
    rsi_window=14,
    macd_fast=12,
    macd_slow=26,
    macd_signal=9,
    volatility_window=5,
    atr_window=14,
)

# Plot aesthetics. The seaborn theme is applied first so that the explicit
# rcParams overrides below are set after it.
sns.set_theme(style="whitegrid", context="paper", font_scale=1.1)
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'savefig.dpi': 300,
    'font.family': 'sans-serif',
})

# One fixed colour per model so every chart uses the same colour coding.
MODEL_COLORS = {
    'LogisticRegression': '#1f77b4',
    'RandomForest': '#2ca02c',
    'SVM': '#ff7f0e',
    'MLP': '#9467bd',
    'LSTM': '#d62728',
    'Buy & Hold': '#7f7f7f',
}

# ==========================================
# 2. MODUŁ WIZUALIZACJI (THESIS READY)
# ==========================================
class ThesisVisualizer:
    """Render the thesis figures and save each one as a PNG in ``save_dir``."""

    def __init__(self, save_dir='plots/'):
        """Remember the output directory and create it if it does not exist.

        Args:
            save_dir: Directory that receives the saved figures.
        """
        self.save_dir = save_dir
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(save_dir, exist_ok=True)

    def save_and_show(self, filename):
        """Save the current matplotlib figure under ``filename`` and display it.

        Args:
            filename: File name (relative to ``save_dir``) for the saved image.
        """
        path = os.path.join(self.save_dir, filename)
        plt.tight_layout()
        plt.savefig(path, bbox_inches='tight')
        print(f"Zapisano wykres: {path}")
        plt.show()

    def plot_confusion_matrices(self, results_dict):
        """Generate Figs. 53-55: one confusion-matrix heatmap per model.

        Args:
            results_dict: Mapping of model name to a dict that may hold a
                confusion matrix under the key ``'cm'``. The ``'Buy & Hold'``
                entry and entries without ``'cm'`` are skipped.
        """
        models = [
            name for name, result in results_dict.items()
            if name != 'Buy & Hold' and 'cm' in result
        ]
        if not models:
            # Nothing to draw: a zero-row grid cannot be created.
            print("Brak macierzy pomyłek do narysowania.")
            return

        cols = 3
        rows = -(-len(models) // cols)  # ceiling division

        # squeeze=False always yields a 2-D array, so ravel() is always valid.
        fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
        axes = axes.ravel()

        for ax, model_name in zip(axes, models):
            cm = results_dict[model_name]['cm']
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax, cbar=False,
                        annot_kws={"size": 12, "weight": "bold"})
            ax.set_title(f'Macierz pomyłek: {model_name}', fontsize=12, weight='bold')
            ax.set_xlabel('Przewidziana')
            ax.set_ylabel('Rzeczywista')
            ax.set_xticklabels(['Spadek', 'Wzrost'])
            ax.set_yticklabels(['Spadek', 'Wzrost'])

        # Hide the grid cells that did not receive a heatmap.
        for ax in axes[len(models):]:
            ax.axis('off')

        self.save_and_show("confusion_matrices.png")

    def plot_metrics_comparison(self, results_df):
        """Generate Figs. 58-62: grouped bars of the classification metrics.

        Args:
            results_df: DataFrame with a ``'Model'`` column and the columns
                Accuracy, Precision, Recall, F1-Score and AUC. The
                ``'Buy & Hold'`` row is excluded from the chart.
        """
        metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
        df_plot = results_df[results_df['Model'] != 'Buy & Hold'].copy()

        df_melted = df_plot.melt(id_vars='Model', value_vars=metrics,
                                 var_name='Metryka', value_name='Wartość')

        plt.figure(figsize=(12, 7))
        ax = sns.barplot(x='Metryka', y='Wartość', hue='Model', data=df_melted,
                         palette=MODEL_COLORS, edgecolor='white')

        plt.title('Porównanie skuteczności modeli (Metryki klasyfikacji)', fontsize=14, pad=15)
        plt.ylim(0.4, 1.05)  # zoomed y-range for readability
        plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')

        # Print the value on top of every bar.
        for container in ax.containers:
            ax.bar_label(container, fmt='%.2f', padding=3, fontsize=8)

        self.save_and_show("metrics_comparison.png")

    def plot_financial_comparison(self, results_df):
        """Generate Figs. 56-57: Sharpe ratio and total return per model.

        Args:
            results_df: DataFrame with the columns ``'Model'``,
                ``'Sharpe Ratio'`` and ``'Total Return [%]'``.
        """
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        # (column to plot, label format) for the left and right panel.
        panels = [('Sharpe Ratio', '%.2f'), ('Total Return [%]', '%.1f')]
        for ax, (column, label_fmt) in zip(axes, panels):
            sns.barplot(x='Model', y=column, data=results_df, ax=ax, palette=MODEL_COLORS)
            ax.set_title(column, fontsize=12, weight='bold')
            ax.tick_params(axis='x', rotation=45)
            for container in ax.containers:
                ax.bar_label(container, fmt=label_fmt, padding=3)

        self.save_and_show("financial_metrics.png")

    def plot_equity_curves(self, equity_data, dates):
        """Generate Fig. 63: portfolio value over time for every strategy.

        Args:
            equity_data: Mapping of strategy name to its sequence of equity
                values.
            dates: Sequence of dates; each curve is plotted against the last
                ``len(curve)`` entries of it.
        """
        plt.figure(figsize=(12, 7))
        for model_name, curve in equity_data.items():
            color = MODEL_COLORS.get(model_name, '#333333')
            # Align the date vector with the length of the equity curve.
            plot_dates = dates[-len(curve):]
            plt.plot(plot_dates, curve, label=model_name, linewidth=2, color=color, alpha=0.8)

        plt.title('Symulacja krzywej kapitału (Equity Curve)', fontsize=14, pad=15)
        plt.ylabel('Wartość portfela [USD]')
        plt.xlabel('Data')
        plt.legend()
        plt.grid(True, which='major', linestyle='--', alpha=0.6)
        self.save_and_show("equity_curves.png")

# ==========================================
# 3. PRZETWARZANIE DANYCH
# ==========================================
class DataHandler:
    """Download OHLCV price history and derive model features and the target."""

    @staticmethod
    def get_data(ticker, period="max"):
        """Download price history for ``ticker`` via yfinance.

        Args:
            ticker: Symbol passed to ``yf.download``.
            period: History length passed to ``yf.download``; defaults to
                ``"max"``, the value that was previously hard-coded.

        Returns:
            A DataFrame with the columns Open, High, Low, Close and Volume
            (forward-filled), or ``None`` if the download failed, returned no
            rows, or lacks one of those columns.
        """
        print(f"--> Pobieranie danych dla {ticker}...")
        required = ['Open', 'High', 'Low', 'Close', 'Volume']
        try:
            df = yf.download(ticker, period=period, progress=False)
            if df is None or df.empty:
                print(f"Brak danych dla {ticker}.")
                return None

            # Newer yfinance versions return MultiIndex columns; keep level 0.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.get_level_values(0)

            # Check every column that is selected below, not only 'Close'.
            if any(column not in df.columns for column in required):
                print("Błąd struktury danych. Sprawdź wersję yfinance.")
                return None

            return df[required].ffill()
        except Exception as e:
            # Best-effort boundary: report the failure and signal it with None.
            print(f"Błąd pobierania danych: {e}")
            return None

    @staticmethod
    def add_features(df, p):
        """Add technical-indicator features and the next-day direction target.

        Args:
            df: DataFrame with High, Low and Close columns.
            p: Parameter dict with the keys ``volatility_window``,
                ``atr_window``, ``sma_windows``, ``rsi_window``,
                ``macd_fast``, ``macd_slow`` and ``macd_signal``.

        Returns:
            A copy of ``df`` with the feature columns and a ``target`` column
            (1 if the next close is higher than the current close, else 0).
            Rows with any missing value, and the final row (which has no next
            close to compare against), are removed.
        """
        data = df.copy()
        close = data['Close']

        # Technical indicators (TA-Lib).
        data['log_return'] = np.log(close / close.shift(1))
        data['volatility'] = data['log_return'].rolling(p['volatility_window']).std()
        data['atr'] = talib.ATR(data['High'], data['Low'], close, timeperiod=p['atr_window'])

        for w in p['sma_windows']:
            data[f'sma_{w}'] = talib.SMA(close, timeperiod=w)

        data['rsi'] = talib.RSI(close, timeperiod=p['rsi_window'])

        macd, macd_sig, _ = talib.MACD(close,
                                       fastperiod=p['macd_fast'],
                                       slowperiod=p['macd_slow'],
                                       signalperiod=p['macd_signal'])
        data['macd'] = macd
        data['macd_signal'] = macd_sig

        # Target: 1 if Close[t+1] > Close[t].
        next_close = close.shift(-1)
        data['target'] = (next_close > close).astype(int)

        # The last row has no next close: comparing against NaN gives False,
        # and astype(int) turns that into a 0 label that dropna() cannot see.
        # Drop such rows explicitly so no fabricated label reaches the models.
        data = data[next_close.notna()].dropna()
        return data

# ==========================================
# 4. BACKTESTING
# ==========================================
def run_backtest(prices, predictions, initial_capital, commission):
    """Simulate an all-in / all-out long-only strategy driven by 0/1 signals.

    ``predictions[i]`` is the decision taken on day ``i`` and executed at
    ``prices[i]``; the portfolio is then valued at ``prices[i + 1]``. A signal
    of 1 means buy/hold, 0 means sell/stay in cash. The last prediction is not
    traded because there is no following price to value it at.

    Args:
        prices: Price sequence (Series, array or list) with at least one value
            per prediction.
        predictions: Sequence of 0/1 signals. Any array-like is accepted and
            is read positionally, regardless of its index.
        initial_capital: Starting cash.
        commission: Proportional fee charged on every buy and every sell.

    Returns:
        Tuple ``(total_return_pct, sharpe, equity_curve)``. ``equity_curve``
        is a list that starts with ``initial_capital`` and has one entry per
        prediction (a single entry when there are no predictions). ``sharpe``
        is annualized with sqrt(252) and is 0 when it cannot be computed
        (fewer than two returns or zero volatility).

    Raises:
        ValueError: If ``prices`` is shorter than ``predictions``.
    """
    # NOTE(review): 252 periods per year is kept from the original code;
    # confirm it is the intended annualization factor for this asset.
    signals = np.asarray(predictions).ravel()
    price_arr = np.asarray(prices, dtype=float).ravel()
    if len(price_arr) < len(signals):
        raise ValueError("prices must contain at least one value per prediction")
    price_arr = price_arr[:len(signals)]

    capital = initial_capital
    position = 0
    equity_curve = [initial_capital]

    for signal, curr_price, next_price in zip(signals[:-1], price_arr[:-1], price_arr[1:]):
        if signal == 1 and position == 0:
            # Buy with all available cash, net of commission.
            position = (capital * (1 - commission)) / curr_price
            capital = 0
        elif signal == 0 and position > 0:
            # Sell the whole position, net of commission.
            capital = position * curr_price * (1 - commission)
            position = 0

        # Mark the portfolio to market at the next close.
        equity_curve.append(float(capital + position * next_price))

    final_equity = equity_curve[-1]
    total_ret = (final_equity - initial_capital) / initial_capital * 100

    returns = pd.Series(equity_curve, dtype=float).pct_change().dropna()
    volatility = returns.std()
    # `volatility > 0` is False for NaN too (fewer than two returns), so the
    # Sharpe ratio falls back to 0 instead of propagating NaN.
    sharpe = (returns.mean() / volatility) * np.sqrt(252) if volatility > 0 else 0

    return total_ret, sharpe, equity_curve

# ==========================================
# 5. GŁÓWNA PĘTLA (MAIN)
# ==========================================
def _evaluate_predictions(name, y_true, y_pred, y_prob, prices):
    """Score one model's test predictions and backtest them.

    Returns a tuple ``(results_row, equity_curve, confusion_matrix)``.
    """
    total_ret, sharpe, equity = run_backtest(
        prices, y_pred, CONFIG['initial_capital'], CONFIG['commission'])
    row = {
        'Model': name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1-Score': f1_score(y_true, y_pred, zero_division=0),
        'AUC': roc_auc_score(y_true, y_prob),
        'Total Return [%]': total_ret,
        'Sharpe Ratio': sharpe,
    }
    # labels=[0, 1] keeps the matrix 2x2 even if only one class appears, which
    # is what the two-label heatmap in the visualizer expects.
    return row, equity, confusion_matrix(y_true, y_pred, labels=[0, 1])


def _fit_keras_classifier(model, X_train, y_train, X_test):
    """Compile and train a Keras binary classifier, then predict on X_test.

    Returns a tuple ``(y_pred, y_prob)`` where ``y_pred`` thresholds the
    predicted probabilities at 0.5.
    """
    model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    # Early stopping shortens training and limits overfitting.
    stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2,
              verbose=0, callbacks=[stopper])
    y_prob = model.predict(X_test).flatten()
    return (y_prob > 0.5).astype(int), y_prob


def main():
    """Run the full experiment: data, features, models, backtests, plots.

    Downloads the configured ticker, builds the features, splits the rows
    chronologically into train and test, trains the scikit-learn and Keras
    models, backtests each model's test predictions against Buy & Hold,
    prints a summary table and saves the figures.
    """
    # Seed the Python, NumPy and TensorFlow RNGs so Keras results are repeatable.
    tf.keras.utils.set_random_seed(42)

    # 1. Download the data.
    df_raw = DataHandler.get_data(CONFIG['ticker'])
    if df_raw is None:
        return

    # 2. Feature engineering.
    df = DataHandler.add_features(df_raw, PARAMS_DAILY)

    # 3. Chronological train / test split.
    split_idx = int(len(df) * (1 - CONFIG['test_size']))
    if split_idx < 1 or len(df) - split_idx < 2:
        # Without training rows or with fewer than two test rows nothing
        # meaningful can be fitted or backtested.
        print("Za mało danych do podziału na zbiór treningowy i testowy.")
        return

    non_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'target']
    feature_cols = [c for c in df.columns if c not in non_features]
    X = df[feature_cols]
    y = df['target']

    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    # Prices and dates for the backtest (same rows as X_test).
    prices_test = df['Close'].iloc[split_idx:]
    dates_test = df.index[split_idx:]

    # Scaling: fitted on the training rows only.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    results_list = []
    equity_curves = {}
    full_results_data = {}  # confusion matrices per model

    def record(name, y_pred, y_prob):
        """Evaluate one model and store its row, equity curve and matrix."""
        row, equity, cm = _evaluate_predictions(name, y_test, y_pred, y_prob, prices_test)
        results_list.append(row)
        equity_curves[name] = equity
        full_results_data[name] = {'cm': cm}

    # --- BENCHMARK: BUY & HOLD ---
    # Amount of BTC bought at the first test price, net of commission.
    initial_btc = (CONFIG['initial_capital'] * (1 - CONFIG['commission'])) / prices_test.iloc[0]
    equity_bh = initial_btc * prices_test

    equity_curves['Buy & Hold'] = equity_bh.values

    bh_total_ret = (equity_bh.iloc[-1] - CONFIG['initial_capital']) / CONFIG['initial_capital'] * 100
    bh_returns = equity_bh.pct_change().dropna()
    bh_sharpe = (bh_returns.mean() / bh_returns.std()) * np.sqrt(252) if bh_returns.std() != 0 else 0

    results_list.append({
        'Model': 'Buy & Hold',
        'Accuracy': 0, 'Precision': 0, 'Recall': 0, 'F1-Score': 0, 'AUC': 0,
        'Total Return [%]': bh_total_ret,
        'Sharpe Ratio': bh_sharpe
    })

    # --- CLASSICAL MODELS (SKLEARN) ---
    models_sklearn = {
        'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
        'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
        'SVM': SVC(probability=True, kernel='rbf', random_state=42)
    }

    for name, model in models_sklearn.items():
        print(f"--> Trenowanie modelu: {name}...")
        model.fit(X_train_s, y_train)
        record(name, model.predict(X_test_s), model.predict_proba(X_test_s)[:, 1])

    # --- NEURAL NETWORKS (KERAS) ---
    n_features = X_train_s.shape[1]

    # 1. MLP
    print("--> Trenowanie modelu: MLP...")
    mlp = Sequential([
        Input(shape=(n_features,)),
        Dense(64, activation='relu'), Dropout(0.5),
        Dense(32, activation='relu'), Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    y_pred_mlp, y_prob_mlp = _fit_keras_classifier(mlp, X_train_s, y_train, X_test_s)
    record('MLP', y_pred_mlp, y_prob_mlp)

    # 2. LSTM
    print("--> Trenowanie modelu: LSTM...")
    # Reshape to [samples, time steps, features] with a single time step.
    X_train_lstm = X_train_s.reshape((X_train_s.shape[0], 1, n_features))
    X_test_lstm = X_test_s.reshape((X_test_s.shape[0], 1, n_features))

    lstm = Sequential([
        Input(shape=(1, n_features)),
        LSTM(50), Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    y_pred_lstm, y_prob_lstm = _fit_keras_classifier(lstm, X_train_lstm, y_train, X_test_lstm)
    record('LSTM', y_pred_lstm, y_prob_lstm)

    # ==========================================
    # 6. REPORTING AND VISUALIZATION
    # ==========================================
    df_results = pd.DataFrame(results_list)
    print("\n=== PODSUMOWANIE WYNIKÓW ===")
    print(df_results.round(4).to_string(index=False))

    print("\nGenerowanie wykresów w folderze /plots...")
    viz = ThesisVisualizer()

    viz.plot_confusion_matrices(full_results_data)
    viz.plot_metrics_comparison(df_results)
    viz.plot_financial_comparison(df_results)
    viz.plot_equity_curves(equity_curves, dates_test)

    print("Zakończono pomyślnie.")


if __name__ == "__main__":
    main()