In [1]:
import pandas as pd
pd.options.display.max_columns=200
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
import pickle
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

In [2]:
data = pd.read_csv('../data/btc_data.csv', index_col=0)

In [3]:
def train_model(data, model, features, target, window_size):
      
      # Initialiser les listes pour stocker les prédictions et les vraies valeurs
      predictions = []
      actuals = []

      # Boucle à travers les données de la taille de la fenêtre jusqu'à la fin des données
      for i in range(window_size, len(data) - 1):
            # Diviser les données en ensembles d'entraînement et de test
            X_train = features.iloc[i-window_size:i, :]
            y_train = target.iloc[i-window_size:i]
            X_test = features.iloc[i:i+1, :]
            y_test = target.iloc[i]

            #Normaliser les données
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            
            # Entraîner un modèle
            model.fit(X_train, y_train)
            
            # Faire une prédiction
            prediction = model.predict(X_test)[0]
            
            # Stocker les prédictions et les vraies valeurs
            predictions.append(prediction)
            actuals.append(y_test)

      # Évaluer le modèle
      accuracy = accuracy_score(actuals, predictions)
      print(f'Model Accuracy: {accuracy * 100:.2f}%')

In [38]:
def earn_metric(predicted_probs, progressions, n_days, current_index):
    # Assurer qu'il y a assez de données pour calculer la métrique
    if current_index < n_days - 1:
        return None

    base = c = 1
    for j in range(current_index - n_days + 1, current_index + 1):
        c *= predicted_probs[j] * progressions[j] + (1 - predicted_probs[j])
        base *= progressions[j]
    return c / base



def train_model_proba_metric_save(data, model, features, target, window_size, n_days):


      # Initialiser les listes pour stocker les probaabilités prédites et les vraies valeurs
      predicted_probs = []
      progressions = []
      metric = []

      # Boucle à travers les données de la taille de la fenêtre jusqu'à la fin des données
      for i in range(window_size, len(data) - 1):
            # Diviser les données en ensembles d'entraînement et de test
            X_train = features.iloc[i-window_size:i, :]
            y_train = target.iloc[i-window_size:i]
            X_test = features.iloc[i:i+1, :]
            y_test = target.iloc[i]

            # Normaliser les données
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            
            # Entraîner un modèle
            model.fit(X_train, y_train)

            # Obtenir les probabilités prédites pour la classe positive
            prediction_prob = model.predict_proba(X_test)[:, 1]
            
            # Stocker les probabilités prédites et les vraies valeurs
            predicted_probs.extend(prediction_prob)
            
            # Récupérer la progression réelle
            progressions.append(data.iloc[i]['progression tomorrow']+1)

            if i >= window_size + n_days:
                  metric.append(earn_metric(predicted_probs, progressions, n_days, i))

            # Sauvegarder le modèle et le scaler dans un fichier
            model_filename = f'../models/xgboost_models/xgboost_{data.index[i]}.pkl'
            scaler_filename = f'../scalers/scaler_{data.index[i]}.pkl'
            pickle.dump(model, open(model_filename, 'wb'))
            pickle.dump(scaler, open(scaler_filename, 'wb'))
      
      return np.mean(metric) 

def train_model_proba_metric(data, model, features, target, window_size, n_days):

      # Initialiser les listes pour stocker les probaabilités prédites et les vraies valeurs
      predicted_probs = []
      progressions = []
      metric = []

      # Boucle à travers les données de la taille de la fenêtre jusqu'à la fin des données
      for i in range(window_size, len(data) - 1):
            # Diviser les données en ensembles d'entraînement et de test
            X_train = features.iloc[i-window_size:i, :]
            y_train = target.iloc[i-window_size:i]
            X_test = features.iloc[i:i+1, :]
            y_test = target.iloc[i]

            # Normaliser les données
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            
            # Entraîner un modèle
            model.fit(X_train, y_train)

            # Obtenir les probabilités prédites pour la classe positive
            prediction_prob = model.predict_proba(X_test)[:, 1]
            
            # Stocker les probabilités prédites et les vraies valeurs
            predicted_probs.extend(prediction_prob)
            
            # Récupérer la progression réelle
            progressions.append(data.iloc[i]['progression tomorrow']+1)

            if i >= window_size + n_days:
                  metric.append(earn_metric(predicted_probs, progressions, n_days, i))
      
      return np.mean(metric) 

In [43]:
def train_cumulative_model(data, model, features, target, window_size, test_size, n_days):
    # Initialiser les listes pour stocker les scores de performance
    performance_scores = []

    # StandardScaler pour normaliser les features
    scaler = StandardScaler()

    # Initialiser l'ensemble d'entraînement
    X_train = features.iloc[:window_size, :]
    y_train = target.iloc[:window_size]

    # Listes pour stocker toutes les prédictions et progressions
    all_predictions = []
    all_progressions = []

    # Boucle à travers les données, en augmentant la taille de l'ensemble d'entraînement à chaque itération
    for i in range(window_size, len(data) - test_size):
        # Diviser les données en ensembles d'entraînement et de test
        X_test = features.iloc[i:i + test_size, :]
        y_test = target.iloc[i:i + test_size]

        # Normaliser les données
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Entraîner le modèle
        model.fit(X_train_scaled, y_train)

        # Faire des prédictions sur le jeu de test
        prediction = model.predict(X_test_scaled)[0]  # Prédiction pour un seul jour
        progression = data.iloc[i]['progression tomorrow'] + 1

        all_predictions.append(prediction)
        all_progressions.append(progression)

        # Appliquer la métrique si on a suffisamment de données
        if i >= window_size + n_days - 1:
            score = earn_metric(all_predictions, all_progressions, n_days, i - window_size)
            if score is not None:
                performance_scores.append(score)

        # Ajouter de nouvelles données à l'ensemble d'entraînement pour la prochaine itération
        X_train = features.iloc[:i + test_size, :]
        y_train = target.iloc[:i + test_size]

    # Calculer la performance moyenne
    average_performance = np.mean(performance_scores)

    # Sauvegarder le modèle et le scaler final
    pickle.dump(model, open('../models/final_model.pkl', 'wb'))
    pickle.dump(scaler, open('../scalers/final_scaler.pkl', 'wb'))

    return average_performance

In [44]:
# Sélectionner les caractéristiques et exclure la dernière ligne
features = data.drop(columns=['progression tomorrow', 'target', 'close', 'high', 'low', 'volumefrom', 'market_cap', 'difficulty']).iloc[:-1, :]
target = data['target'].iloc[:-1]
window_size = 1500
test_size = 1
n_days = 1

In [45]:
best_params_xgb = {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}

xgboost = XGBClassifier(**best_params_xgb, random_state = 42)

In [46]:
train_cumulative_model(data, xgboost, features, target, window_size, test_size, n_days)

1.003973437273559

In [10]:
train_model_proba_metric_save(data, xgboost, features, target, window_size, n_days)

1.002236645020043

In [29]:
data_raw = pd.read_csv('../data/data_raw.csv', index_col=0)
data_raw['target'] = np.where(data_raw['progression tomorrow'] > 0, 1, 0)
start_date = pd.to_datetime("2011-01-01")
data_raw.index = pd.to_datetime(data_raw.index)
data_raw = data_raw[data_raw.index >= start_date]
data_raw

Unnamed: 0_level_0,high,low,open,volumefrom,volumeto,close,progression daily,progression tomorrow,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-12-10,0.2040,0.1825,0.2000,1684.25,3.281700e+02,0.2040,,0.020000,1
2010-12-11,0.2280,0.1907,0.2040,5788.69,1.267200e+03,0.2280,0.020000,0.117647,1
2010-12-12,0.2280,0.2068,0.2280,1963.56,4.188000e+02,0.2200,0.117647,-0.035088,0
2010-12-13,0.2300,0.2100,0.2200,6415.59,1.425920e+03,0.2299,-0.035088,0.045000,1
2010-12-14,0.2468,0.2100,0.2299,10388.10,2.403830e+03,0.2467,0.045000,0.073075,1
...,...,...,...,...,...,...,...,...,...
2023-11-10,37536.4400,36342.2700,36704.1400,32545.30,1.206041e+09,37321.5100,0.029906,0.016820,1
2023-11-11,37417.2900,36701.8300,37321.5100,14920.50,5.532155e+08,37142.2200,0.016820,-0.004804,0
2023-11-12,37233.4500,36747.0500,37142.2200,9320.87,3.455680e+08,37079.7600,-0.004804,-0.001682,0
2023-11-13,37429.5200,36358.0000,37079.7600,24368.53,8.976713e+08,36482.5100,-0.001682,-0.016107,0


In [20]:
# Sélectionner les caractéristiques et exclure la dernière ligne
features = data_raw.drop(columns=['progression tomorrow', 'target', 'close', 'high', 'low', 'volumefrom']).iloc[:-1, :]
target = data['target']
window_size = 1500
n_days = 1

In [23]:
best_params_xgb = {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}

xgboost = XGBClassifier(**best_params_xgb, random_state = 42)

In [26]:
train_model_proba_metric(data_raw, xgboost, features, target, window_size, n_days)

0.9997088846261043

In [48]:
data_raw = pd.read_csv('../data/data_raw.csv', index_col=0)
data_raw['target'] = np.where(data_raw['progression tomorrow'] > 0, 1, 0)

In [49]:
def calculate_stoch_osc(btc_data):
      # Oscillateur Stochastique

      rolling_window = 14

      # Trouver le prix le plus bas et le plus élevé sur la période
      btc_data['rolling_low'] = btc_data['low'].rolling(window=rolling_window).min()
      btc_data['rolling_high'] = btc_data['high'].rolling(window=rolling_window).max()

      # Calcul du Stochastic Oscillator
      btc_data['k'] = 100 * ((btc_data['open'] - btc_data['rolling_low']) / (btc_data['rolling_high'] - btc_data['rolling_low']))

      # Suppression des colonnes intermédiaires
      btc_data.drop(columns=['rolling_low', 'rolling_high'], inplace=True)

def calculate_momentum(btc_data):
      # Momentum

      n_days = 10

      # Calcul du Momentum
      btc_data['momentum'] = btc_data['open'] - btc_data['open'].shift(n_days)

def calculate_atr(btc_data):
      # Calcul de l'ATR (Average True Range)

      # Calculer la différence de prix de clôture par rapport à la journée précédente
      btc_data['prev_close'] = btc_data['open'].shift(1)

      # Calculer les trois composantes du True Range
      btc_data['high_minus_low'] = btc_data['high'] - btc_data['low']
      btc_data['high_minus_prev_close'] = abs(btc_data['high'] - btc_data['prev_close'])
      btc_data['low_minus_prev_close'] = abs(btc_data['low'] - btc_data['prev_close'])

      # Déterminer le True Range comme étant le maximum des trois valeurs précédentes
      btc_data['tr'] = btc_data[['high_minus_low', 'high_minus_prev_close', 'low_minus_prev_close']].max(axis=1)

      # Calculer l'ATR comme étant la moyenne mobile du TR sur une période de 14 jours
      rolling_window = 14
      btc_data['atr'] = btc_data['tr'].rolling(window=rolling_window).mean()

      # Supprimer les colonnes intermédiaires
      columns_to_drop = ['prev_close', 'high_minus_low', 'high_minus_prev_close', 'low_minus_prev_close', 'tr']
      btc_data.drop(columns=columns_to_drop, inplace=True)

def calculate_rsi(btc_data):
      # Calcul du RSI 

      # Calculer la différence de prix par rapport à la journée précédente
      btc_data['delta'] = btc_data['open'].diff()

      # Identifier les gains et les pertes
      btc_data['gain'] = btc_data['delta'].where(btc_data['delta'] > 0, 0)
      btc_data['loss'] = -btc_data['delta'].where(btc_data['delta'] < 0, 0)

      # Calculer la moyenne des gains et des pertes sur 14 jours
      rolling_window = 14
      btc_data['avg_gain'] = btc_data['gain'].rolling(window=rolling_window).mean()
      btc_data['avg_loss'] = btc_data['loss'].rolling(window=rolling_window).mean()

      # Calculer le RS (Relative Strength)
      btc_data['rs'] = btc_data['avg_gain'] / btc_data['avg_loss']

      # Calculer le RSI
      btc_data['rsi'] = 100 - (100 / (1 + btc_data['rs']))

      # Supprimer les colonnes intermédiaires
      btc_data.drop(columns=['delta', 'gain', 'loss', 'avg_gain', 'avg_loss', 'rs'], inplace=True)

calculate_momentum(data_raw)
calculate_stoch_osc(data_raw)
calculate_atr(data_raw)
calculate_rsi(data_raw)

start_date = pd.to_datetime("2011-01-01")
data_raw.index = pd.to_datetime(data_raw.index)
data_raw = data_raw[data_raw.index >= start_date]

In [50]:
data_raw

Unnamed: 0_level_0,high,low,open,volumefrom,volumeto,close,progression daily,progression tomorrow,target,momentum,k,atr,rsi
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-01,0.3000,0.292,0.3000,2821.24,8.419500e+02,0.3000,0.000000,0.000000,0,0.060,98.360656,0.018136,75.041736
2011-01-02,0.3000,0.289,0.3000,5352.11,1.584660e+03,0.3000,0.000000,0.000000,0,0.050,98.360656,0.018214,74.831650
2011-01-03,0.3000,0.290,0.3000,1425.19,4.208500e+02,0.2950,0.000000,-0.016667,0,0.050,98.360656,0.016429,75.402884
2011-01-04,0.2999,0.289,0.2950,1879.00,5.483300e+02,0.2989,-0.016667,0.013220,1,0.047,90.163934,0.015286,64.583333
2011-01-05,0.2990,0.290,0.2989,357.16,1.061900e+02,0.2990,0.013220,0.000335,1,0.049,96.500000,0.014000,90.397805
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-10,37536.4400,36342.270,36704.1400,32545.30,1.206041e+09,37321.5100,0.029906,0.016820,1,2209.580,68.735235,1173.541429,77.472798
2023-11-11,37417.2900,36701.830,37321.5100,14920.50,5.532155e+08,37142.2200,0.016820,-0.004804,0,2663.780,83.451695,1180.479286,84.113197
2023-11-12,37233.4500,36747.050,37142.2200,9320.87,3.455680e+08,37079.7600,-0.004804,-0.001682,0,1702.460,78.426595,1160.555714,80.518099
2023-11-13,37429.5200,36358.000,37079.7600,24368.53,8.976713e+08,36482.5100,-0.001682,-0.016107,0,2132.610,76.838135,1181.945000,77.550176


In [51]:
# Sélectionner les caractéristiques et exclure la dernière ligne
features_raw = data_raw.drop(columns=['progression tomorrow', 'target', 'close', 'high', 'low', 'volumefrom']).iloc[:-1, :]
target = data_raw['target'].iloc[:-1]
window_size = 1500
n_days = 1

In [52]:
best_params_xgb = {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}

xgboost = XGBClassifier(**best_params_xgb, random_state = 42)

In [53]:
train_model_proba_metric(data_raw, xgboost, features, target, window_size, n_days)

1.0008931794508802

In [None]:
param_grid_xgb = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore', category=FutureWarning, module='sklearn')
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
from IPython.display import display, HTML


# Créer une liste de combinaisons de paramètres
grid_list = list(ParameterGrid(param_grid_xgb))

# Pour boucler sur chaque combinaison :
best_accuracy = 0
best_params = None

for params in grid_list:
    try:
        # Instancier le modèle avec les paramètres
        model_instance = XGBClassifier(**params)
        
        # Appliquer la fonction train_model
        accuracy = train_model_grid(data, model_instance, features, target, window_size)
        
        # Si le modèle actuel a une meilleure précision que le précédent meilleur modèle, stocker sa précision et ses paramètres
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            print(f"Nouveaux meilleurs paramètres trouvés : {params}, accuracy : {accuracy}")

        else:
            print(f'Trop nul la honte : {params}, accuracy : {accuracy}"')

        
            
    except Exception as e:
        # Gérer les combinaisons de paramètres non compatibles
        error_message = f"Error with parameters {params}: {e}"
        

print(f"Best Model Accuracy: {best_accuracy * 100:.5f}%")
print(f"Best Parameters: {best_params}")

Nouveaux meilleurs paramètres trouvés : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}, accuracy : 0.5887799564270153
Trop nul la honte : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}, accuracy : 0.5833333333333334"
Nouveaux meilleurs paramètres trouvés : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}, accuracy : 0.5999455337690632
Trop nul la honte : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}, accuracy : 0.5923202614379085"
Trop nul la honte : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}, accuracy : 0.593681917211329"
Trop nul la honte : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}, accuracy : 0.5874183006535948"
Nouveaux meilleurs paramètr

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore', category=FutureWarning, module='sklearn')
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
from IPython.display import display, HTML


# Créer une liste de combinaisons de paramètres
grid_list = list(ParameterGrid(param_grid_xgb))

# Pour boucler sur chaque combinaison :
best_earning = 0
best_params = None

for params in grid_list:
    try:
        # Instancier le modèle avec les paramètres
        model_instance = XGBClassifier(**params, random_state = 42)
        
        # Appliquer la fonction train_model
        earning = train_model_proba_metric(data, model_instance, features, target, window_size, n_days)
        
        # Si le modèle actuel a une meilleure précision que le précédent meilleur modèle, stocker sa précision et ses paramètres
        if earning > best_earning:
            best_earning = earning
            best_params = params
            print(f"Nouveaux meilleurs paramètres trouvés : {params}, earning : {earning}")

        else:
            print(f'Trop nul la honte : {params}, earning : {earning}')

        
            
    except Exception as e:
        # Gérer les combinaisons de paramètres non compatibles
        error_message = f"Error with parameters {params}: {e}"
        

print(f"Best Earnings: {best_earning * 100:.5f}%")
print(f"Best Parameters: {best_params}")

Nouveaux meilleurs paramètres trouvés : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}, earning : 1.0084009335278077
Trop nul la honte : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}, earning : 1.0083829500965855"
Nouveaux meilleurs paramètres trouvés : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}, earning : 1.0205983479391452
Trop nul la honte : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}, earning : 1.020363818651446"
Trop nul la honte : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}, earning : 1.013274166001476"
Trop nul la honte : {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}, earning : 1.0135417707667784"
Nouveaux meilleurs paramètres trou