In [1]:
# # Analyse des prix BTC par intervalles temporels
# Notebook pour agréger les données de prix BTC par minutes, heures, jours, mois et années

In [2]:
# ## 1. Importation des librairies
# Importation des packages nécessaires

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import ta
from ta.trend import EMAIndicator
from ta.trend import MACD
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import ADXIndicator
from ta.volume import MFIIndicator
import plotly.express as px
import nbformat
import plotly.graph_objects as go

In [3]:
df0 = pd.read_csv('btc_prices_part_0.csv', sep="\t")
df1 = pd.read_csv('btc_prices_part_1.csv', sep="\t")
df2 = pd.read_csv('btc_prices_part_2.csv', sep="\t")
df3 = pd.read_csv('btc_prices_part_3.csv', sep="\t")

In [4]:
df = pd.concat([df0,df1,df2,df3])
df["date"] = pd.to_datetime(df["date"])

In [5]:
df.head()

Unnamed: 0,date,open,high,low,close,volume
0,2018-12-15 04:11:00,3200.0,3200.0,3200.0,3200.0,0.003
1,2018-12-15 04:12:00,3000.0,3312.32,3000.0,3312.32,1.882
2,2018-12-15 04:13:00,3312.32,3312.32,3312.32,3312.32,0.0
3,2018-12-15 04:14:00,3312.32,3312.32,3312.32,3312.32,0.0
4,2018-12-15 04:15:00,3312.32,3312.32,3312.32,3312.32,0.0


In [6]:
# ## 2. Configuration des paramètres
# Définition des intervalles temporels et des colonnes associées

time_config = {
    'minutes': {
        'interval': '1m',  # Fréquence pandas pour le rééchantillonnage
        'table': 'btc_prices_minutes',
        'columns': ['date', 'open', 'high', 'low', 'close', 'volume', 'minute']
    },
    'heures': {
        'interval': '1h',
        'table': 'btc_prices_hours',
        'columns': ['date', 'open', 'high', 'low', 'close', 'volume', 'hour']
    },
    'jours': {
        'interval': '1d',
        'table': 'btc_prices_days',
        'columns': ['date', 'open', 'high', 'low', 'close', 'volume']
    },
    'mois': {
        'interval': '1M',
        'table': 'btc_prices_months',
        'columns': ['date', 'open', 'high', 'low', 'close', 'volume', 'month']
    },
    'années': {
        'interval': '1y',
        'table': 'btc_prices_years',
        'columns': ['date', 'open', 'high', 'low', 'close', 'volume', 'year']
    }
}

In [7]:
# ## 3. Fonction d'agrégation optimisée avec KPI
def aggregate_with_kpi(df, time_unit):
    """
    Agrège les données BTC (déjà en minutes) et calcule les KPI
    Args:
        df: DataFrame source avec colonnes [date, open, high, low, close, volume]
        time_unit: 'heures', 'jours', 'mois' ou 'années'
    Returns:
        DataFrame agrégé avec KPI
    """
    # Configuration des intervalles
    config = {
        'heures': ('1H', 'hour'),
        'jours': ('1D', 'day'), 
        'mois': ('1M', 'month'),
        'années': ('1Y', 'year')
    }.get(time_unit)
    
    if not config:
        raise ValueError(f"Unité temporelle non valide: {time_unit}")
    
    interval, time_col = config

    # Décomposition des dates / heures par colonnes

    df['Year'] = df['date'].dt.year
    df['Month'] = df['date'].dt.month
    df['Day'] = df['date'].dt.day
    df['Hour'] = df['date'].dt.hour
    df['Minute'] = df['date'].dt.minute
    
    # Agrégation des données de base
    df_grouped = df.set_index('date').resample(interval).agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    }).dropna().reset_index()

    # Ajout des KPI

    df_grouped["ema_20"] = ta.trend.ema_indicator(
    close=df_grouped["close"],
    window=20,
    fillna=False)

    df_grouped["ema_7"] = ta.trend.ema_indicator(
    close=df_grouped["close"],
    window=7,
    fillna=False)

    df_grouped["ema_99"] = ta.trend.ema_indicator(
    close=df_grouped["close"],
    window=99,
    fillna=False)

    macd_indicator = ta.trend.MACD(
    close=df_grouped["close"],
    window_slow=26,
    window_fast=12,
    window_sign=9,
    fillna=False)

    df_grouped["macd"] = macd_indicator.macd()
    df_grouped["macd_signal"] = macd_indicator.macd_signal()

    df_grouped["rsi"] = ta.momentum.rsi(
    close=df_grouped["close"],
    window=14,
    fillna=False)

    boll = ta.volatility.BollingerBands(
    close=df_grouped["close"],
    window=20,
    window_dev=2,
    fillna=False)

    df_grouped["boll_b"] = boll.bollinger_pband()

    df_grouped["stoch_rsi"] = ta.momentum.stochrsi(
    close=df_grouped["close"],
    window=14,
    smooth1=3,
    smooth2=3,
    fillna=False)

    df_grouped["volume_ma20"] = df_grouped["volume"].rolling(window=20).mean()

    df_grouped["body_size"] = df_grouped["close"] - df_grouped["open"]
    df_grouped["upper_wick"] = df_grouped["high"] - df_grouped[["close", "open"]].max(axis=1)
    df_grouped["upper_wick"] = df_grouped["high"] - df_grouped[["close", "open"]].max(axis=1)

    df_grouped["pct_change_1"] = df_grouped["close"].pct_change(periods=1)
    df_grouped["pct_change_3"] = df_grouped["close"].pct_change(periods=3)
    df_grouped["pct_change_5"] = df_grouped["close"].pct_change(periods=5)

    # Création de la colonne Target
        
    df_grouped["shift_close"] = df_grouped["close"].shift(-1)
    df_grouped = df_grouped.loc[26:,:]
    df["date"] = pd.to_datetime(df["date"])
    
    return df_grouped

In [8]:
df_grouped = aggregate_with_kpi(df,"jours")

In [9]:
# Liste des colonnes de prix que tu veux décaler
columns_to_shift = ['open', 'high', 'low', 'volume']

# Décalage d'un jour pour chaque colonne
for col in columns_to_shift:
    df_grouped[f'shift_{col}'] = df_grouped[col].shift(-1)

# Supprimer les lignes avec NaN (à cause des shift)
df_grouped.dropna(inplace=True)


In [10]:
# ## 4. Préparation des données pour le Machine Learning

def prepare_data(df, target_col='shift_close', test_size=0.2, random_state=42):
    """
    Prépare les données pour le ML en créant des features temporelles
    et en séparant train/test.
    """
    # Création de features temporelles
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_of_month'] = df['date'].dt.day
    df['week_of_year'] = df['date'].dt.isocalendar().week
    
    # Features et target
    features = df.drop("close",axis = 1).select_dtypes(include = "number").columns.to_list()
    
    X = df[features]
    y = df[target_col]
    
    # Séparation train/test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=False)
    
    # Normalisation
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test, scaler, features

In [11]:
X_train, X_test, y_train, y_test, scaler, features = prepare_data(df_grouped, target_col='shift_close', test_size=0.2, random_state=42)

RANDOM FOREST

In [12]:
# ## I. Random Forest avec Shift(CLOSE)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 2. Trier par date
df = df_grouped

# 5. Supprimer les lignes avec des NaN dues aux shifts
df.dropna(inplace=True)

# 6. Séparer les features et la target
xcolumns1 = df_grouped.select_dtypes(include='number').columns.to_list()
X1 = df[xcolumns1].drop("close", axis = 1)
y1 = df['shift_close']

X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1,train_size = 0.8, random_state = 42)

# 8. Entraîner le modèle Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X1_train, y1_train)
y1_test_pred = model.predict(X1_test)

# 9. Prédire et évaluer

mse1 = mean_squared_error(y1_test, y1_test_pred)
rmse1 = np.sqrt(mse1)
print(f"RMSE1: {rmse1:.2f}")


RMSE1: 207.27


In [13]:
# ## II. Random Forest avec Shift(OPEN)

# 2. Trier par date
df = df_grouped

# 5. Supprimer les lignes avec des NaN dues aux shifts
df.dropna(inplace=True)

# 6. Séparer les features et la target
xcolumns2 = df_grouped.select_dtypes(include='number').columns.to_list()
X2 = df[xcolumns2].drop("open", axis = 1)
y2 = df['shift_open']

X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2,train_size = 0.8, random_state = 42)

# 8. Entraîner le modèle Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X2_train, y2_train)
y2_test_pred = model.predict(X2_test)

# 9. Prédire et évaluer

mse2 = mean_squared_error(y2_test, y2_test_pred)
rmse2 = np.sqrt(mse2)
print(f"RMSE2: {rmse2:.2f}")


RMSE2: 103.52


In [14]:
# ## III. Random Forest avec Shift(HIGH)

# 2. Trier par date
df = df_grouped

# 5. Supprimer les lignes avec des NaN dues aux shifts
df.dropna(inplace=True)

# 6. Séparer les features et la target
xcolumns3 = df_grouped.select_dtypes(include='number').columns.to_list()
X3 = df[xcolumns2].drop("high", axis = 1)
y3 = df['shift_high']

X3_train, X3_test, y3_train, y3_test = train_test_split(X3,y3,train_size = 0.8, random_state = 42)

# 8. Entraîner le modèle Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X3_train, y3_train)
y3_test_pred = model.predict(X3_test)

# 9. Prédire et évaluer

mse3 = mean_squared_error(y3_test, y3_test_pred)
rmse3 = np.sqrt(mse3)
print(f"RMSE3: {rmse3:.2f}")

RMSE3: 152.57


In [15]:
# ## IV. Random Forest avec Shift(LOW)

# 2. Trier par date
df = df_grouped

# 5. Supprimer les lignes avec des NaN dues aux shifts
df.dropna(inplace=True)

# 6. Séparer les features et la target
xcolumns4 = df_grouped.select_dtypes(include='number').columns.to_list()
X4 = df[xcolumns4].drop("low", axis = 1)
y4 = df['shift_low']

X4_train, X4_test, y4_train, y4_test = train_test_split(X4,y4,train_size = 0.8, random_state = 42)

# 8. Entraîner le modèle Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X4_train, y4_train)
y4_test_pred = model.predict(X4_test)

# 9. Prédire et évaluer

mse4 = mean_squared_error(y4_test, y4_test_pred)
rmse4 = np.sqrt(mse4)
print(f"RMSE4: {rmse4:.2f}")

RMSE4: 131.95


In [16]:
# ## V. Random Forest avec Shift(VOLUME)

# 2. Trier par date
df = df_grouped

# 5. Supprimer les lignes avec des NaN dues aux shifts
df.dropna(inplace=True)

# 6. Séparer les features et la target
xcolumns5 = df_grouped.select_dtypes(include='number').columns.to_list()
X5 = df[xcolumns5].drop("volume", axis = 1)
y5 = df['shift_volume']

X5_train, X5_test, y5_train, y5_test = train_test_split(X5,y5,train_size = 0.8, random_state = 42)

# 8. Entraîner le modèle Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X5_train, y5_train)
y5_test_pred = model.predict(X5_test)

# 9. Prédire et évaluer

mse5 = mean_squared_error(y5_test, y5_test_pred)
rmse5 = np.sqrt(mse5)
print(f"RMSE5: {rmse5:.2f}")

RMSE5: 30.81


XGBOOST -> SHIFT

In [17]:
# I XGBOOST avec Shift(CLOSE)

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# 1. Données déjà préparées : 
df = df_grouped.copy()          
df.dropna(inplace=True)         

# 2. Séparer features / target
xcolumns6 = df_grouped.select_dtypes(include='number').columns.to_list()
X6 = df[xcolumns6].drop("close", axis=1)  
y6 = df['shift_close']                   

# 3. Train / Test split
X6_train, X6_test, y6_train, y6_test = train_test_split(
    X6, y6, train_size=0.8, random_state=42
)

# 4. Modèle XGBoost
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

# 5. Entraînement
xgb_model.fit(X6_train, y6_train)

# 6. Prédiction & évaluation
y6_test_pred = xgb_model.predict(X6_test)

mse6  = mean_squared_error(y6_test, y6_test_pred)
rmse6 = np.sqrt(mse1)
print(f"RMSE6 (XGBoost) : {rmse6:.2f}")



RMSE6 (XGBoost) : 207.27


In [18]:
# II XGBOOST avec Shift(OPEN)

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# 1. Données déjà préparées : 
df = df_grouped.copy()          
df.dropna(inplace=True)         

# 2. Séparer features / target
xcolumns7 = df_grouped.select_dtypes(include='number').columns.to_list()
X7 = df[xcolumns7].drop("open", axis=1)   
y7 = df['shift_open']                   

# 3. Train / Test split
X7_train, X7_test, y7_train, y7_test = train_test_split(
    X7, y7, train_size=0.8, random_state=42
)

# 4. Modèle XGBoost
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

# 5. Entraînement
xgb_model.fit(X7_train, y7_train)

# 6. Prédiction & évaluation
y7_test_pred = xgb_model.predict(X7_test)

mse7  = mean_squared_error(y7_test, y7_test_pred)
rmse7 = np.sqrt(mse7)
print(f"RMSE7 (XGBoost) : {rmse7:.2f}")



RMSE7 (XGBoost) : 256.94


In [19]:
# III XGBOOST avec Shift(HIGH)

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# 1. Données déjà préparées : 
df = df_grouped.copy()          
df.dropna(inplace=True)         

# 2. Séparer features / target
xcolumns8 = df_grouped.select_dtypes(include='number').columns.to_list()
X8 = df[xcolumns8].drop("high", axis=1)   
y8 = df['shift_high']                     

# 3. Train / Test split
X8_train, X8_test, y8_train, y8_test = train_test_split(
    X8, y8, train_size=0.8, random_state=42
)

# 4. Modèle XGBoost
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

# 5. Entraînement
xgb_model.fit(X8_train, y8_train)

# 6. Prédiction & évaluation
y8_test_pred = xgb_model.predict(X8_test)

mse8  = mean_squared_error(y8_test, y8_test_pred)
rmse8 = np.sqrt(mse8)
print(f"RMSE8 (XGBoost) : {rmse8:.2f}")



RMSE8 (XGBoost) : 246.78


In [20]:
# IV XGBOOST avec Shift(LOW)

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# 1. Données déjà préparées : 
df = df_grouped.copy()          
df.dropna(inplace=True)         

# 2. Séparer features / target
xcolumns9 = df_grouped.select_dtypes(include='number').columns.to_list()
X9 = df[xcolumns9].drop("low", axis=1)   
y9 = df['shift_low']                     

# 3. Train / Test split
X9_train, X9_test, y9_train, y9_test = train_test_split(
    X9, y9, train_size=0.8, random_state=42
)

# 4. Modèle XGBoost
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

# 5. Entraînement
xgb_model.fit(X9_train, y9_train)

# 6. Prédiction & évaluation
y9_test_pred = xgb_model.predict(X9_test)

mse9  = mean_squared_error(y9_test, y9_test_pred)
rmse9 = np.sqrt(mse9)
print(f"RMSE9 (XGBoost) : {rmse9:.2f}")



RMSE9 (XGBoost) : 272.51


In [21]:
# V XGBOOST avec Shift(VOLUME)

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# 1. Données déjà préparées : 
df = df_grouped.copy()          
df.dropna(inplace=True)         

# 2. Séparer features / target
xcolumns10 = df_grouped.select_dtypes(include='number').columns.to_list()
X10 = df[xcolumns10].drop("volume", axis=1)   
y10 = df['shift_volume']                

# 3. Train / Test split
X10_train, X10_test, y10_train, y10_test = train_test_split(
    X10, y10, train_size=0.8, random_state=42
)

# 4. Modèle XGBoost
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

# 5. Entraînement
xgb_model.fit(X10_train, y10_train)

# 6. Prédiction & évaluation
y10_test_pred = xgb_model.predict(X10_test)

mse10  = mean_squared_error(y10_test, y10_test_pred)
rmse10 = np.sqrt(mse10)
print(f"RMSE10 (XGBoost) : {rmse10:.2f}")



RMSE10 (XGBoost) : 205.71


GRID SEARCH

In [22]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# === Paramètres ===
test_size = 0.2
random_state = 42
model_type = 'xgboost'  # ou 'random_forest'

# 1. Sélection des colonnes numériques
features = df.drop(columns=['date']).select_dtypes(include='number').columns.tolist()
X = df[features]

# 2. Cible : valeur continue
y = df["shift_close"]  # ✅ Series 1D

# 3. Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, shuffle=False
)

# 4. Choix du modèle + grille de recherche
if model_type == 'random_forest':
    model = RandomForestRegressor(random_state=random_state)
    param_grid = {
        'clf__n_estimators': [50, 100, 200, 500],
        'clf__max_depth': [None, 10, 20],
        'clf__min_samples_split': [2, 5, 10],
    }
elif model_type == 'xgboost':
    model = XGBRegressor(random_state=random_state, objective='reg:squarederror')  # ✅ pas de mlogloss
    param_grid = {
        'clf__n_estimators': [50, 100, 200, 500],
        'clf__max_depth': [5, 10, 20],
        'clf__learning_rate': [0.1, 1],
    }

# 5. Pipeline
pipeline = Pipeline([
    ('clf', model)
])

# 6. GridSearchCV
grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# 7. Entraînement
grid.fit(X_train, y_train)

# 8. Évaluation
y_pred = grid.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("✅ Meilleur modèle trouvé")
print("Best params :", grid.best_params_)
print(f"RMSE        : {rmse:.3f}")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
✅ Meilleur modèle trouvé
Best params : {'clf__learning_rate': 0.1, 'clf__max_depth': 10, 'clf__n_estimators': 200}
RMSE        : 19145.539


In [23]:
# 2. Trier par date
df = df_grouped

# 5. Supprimer les lignes avec des NaN dues aux shifts
df.dropna(inplace=True)

# 6. Séparer les features et la target
xcolumns2 = df_grouped.select_dtypes(include='number').columns.to_list()
X2 = df[xcolumns2].drop("open", axis = 1)
y2 = df['shift_open']

X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2,train_size = 0.8, random_state = 42)

# 8. Entraîner le modèle Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X2_train, y2_train)
y2_test_pred = model.predict(X2_test)

# 9. Prédire et évaluer

mse2 = mean_squared_error(y2_test, y2_test_pred)
rmse2 = np.sqrt(mse2)
print(f"RMSE2: {rmse2:.2f}")

RMSE2: 103.52


CHANDELIERS

In [25]:
import plotly.graph_objects as go

# Assurez-vous que 'Date' est bien au format datetime
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Créer le graphique en chandeliers
fig = go.Figure(data=[go.Candlestick(
    x=df['date'],
    open=df['open'],
    high=df['high'],
    low=df['low'],
    close=df['close'],
    increasing_line_color='green',
    decreasing_line_color='red',
    name='BTC'
)])

# Mise en forme
fig.update_layout(
    title='Graphique en Chandeliers BTC',
    xaxis_title='Date',
    yaxis_title='Prix (USDT)',
    xaxis_rangeslider_visible=False
)

fig.show()
