### Import des librairies

In [488]:
import pickle
import pandas as pd
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

### Récupération des données

In [368]:
# Récupération du fichier de données
with open("magnific7_1day.pkl", "rb") as f:
    data = pickle.load(f)

In [370]:
df = data["APPLE"]
df

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,7.62,7.66,7.58,7.64,493729600
2010-01-05,7.66,7.70,7.62,7.66,601904800
2010-01-06,7.66,7.69,7.53,7.53,552160000
2010-01-07,7.56,7.57,7.47,7.52,477131200
2010-01-08,7.51,7.57,7.47,7.57,447876740
...,...,...,...,...,...
2025-04-14,211.44,212.94,201.16,202.52,101352911
2025-04-15,201.86,203.51,199.80,202.14,51343900
2025-04-16,198.36,200.70,192.37,194.27,59732423
2025-04-17,197.20,198.83,194.42,196.98,52164675


### Random Forest sur les données Apple

In [373]:
# Création de la target
df['close_nextday'] = df['close'].shift(-1)
df = df.dropna()
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,close_nextday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,7.62,7.66,7.58,7.64,493729600,7.66
2010-01-05,7.66,7.7,7.62,7.66,601904800,7.53
2010-01-06,7.66,7.69,7.53,7.53,552160000,7.52
2010-01-07,7.56,7.57,7.47,7.52,477131200,7.57
2010-01-08,7.51,7.57,7.47,7.57,447876740,7.5


In [375]:
df.describe()

Unnamed: 0,open,high,low,close,volume,close_nextday
count,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0
mean,72.659327,73.44607,71.921352,72.720003,226788400.0,72.768227
std,68.046162,68.806801,67.361455,68.127302,217297900.0,68.146881
min,6.87,7.0,6.79,6.86,23234700.0,6.86
25%,20.81,21.015,20.54,20.84,83383580.0,20.86
50%,38.55,38.86,38.32,38.56,137463200.0,38.61
75%,134.08,135.225,132.68,134.06,302913800.0,134.15
max,258.19,260.1,257.63,259.02,1880998000.0,259.02


In [475]:
features = ['open', 'high', 'low', 'close', 'volume']
target = 'close_nextday'

X = df[features]
y = df[target]


In [477]:
# Création du train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [479]:
# Standardisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [481]:
# Entraînement
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)


In [483]:
# Evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")


MAE  : 17.53
MSE  : 780.16
RMSE : 27.93


### Feature engineering

In [389]:
df = df.copy()
df['delta_3'] = df['close'] - df['close'].shift(3)
df['delta_10'] = df['close'] - df['close'].shift(10)
df['delta_30'] = df['close'] - df['close'].shift(30)
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,close_nextday,delta_3,delta_10,delta_30
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-04,7.62,7.66,7.58,7.64,493729600,7.66,,,
2010-01-05,7.66,7.7,7.62,7.66,601904800,7.53,,,
2010-01-06,7.66,7.69,7.53,7.53,552160000,7.52,,,
2010-01-07,7.56,7.57,7.47,7.52,477131200,7.57,-0.12,,
2010-01-08,7.51,7.57,7.47,7.57,447876740,7.5,-0.09,,


In [391]:
# Moyenne mobile sur 20 jours
df['ma20'] = df['close'].rolling(window=20).mean()
df['ma20_3'] = df['ma20'] - df['ma20'].shift(3)
df['ma20_10'] = df['ma20'] - df['ma20'].shift(10)
df['ma20_30'] = df['ma20'] - df['ma20'].shift(30)
# Écart-type sur 20 jours
df['std20'] = df['close'].rolling(window=20).std()
df['std20_3'] = df['std20'] - df['std20'].shift(3)
df['std20_10'] = df['std20'] - df['std20'].shift(10)
df['std20_30'] = df['std20'] - df['std20'].shift(30)
# Bandes de Bollinger
df['bollinger_high'] = df['ma20'] + 2 * df['std20']
df['bollinger_high_3'] = df['bollinger_high'] - df['bollinger_high'].shift(3)
df['bollinger_high_10'] = df['bollinger_high'] - df['bollinger_high'].shift(10)
df['bollinger_high_30'] = df['bollinger_high'] - df['bollinger_high'].shift(30)
df['bollinger_low'] = df['ma20'] - 2 * df['std20']
df['bollinger_low_3'] = df['bollinger_low'] - df['bollinger_low'].shift(3)
df['bollinger_low_10'] = df['bollinger_low'] - df['bollinger_low'].shift(10)
df['bollinger_low_30'] = df['bollinger_low'] - df['bollinger_low'].shift(30)


In [393]:
# RSI
delta = df['close'].diff()
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)

avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()

rs = avg_gain / avg_loss
df['rsi'] = 100 - (100 / (1 + rs))


In [395]:
df['rsi_3'] = df['rsi'] - df['rsi'].shift(3)
df['rsi_10'] = df['rsi'] - df['rsi'].shift(10)
df['rsi_30'] = df['rsi'] - df['rsi'].shift(30)

In [397]:
df.dropna(inplace=True)

In [399]:
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'close_nextday', 'delta_3',
       'delta_10', 'delta_30', 'ma20', 'ma20_3', 'ma20_10', 'ma20_30', 'std20',
       'std20_3', 'std20_10', 'std20_30', 'bollinger_high', 'bollinger_high_3',
       'bollinger_high_10', 'bollinger_high_30', 'bollinger_low',
       'bollinger_low_3', 'bollinger_low_10', 'bollinger_low_30', 'rsi',
       'rsi_3', 'rsi_10', 'rsi_30'],
      dtype='object')

In [401]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,close_nextday,delta_3,delta_10,delta_30,ma20,...,bollinger_high_10,bollinger_high_30,bollinger_low,bollinger_low_3,bollinger_low_10,bollinger_low_30,rsi,rsi_3,rsi_10,rsi_30
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-03-16,8.01,8.03,7.95,8.02,446908000,8.0,-0.03,0.56,1.07,7.564,...,0.82842,0.437849,6.829485,0.048976,0.01258,-0.096849,90.47619,4.151404,18.198963,52.380952
2010-03-17,8.03,8.09,7.97,8.0,450956800,8.02,-0.09,0.52,1.01,7.6025,...,0.825254,0.497932,6.861011,0.072741,0.043746,-0.014932,88.349515,-1.394075,12.591939,47.82088
2010-03-18,8.0,8.04,7.95,8.02,342109600,7.94,0.03,0.49,0.9,7.641,...,0.811719,0.576388,6.89662,0.084141,0.093281,0.037612,87.368421,-3.935927,13.742047,46.064073
2010-03-19,8.03,8.04,7.9,7.94,559445600,8.03,-0.08,0.12,1.08,7.678,...,0.719821,0.592902,6.952593,0.123108,0.163179,0.162098,77.272727,-13.203463,-1.674641,39.574315
2010-03-22,7.87,8.07,7.86,8.03,456419600,8.16,0.03,0.21,1.05,7.7215,...,0.641173,0.628272,7.023042,0.162031,0.244827,0.267728,79.381443,-8.968071,2.458366,36.751961


### Réentraînement après feature engineering

In [404]:
features = ['open', 'high', 'low', 'close', 'volume', 'delta_3',
       'delta_10', 'delta_30', 'ma20', 'ma20_3', 'ma20_10', 'ma20_30', 'std20',
       'std20_3', 'std20_10', 'std20_30', 'bollinger_high', 'bollinger_high_3',
       'bollinger_high_10', 'bollinger_high_30', 'bollinger_low',
       'bollinger_low_3', 'bollinger_low_10', 'bollinger_low_30', 'rsi',
       'rsi_3', 'rsi_10', 'rsi_30']

target = 'close_nextday'

X = df[features]
y = df[target]


In [406]:
# Création du train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [408]:
# Standardisation
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [410]:
# Entraînement
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)


In [411]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 19.09
MSE  : 880.81
RMSE : 29.68


### Essais de différents paramètres de RandomForestRegressor

In [415]:
# Entraînement  n_estimators = 300 
model = RandomForestRegressor(n_estimators=300, random_state=42)
model.fit(X_train_scaled, y_train)


In [417]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 19.05
MSE  : 878.02
RMSE : 29.63


In [419]:
# n_estimators = 500
model = RandomForestRegressor(n_estimators=500, random_state=42)
model.fit(X_train_scaled, y_train)

In [420]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 19.08
MSE  : 879.71
RMSE : 29.66


In [423]:
# Max_depth = 5
model = RandomForestRegressor(n_estimators=100, max_depth =5, random_state=42)
model.fit(X_train_scaled, y_train)


In [425]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 18.73
MSE  : 857.58
RMSE : 29.28


In [427]:
# Max_depth = 10
model = RandomForestRegressor(n_estimators=100, max_depth =10, random_state=42)
model.fit(X_train_scaled, y_train)


In [429]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 19.16
MSE  : 885.21
RMSE : 29.75


In [431]:
# Max_depth = 20
model = RandomForestRegressor(n_estimators=100, max_depth =20, random_state=42)
model.fit(X_train_scaled, y_train)


In [433]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 19.18
MSE  : 886.87
RMSE : 29.78


In [435]:
# min_samples_split = 5
model = RandomForestRegressor(n_estimators=500, max_depth =5, min_samples_split = 5, random_state=42)
model.fit(X_train_scaled, y_train)

In [437]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 18.72
MSE  : 856.58
RMSE : 29.27


In [439]:
# min_samples_split = 10
model = RandomForestRegressor(n_estimators=500, max_depth =5, min_samples_split = 10, random_state=42)
model.fit(X_train_scaled, y_train)

In [440]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 18.72
MSE  : 856.67
RMSE : 29.27


In [443]:
# min_samples_leaf = 2
model = RandomForestRegressor(n_estimators=500, max_depth =5, min_samples_split = 10,  min_samples_leaf = 2, random_state=42)
model.fit(X_train_scaled, y_train)

In [445]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 18.72
MSE  : 856.67
RMSE : 29.27


In [447]:
# min_samples_leaf = 4
model = RandomForestRegressor(n_estimators=500, max_depth =5, min_samples_split = 10,  min_samples_leaf = 4, random_state=42)
model.fit(X_train_scaled, y_train)

In [449]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 18.72
MSE  : 856.67
RMSE : 29.27


In [453]:
# max_features = 'sqrt'
model = RandomForestRegressor(n_estimators=500, max_depth =5, min_samples_split = 10, max_features = 'sqrt', random_state=42)
model.fit(X_train_scaled, y_train)

In [455]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 21.17
MSE  : 998.84
RMSE : 31.60


In [457]:
# max_features = 'log2'
model = RandomForestRegressor(n_estimators=500, max_depth =5, min_samples_split = 10, max_features = 'log2', random_state=42)
model.fit(X_train_scaled, y_train)

In [459]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 22.53
MSE  : 1082.07
RMSE : 32.89


In [461]:
# bootstrap = False
model = RandomForestRegressor(n_estimators=500, max_depth =5, min_samples_split = 10, max_features = 'log2', bootstrap = False, random_state=42)
model.fit(X_train_scaled, y_train)

In [463]:
# evaluation du modèle 
y_pred = model.predict(X_test_scaled)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")

MAE  : 22.48
MSE  : 1078.28
RMSE : 32.84


### GridSearchCV

In [490]:
parameters = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}


In [492]:
model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator= model,
    param_grid=parameters,
    cv=3,  # jeu d'entraînement découpé en 3 parties
    scoring='neg_root_mean_squared_error',  # opposé du RMSE
    n_jobs=-1,  # utilise tous les CPU
)

grid_search.fit(X_train_scaled, y_train)

In [494]:
print("Meilleurs paramètres trouvés :", grid_search.best_params_)
print("Meilleur score (négatif RMSE) :", grid_search.best_score_)


Meilleurs paramètres trouvés : {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Meilleur score (négatif RMSE) : -24.238483245090517


In [496]:
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

print("RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAE :", mean_absolute_error(y_test, y_pred))


RMSE : 32.982834833744015
MAE : 26.50806336476608


