In [350]:
import yfinance as yf
import matplotlib.pyplot as plt
import indicators
from indicators import *
import params
from params import *
import os
import importlib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import seaborn as sns

# Reload the project modules so that edits made to params.py / indicators.py
# after the kernel started are picked up without restarting it.
importlib.reload(params)
importlib.reload(indicators)

# `from module import *` copies names at import time, so the names bound by
# the star imports at the top of this cell still point at the pre-reload
# objects. Re-run the star imports (same order as above: indicators first,
# then params) so the notebook namespace reflects the reloaded modules.
from indicators import *
from params import *

<module 'indicators' from '/Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/indicators.py'>

On crée la fonction d'extraction des données

In [351]:
def extract_data(ticker, name, start_date, end_date):
    """Download daily quotes for one ticker and derive the base columns.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download``.
    name : str
        Label written into the ``Indice`` column of every returned row.
    start_date, end_date
        Forwarded to ``yf.download`` as ``start`` / ``end``.

    Returns
    -------
    pandas.DataFrame
        The downloaded columns plus, in this order:
        ``VariationN``   - percentage change from the row's close to the
                           next row's close,
        ``VariationN-1`` - ``VariationN`` of the previous row,
        ``Trend``        - 1 when ``VariationN-1`` is positive, else 0,
        ``Amplitude``    - ``High - Low``,
        ``Date``         - copy of the index,
        with ``Indice`` inserted as the first column. Rows containing a
        missing value or a non-positive ``Volume`` are removed.
    """
    data = yf.download(ticker, start=start_date, end=end_date, interval="1d")

    # Flatten the columns BEFORE any column access: when the download comes
    # back with two-level columns, data["Close"] would otherwise be a
    # DataFrame instead of a Series and the filters/assignments below would
    # not operate on plain columns.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)
    data.columns.name = None

    data = data.dropna()
    # .copy() so the column assignments below write to an independent frame
    # rather than to a filtered slice.
    data = data[data["Volume"] > 0].copy()

    data["VariationN"] = (data["Close"].shift(-1) / data["Close"] - 1) * 100
    data["VariationN-1"] = data["VariationN"].shift(1)
    data["Trend"] = (data["VariationN-1"] > 0).astype(int)
    data["Amplitude"] = data["High"] - data["Low"]
    data["Date"] = data.index

    # Drops the first row (no previous variation) and the last row (no next
    # close), which the shifts above leave as NaN.
    data = data.dropna()
    data.insert(0, 'Indice', name)

    return data

def create_base_clean(indices, start_date, end_date):
    """Build one stacked table holding the extracted data of every index.

    Parameters
    ----------
    indices : dict
        Mapping ``name -> ticker``; iterated with ``.items()``.
    start_date, end_date
        Forwarded unchanged to ``extract_data``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-index frames, in the mapping's
        iteration order, re-indexed from 0.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``indices`` is empty.
    """
    base_list = [
        extract_data(ticker, name, start_date, end_date)
        for name, ticker in indices.items()
    ]
    base_clean = pd.concat(base_list)

    return base_clean.reset_index(drop=True)

In [352]:
# NOTE(review): create_base_clean is also defined in the previous cell; this
# later definition is the one in effect once both cells have run. Consider
# deleting one of the two.
def create_base_clean(indices, start_date, end_date):
    """Build one stacked table holding the extracted data of every index.

    Parameters
    ----------
    indices : dict
        Mapping ``name -> ticker``; iterated with ``.items()``.
    start_date, end_date
        Forwarded unchanged to ``extract_data``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-index frames, in the mapping's
        iteration order, re-indexed from 0.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``indices`` is empty.
    """
    base_list = [
        extract_data(ticker, name, start_date, end_date)
        for name, ticker in indices.items()
    ]
    base_clean = pd.concat(base_list)

    return base_clean.reset_index(drop=True)

La fonction qui rajoute les indicateurs

In [353]:
def add_indicators(data, params_ma, params_ema, params_rsi, params_macd, params_std, params_bollinger):
    """Return a copy of ``data`` extended with indicator columns.

    The input frame is not modified. Columns are appended in this order:

    * ``MA_<x>``  for each x in ``params_ma``  - ``MA(Log_Return, x)``
    * ``EMA_<x>`` for each x in ``params_ema`` - ``EMA(Log_Return, x)``
    * ``RSI_<x>`` for each x in ``params_rsi`` - ``RSI(Close, x)``
    * ``STD_<x>`` for each x in ``params_std`` - ``STD(Close, x)``
    * ``MACD`` and ``Signal`` - the two values returned by ``MACD`` called
      on ``Log_Return`` with the first three items of ``params_macd``
    * ``BollingerBandUpper`` and ``BollingerBandLower`` - the two values
      returned by ``BollingerBands`` called on ``Close`` with the first two
      items of ``params_bollinger``
    """
    enriched = data.copy()

    # (column prefix, parameter values, indicator function, input column)
    per_parameter_specs = (
        ("MA_", params_ma, MA, "Log_Return"),
        ("EMA_", params_ema, EMA, "Log_Return"),
        ("RSI_", params_rsi, RSI, "Close"),
        ("STD_", params_std, STD, "Close"),
    )
    for prefix, values, indicator, source in per_parameter_specs:
        for value in values:
            enriched.loc[:, prefix + str(value)] = indicator(enriched[source], value)

    macd_values, signal_values = MACD(
        enriched["Log_Return"], params_macd[0], params_macd[1], params_macd[2]
    )
    enriched.loc[:, "MACD"] = macd_values
    enriched.loc[:, "Signal"] = signal_values

    upper_band, lower_band = BollingerBands(
        enriched["Close"], params_bollinger[0], params_bollinger[1]
    )
    enriched.loc[:, "BollingerBandUpper"] = upper_band
    enriched.loc[:, "BollingerBandLower"] = lower_band

    return enriched

Création de la base_clean et de la base_indice

In [354]:
def base_normalise(data, norm_window):
    """Return a copy of ``data`` with log-return and normalised price columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Open``, ``High``, ``Low`` and ``Close`` columns.
        It is left untouched; the new columns are added to a copy.
    norm_window : int
        Window passed to ``rolling`` for the moving average of ``Close``.

    Returns
    -------
    pandas.DataFrame
        The copy, with these columns appended:
        ``Log_Return`` - log of the close divided by the previous close,
        ``Close_Norm`` - close relative to its rolling mean, minus 1,
        ``Open_Norm`` / ``High_Norm`` / ``Low_Norm`` - the respective price
        relative to the same row's close, minus 1.
    """
    # Work on a copy: callers pass boolean-filtered slices of a larger
    # frame, and writing new columns into such a slice triggers
    # SettingWithCopyWarning and mutates the caller's object.
    data = data.copy()
    close = data["Close"]

    data["Log_Return"] = np.log(close / close.shift(1))
    data["Close_Norm"] = close / close.rolling(norm_window).mean() - 1
    data["Open_Norm"] = data["Open"] / close - 1
    data["High_Norm"] = data["High"] / close - 1
    data["Low_Norm"] = data["Low"] / close - 1

    return data

In [355]:
# Step 1 - obtain base_clean: either download it again and cache it on disk,
# or reload the cached CSV.
file_path = os.path.join(base_dir, "base_clean.csv")
if isimport:
    base_clean = create_base_clean(indices, start_date, end_date)
    base_clean.to_csv(file_path)
else:
    # to_csv above writes the row index as the first (unnamed) column. Read
    # it back as the index; otherwise it comes back as an "Unnamed: 0" data
    # column that flows into base_indice and into the model features.
    base_clean = pd.read_csv(file_path, index_col=0)

# Step 2 - normalise and add indicators one index at a time, so that shifts
# and rolling windows never mix rows of two different indices.
indice_list = base_clean["Indice"].unique()

base_list = []

for indice in indice_list:
    # Independent copy: the per-index transformations must never write into
    # a boolean-filtered slice of base_clean.
    data = base_clean[base_clean["Indice"] == indice].copy()
    data = base_normalise(data, norm_window)
    data = add_indicators(data, params_ma, params_ema, params_rsi, params_macd, params_std, params_bollinger)

    base_list.append(data)

# Step 3 - stack the per-index frames, drop rows left incomplete by the
# shifts / rolling windows, and persist the result.
base_indice = pd.concat(base_list)
base_indice = base_indice.dropna()
base_indice = base_indice.reset_index(drop=True)

file_path = os.path.join(base_dir, "base_indice.csv")
base_indice.to_csv(file_path)


[*********************100%***********************]  1 of 1 completed


In [356]:
# Correlation matrix of every column of base_indice except the two
# identifier columns, rendered as an annotated heatmap and saved to disk.
base_corr = base_indice.drop(columns=["Date", "Indice"])
correlation_matrix = base_corr.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)

file_path = plot_dir + "/correlation_heatmap.png"
fig.savefig(file_path, bbox_inches="tight", dpi=300)

# The figure is only written to disk, not shown inline.
plt.close(fig)
print(f"✅ Heatmap enregistrée dans {file_path}")

✅ Heatmap enregistrée dans /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/plots/correlation_heatmap.png


Préparation pour l'entraînement

In [357]:
# Columns excluded from the feature matrix: the target itself, the
# identifier columns, the raw quotes and the intermediate normalised series.
non_feature_columns = [
    "VariationN", "Indice", "Date",
    "Open", "Close", "High", "Low", "Volume",
    "Close_Norm", "Open_Norm", "High_Norm", "Low_Norm", "Log_Return",
]
X = base_indice.drop(columns=non_feature_columns)
Y = base_indice["VariationN"]

# Fixed seed so that the split, and therefore every metric computed below,
# is identical from one run to the next.
# NOTE(review): train_test_split shuffles rows by default, so training and
# test rows are interleaved in time; a chronological split may be more
# appropriate for this kind of data - to be confirmed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)

Régression linéaire

In [358]:
def _fit_and_score(estimator, X_fit, X_eval, y_fit, y_eval, feature_names, coef_label):
    """Fit ``estimator`` and evaluate it on the evaluation data.

    Parameters
    ----------
    estimator
        Unfitted scikit-learn linear model; it is fitted in place.
    X_fit, y_fit
        Training features and target.
    X_eval, y_eval
        Evaluation features and target.
    feature_names
        Row labels of the coefficient table, one per feature.
    coef_label : str
        Name of the single column of the coefficient table.

    Returns
    -------
    tuple
        ``(coefficients, predictions, mse, mae, r2)`` where ``coefficients``
        is a one-column DataFrame holding the intercept followed by one row
        per feature.
    """
    estimator.fit(X_fit, y_fit)
    coefficients = pd.DataFrame(
        np.append(estimator.intercept_, estimator.coef_),
        index=["Intercept"] + list(feature_names),
        columns=[coef_label],
    )
    predictions = estimator.predict(X_eval)
    return (
        coefficients,
        predictions,
        mean_squared_error(y_eval, predictions),
        mean_absolute_error(y_eval, predictions),
        r2_score(y_eval, predictions),
    )


feature_names = list(X_train.columns)

# Linear regression on the raw (unscaled) features
model = LinearRegression()
(coefficients_linéaire, Y_pred,
 mse_standard, mae_standard, r2_standard) = _fit_and_score(
    model, X_train, X_test, Y_train, Y_test,
    feature_names, "Coefficients (Sklearn)",
)

# Standardise the features; the scaler is fitted on the training set only
# and then applied to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear regression on the standardised features
model = LinearRegression()
(coefficients_standardisés, Y_pred_scaled,
 mse_scaled, mae_scaled, r2_scaled) = _fit_and_score(
    model, X_train_scaled, X_test_scaled, Y_train, Y_test,
    feature_names, "Coefficients Standardisés",
)

# Ridge regression
ridge_model = Ridge(alpha=params_ridge[0])
(coefficients_ridge, Y_pred_ridge,
 mse_ridge, mae_ridge, r2_ridge) = _fit_and_score(
    ridge_model, X_train_scaled, X_test_scaled, Y_train, Y_test,
    feature_names, "Coefficients (Ridge)",
)

# Lasso regression
lasso_model = Lasso(alpha=params_lasso[0])
(coefficients_lasso, Y_pred_lasso,
 mse_lasso, mae_lasso, r2_lasso) = _fit_and_score(
    lasso_model, X_train_scaled, X_test_scaled, Y_train, Y_test,
    feature_names, "Coefficients (Lasso)",
)

# ElasticNet regression
elasticnet_model = ElasticNet(alpha=params_elasticnet[0], l1_ratio=params_elasticnet[1])
(coefficients_elasticnet, Y_pred_elasticnet,
 mse_elasticnet, mae_elasticnet, r2_elasticnet) = _fit_and_score(
    elasticnet_model, X_train_scaled, X_test_scaled, Y_train, Y_test,
    feature_names, "Coefficients (ElasticNet)",
)

def positif_rate(Y_test, Y_pred):
    """Return the fraction of positions where both inputs have the same sign.

    Signs are taken with ``np.sign`` and compared element by element, so a
    zero only matches another zero.
    """
    same_sign = np.sign(Y_test) == np.sign(Y_pred)
    return same_sign.mean()

# Display labels and test-set predictions of the five models, in one shared
# order that is reused by every comparison table below.
model_labels = [
    "Régression Linéaire",
    "Régression Linéaire Standardisée",
    "Régression Ridge",
    "Régression Lasso",
    "Régression ElasticNet",
]
model_predictions = [Y_pred, Y_pred_scaled, Y_pred_ridge, Y_pred_lasso, Y_pred_elasticnet]

# Share of test rows whose predicted sign matches the observed sign
(positif_rate_standard,
 positif_rate_scaled,
 positif_rate_ridge,
 positif_rate_lasso,
 positif_rate_elasticnet) = [positif_rate(Y_test, pred) for pred in model_predictions]


# Metrics comparison
metrics_comparison = pd.DataFrame({
    "Modèle": model_labels,
    "MSE": [mse_standard, mse_scaled, mse_ridge, mse_lasso, mse_elasticnet],
    "MAE": [mae_standard, mae_scaled, mae_ridge, mae_lasso, mae_elasticnet],
    "R²": [r2_standard, r2_scaled, r2_ridge, r2_lasso, r2_elasticnet],
    "Positif Rate": [
        positif_rate_standard,
        positif_rate_scaled,
        positif_rate_ridge,
        positif_rate_lasso,
        positif_rate_elasticnet,
    ],
})

print("📊 Comparaison des métriques :")
display(metrics_comparison)

# Coefficients comparison: each one-column table is renamed to its model
# label and the tables are placed side by side.
coefficient_tables = [
    (coefficients_linéaire, "Coefficients (Sklearn)"),
    (coefficients_standardisés, "Coefficients Standardisés"),
    (coefficients_ridge, "Coefficients (Ridge)"),
    (coefficients_lasso, "Coefficients (Lasso)"),
    (coefficients_elasticnet, "Coefficients (ElasticNet)"),
]
coefficients_comparison = pd.concat(
    [
        table.rename(columns={old_name: label})
        for (table, old_name), label in zip(coefficient_tables, model_labels)
    ],
    axis=1,
)

print("📊 Comparaison des coefficients :")
display(coefficients_comparison)

# Predictions comparison
predictions_comparison = pd.DataFrame(
    {"Y_test": Y_test, **dict(zip(model_labels, model_predictions))}
)

print("📊 Comparaison des prédictions :")
display(predictions_comparison)

# Errors comparison
# Side-by-side table of Y_test and the predictions, all indexed like Y_test
predictions = pd.concat(
    [
        Y_test.rename("Y_test"),
        *(
            pd.Series(pred, index=Y_test.index, name=label)
            for label, pred in zip(model_labels, model_predictions)
        ),
    ],
    axis=1,
)

# Keep the rows where at least one model predicts a sign different from
# the sign of Y_test
observed_sign = np.sign(predictions["Y_test"])
predicted_signs = np.sign(predictions[model_labels])
sign_mismatch = predicted_signs.ne(observed_sign, axis=0).any(axis=1)
filtered_predictions = predictions[sign_mismatch]

filtered_predictions[filtered_predictions["Y_test"] != 0]

📊 Comparaison des métriques :


Unnamed: 0,Modèle,MSE,MAE,R²,Positif Rate
0,Régression Linéaire,1.208939,0.746128,0.003561,0.516881
1,Régression Linéaire Standardisée,1.208939,0.746128,0.003561,0.516881
2,Régression Ridge,1.208939,0.746128,0.003561,0.516881
3,Régression Lasso,1.209085,0.745916,0.00344,0.518307
4,Régression ElasticNet,1.209231,0.746073,0.003319,0.518307


📊 Comparaison des coefficients :


Unnamed: 0,Régression Linéaire,Régression Linéaire Standardisée,Régression Ridge,Régression Lasso,Régression ElasticNet
Intercept,-0.1442,0.038614,0.038614,0.038614,0.038614
VariationN-1,-0.077198,-0.087407,-0.087407,-0.086028,-0.087527
Trend,0.110024,0.054866,0.054866,0.052724,0.053811
Amplitude,-0.001685,-0.03336,-0.03336,-0.02944,-0.03153
MA_30,-12.432426,-0.022266,-0.022267,-0.025735,-0.026726
EMA_5,-12.835944,-0.061521,-0.061521,-0.039993,-0.039676
RSI_14,0.002622,0.042326,0.042326,0.035782,0.038007
STD_10,0.001599,0.032288,0.032288,0.024936,0.027828
STD_20,-1.2e-05,-0.00293,-0.002929,0.0,-0.0
MACD,17.209844,0.023318,0.023317,-0.0,-0.0


📊 Comparaison des prédictions :


Unnamed: 0,Y_test,Régression Linéaire,Régression Linéaire Standardisée,Régression Ridge,Régression Lasso,Régression ElasticNet
2180,1.647032,0.030126,0.030126,0.030126,0.033029,0.032296
3694,0.438381,0.108746,0.108746,0.108746,0.110592,0.110073
503,0.210482,0.081261,0.081261,0.081261,0.078214,0.080233
5359,-0.329111,0.087877,0.087877,0.087877,0.087201,0.088142
6659,1.042223,0.009074,0.009074,0.009074,0.005300,0.007943
...,...,...,...,...,...,...
6016,-0.085756,0.007402,0.007402,0.007402,0.006084,0.005764
8640,0.292317,0.032007,0.032007,0.032007,0.029031,0.029358
1873,-0.571162,0.169919,0.169919,0.169919,0.165118,0.167150
2478,0.527725,0.009335,0.009335,0.009335,0.005520,0.007412


Unnamed: 0,Y_test,Régression Linéaire,Régression Linéaire Standardisée,Régression Ridge,Régression Lasso,Régression ElasticNet
5359,-0.329111,0.087877,0.087877,0.087877,0.087201,0.088142
3118,-0.166079,0.031934,0.031934,0.031934,0.035303,0.034353
4747,-0.597238,0.078398,0.078398,0.078398,0.077543,0.076668
6273,-2.708970,0.055125,0.055125,0.055125,0.055216,0.053923
4519,0.645236,-0.028461,-0.028461,-0.028461,-0.023640,-0.025902
...,...,...,...,...,...,...
7133,-0.323397,0.136249,0.136249,0.136249,0.141513,0.138664
3409,-0.305710,0.055607,0.055607,0.055607,0.053200,0.054125
9324,3.057259,-0.015096,-0.015096,-0.015096,-0.014445,-0.012898
6016,-0.085756,0.007402,0.007402,0.007402,0.006084,0.005764
