In [169]:
import yfinance as yf
import matplotlib.pyplot as plt
import indicators
from indicators import *
import params
from params import *
import os
import importlib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import seaborn as sns

importlib.reload(params)
importlib.reload(indicators)

<module 'indicators' from '/Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/indicators.py'>

On créer la fonction d'extraction des données

In [170]:
def extract_data(ticker, name, start_date, end_date):
    data = yf.download(ticker, start=start_date, end=end_date, interval="1d")
    data = data.dropna()
    data = data[data["Volume"] > 0]
    data["VariationN"] = (data["Close"].shift(-1) / data["Close"] - 1) * 100
    data["VariationN-1"] = data["VariationN"].shift(1)
    data["Trend"] = (data["VariationN"] > 0).astype(int)
    data["Amplitude"] = data["High"] - data["Low"]


    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)
    data["Date"] = data.index
    data = data.dropna()
    data.columns.name = None
    data.insert(0, 'Indice', name)

    return data

def create_base_clean(indices, start_date, end_date):
    data = pd.DataFrame()
    base_list = []
    for name, ticker in indices.items():
        base_list.append(extract_data(ticker, name, start_date, end_date))
        data = data.dropna()
    base_clean = pd.concat(base_list)
    base_clean = base_clean.reset_index(drop=True)

    return base_clean

In [171]:
def create_base_clean(indices, start_date, end_date):
    data = pd.DataFrame()
    base_list = []
    for name, ticker in indices.items():
        base_list.append(extract_data(ticker, name, start_date, end_date))
        data = data.dropna()
    base_clean = pd.concat(base_list)
    base_clean = base_clean.reset_index(drop=True)

    return base_clean

La fonction qui rajoute les indicateurs

In [172]:
def add_indicators(data, params_ma, params_ema, params_rsi, params_macd, params_std, params_bollinger):
    data = data.copy()

    for x in params_ma:
        col_name = "MA_" + str(x)
        data.loc[:, col_name] = MA(data["Log_Return"], x)

    for x in params_ema:
        col_name = "EMA_" + str(x)
        data.loc[:, col_name] = EMA(data["Log_Return"], x)

    for x in params_rsi:
        col_name = "RSI_" + str(x)
        data.loc[:, col_name] = RSI(data["Close"], x)

    for x in params_std:
        col_name = "STD_" + str(x)
        data.loc[:, col_name] = STD(data["Close"], x)

    data.loc[:, "MACD"], data.loc[:, "Signal"] = MACD(data["Log_Return"], params_macd[0], params_macd[1], params_macd[2])
    data.loc[:, "BollingerBandUpper"], data.loc[:, "BollingerBandLower"] = BollingerBands(data["Close"], params_bollinger[0], params_bollinger[1])

    return data

Création de la base_clean et de la base_indice

In [173]:
def base_normalise(data, norm_window):

    data["Log_Return"] = np.log(data["Close"] / data["Close"].shift(1))

    data["Close_Norm"] = data["Close"] / data["Close"].rolling(norm_window).mean() - 1

    data["Open_Norm"] = data["Open"] / data["Close"] - 1

    data["High_Norm"] = data["High"] / data["Close"] - 1

    data["Low_Norm"] = data["Low"] / data["Close"] - 1

    return data

In [174]:
if isimport:
    base_clean = create_base_clean(indices, start_date, end_date)

    file_path = base_dir + "/base_clean.csv"
    base_clean.to_csv(file_path)

else:
    file_path = base_dir + "/base_clean.csv"
    base_clean = pd.read_csv(file_path)

indice_list = base_clean["Indice"].unique()

base_list = []

for indice in indice_list:
    data = base_clean[base_clean["Indice"] == indice]
    data = base_normalise(data, norm_window)
    data = add_indicators(data, params_ma, params_ema, params_rsi, params_macd, params_std, params_bollinger)

    base_list.append(data)

base_indice = pd.concat(base_list)
base_indice = base_indice.dropna()
base_indice = base_indice.reset_index(drop=True)

file_path = base_dir + "/base_indice.csv"
base_indice.to_csv(file_path)


[*********************100%***********************]  1 of 1 completed


In [175]:
base_corr = base_indice.drop(["Date", "Indice"], axis=1)
correlation_matrix = base_corr.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")

file_path = plot_dir + "/correlation_heatmap.png"
plt.savefig(file_path, bbox_inches="tight", dpi=300)

plt.close()
print(f"✅ Heatmap enregistrée dans {file_path}")

✅ Heatmap enregistrée dans /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/plots/correlation_heatmap.png


Préparation pour entrainement

In [176]:
X = base_indice.drop(columns=["VariationN", "Indice", "Date", "Open", "Close", "High", "Low", "Volume", "Close_Norm", "Open_Norm", "High_Norm", "Low_Norm", "Log_Return"])
Y = base_indice["VariationN"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = test_size)

Régression linéaire

In [177]:
model = LinearRegression()
model.fit(X_train, Y_train)

# Affichage des coefficients
coefficients_linéaire = pd.DataFrame(np.append(model.intercept_, model.coef_),
                                    index=["Intercept"] + list(X_train.columns),
                                    columns=["Coefficients (Sklearn)"])

# Prédictions avec Scikit-Learn
Y_pred = model.predict(X_test)


mse_standard = mean_squared_error(Y_test, Y_pred)
mae_standard = mean_absolute_error(Y_test, Y_pred)
r2_standard = r2_score(Y_test, Y_pred)

# Standardisation des données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Réentraîner le modèle avec les données standardisées
model = LinearRegression()
model.fit(X_train_scaled, Y_train)  

coefficients_standardisés = pd.DataFrame(np.append(model.intercept_, model.coef_),
                                         index=["Intercept"] + list(X_train.columns),
                                         columns=["Coefficients Standardisés"])

Y_pred_scaled = model.predict(X_test_scaled)

mse_scaled = mean_squared_error(Y_test, Y_pred_scaled)
mae_scaled = mean_absolute_error(Y_test, Y_pred_scaled)
r2_scaled = r2_score(Y_test, Y_pred_scaled)


# Modèle Ridge
ridge_model = Ridge(alpha = 0.1)
ridge_model.fit(X_train_scaled, Y_train)

coefficients_ridge = pd.DataFrame(np.append(ridge_model.intercept_, ridge_model.coef_),
                                  index=["Intercept"] + list(X_train.columns),
                                  columns=["Coefficients (Ridge)"])

Y_pred_ridge = ridge_model.predict(X_test_scaled)

mse_ridge = mean_squared_error(Y_test, Y_pred_ridge)
mae_ridge = mean_absolute_error(Y_test, Y_pred_ridge)
r2_ridge = r2_score(Y_test, Y_pred_ridge)



# Modèle Lasso
lasso_model = Lasso(alpha = 0.001)
lasso_model.fit(X_train_scaled, Y_train)

coefficients_lasso = pd.DataFrame(np.append(lasso_model.intercept_, lasso_model.coef_),
                                  index=["Intercept"] + list(X_train.columns),
                                  columns=["Coefficients (Lasso)"])

Y_pred_lasso = lasso_model.predict(X_test_scaled)

mse_lasso = mean_squared_error(Y_test, Y_pred_lasso)
mae_lasso = mean_absolute_error(Y_test, Y_pred_lasso)
r2_lasso = r2_score(Y_test, Y_pred_lasso)


# Modèle ElasticNet
elasticnet_model = ElasticNet(alpha = 0.001, l1_ratio = 0.5)
elasticnet_model.fit(X_train_scaled, Y_train)

coefficients_elasticnet = pd.DataFrame(np.append(elasticnet_model.intercept_, elasticnet_model.coef_),
                                       index=["Intercept"] + list(X_train.columns),
                                       columns=["Coefficients (ElasticNet)"])

Y_pred_elasticnet = elasticnet_model.predict(X_test_scaled)

mse_elasticnet = mean_squared_error(Y_test, Y_pred_elasticnet)
mae_elasticnet = mean_absolute_error(Y_test, Y_pred_elasticnet)
r2_elasticnet = r2_score(Y_test, Y_pred_elasticnet)

# Comparaison des métriques
metrics_comparison = pd.DataFrame({
    "Modèle": [
        "Régression Linéaire",
        "Régression Linéaire Standardisée",
        "Régression Ridge",
        "Régression Lasso",
        "Régression ElasticNet"
    ],
    "MSE": [
        mse_standard,
        mse_scaled,
        mse_ridge,
        mse_lasso,
        mse_elasticnet
    ],
    "MAE": [
        mae_standard,
        mae_scaled,
        mae_ridge,
        mae_lasso,
        mae_elasticnet
    ],
    "R²": [
        r2_standard,
        r2_scaled,
        r2_ridge,
        r2_lasso,
        r2_elasticnet
    ]
})

print("📊 Comparaison des métriques :")
display(metrics_comparison)

# Comparaison des coefficients
coefficients_comparison = pd.concat(
    [
        coefficients_linéaire.rename(columns={"Coefficients (Sklearn)": "Régression Linéaire"}),
        coefficients_standardisés.rename(columns={"Coefficients Standardisés": "Régression Linéaire Standardisée"}),
        coefficients_ridge.rename(columns={"Coefficients (Ridge)": "Régression Ridge"}),
        coefficients_lasso.rename(columns={"Coefficients (Lasso)": "Régression Lasso"}),
        coefficients_elasticnet.rename(columns={"Coefficients (ElasticNet)": "Régression ElasticNet"})
    ],
    axis=1
)

print("📊 Comparaison des coefficients :")
display(coefficients_comparison)

📊 Comparaison des métriques :


Unnamed: 0,Modèle,MSE,MAE,R²
0,Régression Linéaire,0.537251,0.505369,0.46976
1,Régression Linéaire Standardisée,0.537251,0.505369,0.46976
2,Régression Ridge,0.537252,0.505367,0.46976
3,Régression Lasso,0.537027,0.50467,0.469981
4,Régression ElasticNet,0.537189,0.504877,0.469821


📊 Comparaison des coefficients :


Unnamed: 0,Régression Linéaire,Régression Linéaire Standardisée,Régression Ridge,Régression Lasso,Régression ElasticNet
Intercept,-0.778093,0.034032,0.034032,0.034032,0.034032
VariationN-1,0.011123,0.011574,0.011556,0.0,0.002348
Trend,1.39154,0.694452,0.694447,0.693606,0.693727
Amplitude,0.000461,0.00835,0.008351,0.004678,0.006758
MA_30,2.932504,0.005195,0.005146,-0.019331,-0.014851
EMA_5,-30.070054,-0.138759,-0.138574,-0.037772,-0.058898
RSI_14,0.001612,0.028326,0.028312,0.018213,0.021267
STD_10,0.001675,0.030933,0.030931,0.021018,0.025892
STD_20,-0.000141,-0.031257,-0.03125,-0.01611,-0.022536
MACD,76.087479,0.100235,0.100044,0.0,0.0205
