In [145]:
import yfinance as yf
import matplotlib as plt
from indicators import *
import os
import importlib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

importlib.reload(indicators)
print(dir(indicators))

['BollingerBands', 'EMA', 'MA', 'MACD', 'RSI', 'STD', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'pandas', 'pd']


In [106]:
start_date = '1950-01-01'
end_date = '2025-01-01'

In [107]:
indices = {
    "SP500": "^GSPC",
    "Nasdaq100": "^NDX",
    "DowJones": "^DJI",
    "Russell2000": "^RUT",
    "FTSE100": "^FTSE",
    "DAX": "^GDAXI",
    "CAC40": "^FCHI",
    "Nikkei225": "^N225",
    "HangSeng": "^HSI",
    "EuroStoxx50": "^STOXX50E"
}

In [108]:
def extract_data(ticker, start_date, end_date):
    data = yf.download(ticker, start=start_date, end=end_date, interval="1d")

    data = data[data["Volume"] > 0]
    data["V_t"] = (data["Close"].diff() / data["Close"].shift()) * 100
    data["MA5"] = MA(data["Close"], 5)
    data["MA15"] = MA(data["Close"], 15)
    data["EMA5"] = EMA(data["Close"], 5)
    data["EMA15"] = EMA(data["Close"], 15)
    data["RSI14"] = RSI(data["Close"], 14)
    data["MACD"], data["Signal"] = MACD(data["Close"], 12, 26)
    data["STD10"] = STD(data["Close"], 10)
    data["STD20"] = STD(data["Close"], 20)
    data["BollingerBandUpper"], data["BollingerBandLower"] = BollingerBands(data["Close"], 20, 2)
    data = data.dropna()

    data = data.drop("Adj Close", axis=1)
    if isinstance(data.columns, pd.MultiIndex):
        # On garde seulement le premier niveau de header
        data.columns = data.columns.get_level_values(0)

    # On transforme la date en colonne
    data = data.reset_index()
    data["Date"] = pd.to_datetime(data["Date"]).dt.date

    return data

In [109]:
base_list = []

In [110]:
for name, ticker in indices.items():
    base_dir = os.path.join(os.getcwd(), "base")

    data = extract_data(ticker, start_date, end_date)

    data.insert(0, 'Indice', name)

    base_list.append(data)

    
    if data is not None:
        file_path = os.path.join(base_dir, f"{name}.csv")
        data.to_csv(file_path)
        print(f"✅ Fichier enregistré : {file_path}")

    BaseTotal = pd.concat(base_list)

    file_path = os.path.join(base_dir, "BaseTotal.csv")
    BaseTotal.to_csv(file_path)

[*********************100%***********************]  1 of 1 completed


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/SP500.csv


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^NDX']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'query2.finance.yahoo.com\', port=443): Max retries exceeded with url: /v8/finance/chart/%5ENDX?period1=-631134000&period2=1735707600&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=CKV.zkgRvwc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x13f291910>: Failed to resolve \'query2.finance.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/Nasdaq100.csv


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^DJI']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'query2.finance.yahoo.com\', port=443): Max retries exceeded with url: /v8/finance/chart/%5EDJI?period1=-631134000&period2=1735707600&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=CKV.zkgRvwc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x16e2c6390>: Failed to resolve \'query2.finance.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/DowJones.csv


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^RUT']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'query2.finance.yahoo.com\', port=443): Max retries exceeded with url: /v8/finance/chart/%5ERUT?period1=-631134000&period2=1735707600&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=CKV.zkgRvwc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x16e2ee210>: Failed to resolve \'query2.finance.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/Russell2000.csv


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^FTSE']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'query2.finance.yahoo.com\', port=443): Max retries exceeded with url: /v8/finance/chart/%5EFTSE?period1=-631152000&period2=1735689600&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=CKV.zkgRvwc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x17747a330>: Failed to resolve \'query2.finance.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/FTSE100.csv


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^GDAXI']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'query2.finance.yahoo.com\', port=443): Max retries exceeded with url: /v8/finance/chart/%5EGDAXI?period1=-631155600&period2=1735686000&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=CKV.zkgRvwc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x176fde120>: Failed to resolve \'query2.finance.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/DAX.csv


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^FCHI']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'query2.finance.yahoo.com\', port=443): Max retries exceeded with url: /v8/finance/chart/%5EFCHI?period1=-631155600&period2=1735686000&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=CKV.zkgRvwc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x16e2efe60>: Failed to resolve \'query2.finance.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/CAC40.csv


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^N225']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'query2.finance.yahoo.com\', port=443): Max retries exceeded with url: /v8/finance/chart/%5EN225?period1=-631184400&period2=1735657200&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=CKV.zkgRvwc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x167930920>: Failed to resolve \'query2.finance.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/Nikkei225.csv


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^HSI']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'query2.finance.yahoo.com\', port=443): Max retries exceeded with url: /v8/finance/chart/%5EHSI?period1=-631180800&period2=1735660800&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=CKV.zkgRvwc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x16e2c57c0>: Failed to resolve \'query2.finance.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/HangSeng.csv


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['^STOXX50E']: ConnectionError(MaxRetryError('HTTPSConnectionPool(host=\'query2.finance.yahoo.com\', port=443): Max retries exceeded with url: /v8/finance/chart/%5ESTOXX50E?period1=-631155600&period2=1735686000&interval=1d&includePrePost=False&events=div%2Csplits%2CcapitalGains&crumb=CKV.zkgRvwc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x16e2edfa0>: Failed to resolve \'query2.finance.yahoo.com\' ([Errno 8] nodename nor servname provided, or not known)"))'))


✅ Fichier enregistré : /Users/romaindesmeulemester/ProjetMachineLearning/ProjetMachineLearning/base/EuroStoxx50.csv


Régression linéaire

In [139]:
BaseTotal = BaseTotal.reset_index(drop=True)

# Séparation des features et de la cible
X = BaseTotal.drop(columns=["V_t", "Indice", "Date"])
Y = BaseTotal["V_t"]

# Division en train et test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Création et entraînement du modèle
model = LinearRegression()
model.fit(X_train, Y_train)

# Affichage des coefficients
coefficients_linéaire = pd.DataFrame(np.append(model.intercept_, model.coef_),
                                    index=["Intercept"] + list(X_train.columns),
                                    columns=["Coefficients (Sklearn)"])

# Prédictions avec Scikit-Learn
Y_pred = model.predict(X_test)

# Affichage des premières valeurs réelles vs prédites
resultats = pd.DataFrame({"Valeur réelle": Y_test.values, "Prédiction": Y_pred})
display(resultats.head(10))

mse = mean_squared_error(Y_test, Y_pred)
mae = mean_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print(f"MSE : {mse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"R² : {r2:.4f}")

Unnamed: 0,Valeur réelle,Prédiction
0,0.511033,0.275155
1,-0.1019,0.193339
2,-2.9537,-0.522204
3,0.209032,0.331738
4,1.171289,0.632279
5,-0.93278,-0.295923
6,-0.34814,-0.180649
7,0.516013,0.638766
8,0.438335,-0.109275
9,-0.008946,0.00957


MSE : 0.5534
MAE : 0.4804
R² : 0.4256


In [143]:
# Ajout de la colonne de biais (intercept)
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# Calcul des coefficients via OLS
theta_best = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ Y_train

# Affichage des coefficients
coefficients_OLS = pd.DataFrame(theta_best, index=["Intercept"] + list(X_train.columns), columns=["Coefficients"])

# Prédiction sur l'ensemble de test
Y_pred_OLS = X_test_b @ theta_best

# Affichage des premières prédictions vs vraies valeurs
resultats_OLS = pd.DataFrame({"Valeur réelle": Y_test.values, "Prédiction OLS": Y_pred_OLS.flatten()})

display(resultats_OLS.head(10))

# Calcul des métriques d'évaluation
mse_OLS = mean_squared_error(Y_test, Y_pred_OLS)
mae_OLS = mean_absolute_error(Y_test, Y_pred_OLS)
r2_OLS = r2_score(Y_test, Y_pred_OLS)

# Affichage des résultats
print(f"📊 Performance du modèle OLS :")
print(f"MSE (Erreur quadratique moyenne) : {mse_OLS:.4f}")
print(f"MAE (Erreur absolue moyenne) : {mae_OLS:.4f}")
print(f"R² Score : {r2_OLS:.4f}")

Unnamed: 0,Valeur réelle,Prédiction OLS
0,0.511033,0.273235
1,-0.1019,0.191438
2,-2.9537,-0.52236
3,0.209032,0.330777
4,1.171289,0.630117
5,-0.93278,-0.295759
6,-0.34814,-0.180728
7,0.516013,0.635911
8,0.438335,-0.109304
9,-0.008946,0.008734


📊 Performance du modèle OLS :
MSE (Erreur quadratique moyenne) : 0.5535
MAE (Erreur absolue moyenne) : 0.4804
R² Score : 0.4255


In [147]:
# 🔹 1. Préparation des données
BaseTotal = BaseTotal.reset_index(drop=True)

# Séparation des features et de la cible
X = BaseTotal.drop(columns=["V_t", "Indice", "Date"])  # Features
Y = BaseTotal["V_t"]  # Cible

# Division en train et test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Standardisation des features pour éviter l'explosion des valeurs
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)

# Ajout de la colonne de biais (intercept)
X_train_b = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
X_test_b = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

# 🔹 2. Initialisation des poids (paramètres)
n_features = X_train_b.shape[1]  # Nombre de variables + biais
theta = np.random.randn(n_features, 1) * 0.01  # Petites valeurs pour éviter la divergence

# Conversion de Y_train en matrice NumPy
Y_train_np = Y_train.values.reshape(-1, 1)

# 🔹 3. Descente de gradient
alpha = 0.01  # Taux d'apprentissage
n_iterations = 5000  # Nombre d'itérations
m = X_train_b.shape[0]  # Nombre d'exemples
losses = []  # Liste pour stocker l'évolution de la perte

for iteration in range(n_iterations):
    # Calcul des prédictions
    Y_pred_train = X_train_b @ theta  
    
    # Calcul des erreurs
    errors = Y_pred_train - Y_train_np  
    
    # Calcul des gradients
    gradients = (2/m) * X_train_b.T @ errors  
    
    # Mise à jour des poids
    theta -= alpha * gradients  
    
    # Calcul de la perte (MSE)
    mse = np.mean(errors**2)
    losses.append(mse)
    
    # Arrêt précoce si la perte devient stable
    if iteration > 1 and abs(losses[-1] - losses[-2]) < 1e-10:
        print(f"✅ Convergence atteinte à l'itération {iteration}")
        break

# 🔹 4. Prédiction avec le modèle entraîné
Y_pred_GD = X_test_b @ theta  

# 🔹 5. Évaluation du modèle
mse_GD = mean_squared_error(Y_test, Y_pred_GD)
mae_GD = mean_absolute_error(Y_test, Y_pred_GD)
r2_GD = r2_score(Y_test, Y_pred_GD)

print(f"\n📊 Performance du modèle (Descente de Gradient) :")
print(f"MSE : {mse_GD:.4f}")
print(f"MAE : {mae_GD:.4f}")
print(f"R² : {r2_GD:.4f}")

# 🔹 Affichage des coefficients trouvés
coefficients_GD = pd.DataFrame(theta, index=["Intercept"] + list(X_train.columns), columns=["Coefficients (GD)"])
display(coefficients_GD)

resultats_GD = pd.DataFrame({"Valeur réelle": Y_test.values, "Prédiction GD": Y_pred_GD.flatten()})
display(resultats_GD.head(10))


📊 Performance du modèle (Descente de Gradient) :
MSE : 0.9189
MAE : 0.6364
R² : 0.0462


Unnamed: 0,Coefficients (GD)
Intercept,0.03454
Close,0.456194
High,0.094865
Low,0.126505
Open,-0.238394
Volume,-0.024382
MA5,-0.117899
MA15,-0.064718
EMA5,-0.030761
EMA15,-0.08455


Unnamed: 0,Valeur réelle,Prédiction GD
0,0.511033,-0.0543
1,-0.1019,0.081085
2,-2.9537,-0.248613
3,0.209032,0.414975
4,1.171289,0.134744
5,-0.93278,-0.373194
6,-0.34814,-0.236436
7,0.516013,0.038935
8,0.438335,-0.259218
9,-0.008946,-0.067578
