# Ressources

In [418]:
import pandas as pd

# Utilisation de yahoo finance api pour importer les daily stock et les index de prix
import yfinance as yf # https://pypi.org/project/yfinance/ # Il y a plein de fonctions sympa (.actions, .dividends, .splits ...)
import plotly.express as px


# fix pour plotly express et Visual Studio Code
import plotly.io as pio
pio.renderers.default = "notebook_connected"

# Data

In [419]:
# LEs index sont associés à plusieurs ticker symboles .. ^GSPC, INX, MSFT, $SPX ....

# Nous, on veut le CAC 40 (on est en France :p), le code est ^FCHI

cac40 = yf.Ticker("^FCHI") # https://fr.finance.yahoo.com/quote/%5EFCHI?p=%5EFCHI
cac40

yfinance.Ticker object <^FCHI>

In [420]:
# Maintenant on a besoin d'extraire l'historique...
cac40 = cac40.history(period="max") # max permet d'avoir l'historique entier
cac40

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-03-01,1836.000000,1838.000000,1827.000000,1832.000000,0,0,0
1990-03-02,1831.000000,1860.000000,1831.000000,1860.000000,0,0,0
1990-03-05,1866.000000,1874.000000,1862.000000,1874.000000,0,0,0
1990-03-06,1869.000000,1875.000000,1866.000000,1872.000000,0,0,0
1990-03-07,1874.000000,1881.000000,1874.000000,1880.000000,0,0,0
...,...,...,...,...,...,...,...
2022-05-30,6547.859863,6582.049805,6536.770020,6562.390137,69803300,0,0
2022-05-31,6539.859863,6543.419922,6457.529785,6468.799805,162340000,0,0
2022-06-01,6509.189941,6510.259766,6414.220215,6418.890137,74750900,0,0
2022-06-02,6447.370117,6506.600098,6447.299805,6500.439941,57959900,0,0


Open = Prix à l'ouverture du marché
High = Prix le plus haut sur la journée
Low = Prix le plus bas sur la journée
Close = Prix à la fermeture du marché
Volume = Total du volume échangé ce jour
Dividends = ?
Stocks splits = ?

Le but est de prédire si cela va augmenter ou diminuer le jour suivant.
Dans ce but, Dividends et Stocks splits ne seront pas utiles.

# Visualisation

In [421]:
fig = px.line(cac40, y='Close', x=cac40.index, title="Evolution des prix à la fermeture des marchés depuis 1990")
fig.show()

Dommage qu'on n'ait pas acheté en 1995.... :P

# Suppression des colonnes inutiles

In [422]:
cac40.drop(['Dividends', 'Stock Splits'], axis=1, inplace=True)
cac40

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-03-01,1836.000000,1838.000000,1827.000000,1832.000000,0
1990-03-02,1831.000000,1860.000000,1831.000000,1860.000000,0
1990-03-05,1866.000000,1874.000000,1862.000000,1874.000000,0
1990-03-06,1869.000000,1875.000000,1866.000000,1872.000000,0
1990-03-07,1874.000000,1881.000000,1874.000000,1880.000000,0
...,...,...,...,...,...
2022-05-30,6547.859863,6582.049805,6536.770020,6562.390137,69803300
2022-05-31,6539.859863,6543.419922,6457.529785,6468.799805,162340000
2022-06-01,6509.189941,6510.259766,6414.220215,6418.890137,74750900
2022-06-02,6447.370117,6506.600098,6447.299805,6500.439941,57959900


# Features Engineering
## Colonne tomorrow

In [423]:
cac40["Tomorrow"] = cac40['Close'].shift(-1) # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shift.html # Permet de récupérer la valeur de la ligne et de le reporter 1 ligne précédente (-1)
cac40

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990-03-01,1836.000000,1838.000000,1827.000000,1832.000000,0,1860.000000
1990-03-02,1831.000000,1860.000000,1831.000000,1860.000000,0,1874.000000
1990-03-05,1866.000000,1874.000000,1862.000000,1874.000000,0,1872.000000
1990-03-06,1869.000000,1875.000000,1866.000000,1872.000000,0,1880.000000
1990-03-07,1874.000000,1881.000000,1874.000000,1880.000000,0,1917.000000
...,...,...,...,...,...,...
2022-05-30,6547.859863,6582.049805,6536.770020,6562.390137,69803300,6468.799805
2022-05-31,6539.859863,6543.419922,6457.529785,6468.799805,162340000,6418.890137
2022-06-01,6509.189941,6510.259766,6414.220215,6418.890137,74750900,6500.439941
2022-06-02,6447.370117,6506.600098,6447.299805,6500.439941,57959900,6485.299805


## Variable Target

In [424]:
cac40['Target'] = (cac40['Tomorrow'] > cac40['Close']).astype(int) # si le cours à la fermeture de demain est plus élevé qu'à la fermeture d'aujourd'hui => 1 / Sinon 0
cac40

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-03-01,1836.000000,1838.000000,1827.000000,1832.000000,0,1860.000000,1
1990-03-02,1831.000000,1860.000000,1831.000000,1860.000000,0,1874.000000,1
1990-03-05,1866.000000,1874.000000,1862.000000,1874.000000,0,1872.000000,0
1990-03-06,1869.000000,1875.000000,1866.000000,1872.000000,0,1880.000000,1
1990-03-07,1874.000000,1881.000000,1874.000000,1880.000000,0,1917.000000,1
...,...,...,...,...,...,...,...
2022-05-30,6547.859863,6582.049805,6536.770020,6562.390137,69803300,6468.799805,0
2022-05-31,6539.859863,6543.419922,6457.529785,6468.799805,162340000,6418.890137,0
2022-06-01,6509.189941,6510.259766,6414.220215,6418.890137,74750900,6500.439941,1
2022-06-02,6447.370117,6506.600098,6447.299805,6500.439941,57959900,6485.299805,0


<span style="color: #F00000">Si les données étaient plus vieilles, on aurait pu en supprimer... Là, seulement une période de 30 ans, ça me parait cohérent.</span>

# Predictions
## Split la data

In [425]:
from sklearn.model_selection import train_test_split

# Choix des predicteurs

predictors = ['Close', 'Volume', 'Open', 'High', 'Low'] # On ne peut pas savoir tomorrow à l'avance.

x_train, x_test, y_train, y_test = train_test_split(cac40[predictors],
                                                    cac40['Target'],
                                                    test_size=0.33,
                                                    random_state=0) 

## RandomForestClassifier

In [426]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=0)

rf.fit(x_train, y_train)

RandomForestClassifier(min_samples_split=100, random_state=0)

## Scoring

In [427]:
# Score classique
rf.score(x_test, y_test)

0.5107327905255367

In [428]:
from sklearn.metrics import precision_score
# The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. 
# The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.

#Precision_score
preds = rf.predict(x_test)
preds = pd.Series(preds, index=x_test.index)
preds

Date
1993-01-29    0
2009-02-12    1
1994-01-04    1
2021-07-14    0
2010-01-07    0
             ..
2009-10-01    0
1999-12-23    0
2018-07-03    1
1995-07-20    1
2004-12-06    0
Length: 2702, dtype: int32

In [429]:
precision_score(y_test, preds)

0.5316877918612408

# Piste d'amélioration du modèle
## Moyennes mobiles

In [430]:
periodes = [2,5,60,250,1000]
new_predictors = []
# Nous allons calculer la moyenne mobile du prix de fermeture des deux derniers jours, des 5 derniers jours (1 semaine), des trois derniers mois (60), des 250 derniers jours (1 an) et 1000 derniers jours (4 ans)
# Ne pas oublier qu'une semaine ici dure 5j
# Nous allons ensuite calculer le ratio entre les prix de fermeture de ces périodes

for periode in periodes:
    moyenne_mobile = cac40.rolling(periode).mean() # right par défault ici, donc on inclut la ligne où on est.
    
    ratio = f"Close_Ratio_Day{periode}"
    cac40[ratio] = cac40['Close'] / moyenne_mobile['Close']
    
    
    tendance_column = f"Tendance_Day{periode}"
    cac40[tendance_column] = cac40.shift(1).rolling(periode).sum()['Target'] # Moyenne mobile de la cible... ca permet de voir si ça augmente récemment ou pas.
    # Pour la tendance, c'est différent car avec le shift, on ne prend pas le jour actuel. On est obligé sinon on a du data leakage
    
    new_predictors += [ratio, tendance_column]

In [431]:
cac40

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow,Target,Close_Ratio_Day2,Tendance_Day2,Close_Ratio_Day5,Tendance_Day5,Close_Ratio_Day60,Tendance_Day60,Close_Ratio_Day250,Tendance_Day250,Close_Ratio_Day1000,Tendance_Day1000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1990-03-01,1836.000000,1838.000000,1827.000000,1832.000000,0,1860.000000,1,,,,,,,,,,
1990-03-02,1831.000000,1860.000000,1831.000000,1860.000000,0,1874.000000,1,1.007584,,,,,,,,,
1990-03-05,1866.000000,1874.000000,1862.000000,1874.000000,0,1872.000000,0,1.003749,2.0,,,,,,,,
1990-03-06,1869.000000,1875.000000,1866.000000,1872.000000,0,1880.000000,1,0.999466,1.0,,,,,,,,
1990-03-07,1874.000000,1881.000000,1874.000000,1880.000000,0,1917.000000,1,1.002132,1.0,1.008800,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-30,6547.859863,6582.049805,6536.770020,6562.390137,69803300,6468.799805,0,1.003566,2.0,1.024077,4.0,1.018004,31.0,0.976968,133.0,1.150121,544.0
2022-05-31,6539.859863,6543.419922,6457.529785,6468.799805,162340000,6418.890137,0,0.992818,1.0,1.002723,4.0,1.002431,31.0,0.963110,132.0,1.133499,543.0
2022-06-01,6509.189941,6510.259766,6414.220215,6418.890137,74750900,6500.439941,1,0.996127,0.0,0.991291,3.0,0.993576,31.0,0.955792,131.0,1.124548,542.0
2022-06-02,6447.370117,6506.600098,6447.299805,6500.439941,57959900,6485.299805,0,1.006312,1.0,1.001107,3.0,1.004806,32.0,0.968015,131.0,1.138615,542.0


Bon, il est normal d'avoir des NaN... Par exemple, on ne peut pas faire la moyenne mobile des deux jours précédents si on a pas deux jours précédents...

In [432]:
cac40.dropna(inplace=True)
cac40

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow,Target,Close_Ratio_Day2,Tendance_Day2,Close_Ratio_Day5,Tendance_Day5,Close_Ratio_Day60,Tendance_Day60,Close_Ratio_Day250,Tendance_Day250,Close_Ratio_Day1000,Tendance_Day1000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1994-03-04,2166.100098,2178.699951,2155.899902,2178.699951,0,2219.899902,1,1.007864,1.0,0.995422,2.0,0.965145,31.0,1.041699,126.0,1.155651,499.0
1994-03-07,2199.100098,2225.500000,2196.699951,2219.899902,0,2216.399902,0,1.009367,2.0,1.012312,3.0,0.983238,32.0,1.060961,126.0,1.177280,499.0
1994-03-08,2216.800049,2224.899902,2206.300049,2216.399902,0,2199.699951,0,0.999211,1.0,1.012721,2.0,0.981542,32.0,1.058869,125.0,1.175211,498.0
1994-03-09,2211.800049,2213.899902,2191.600098,2199.699951,0,2184.600098,0,0.996218,0.0,1.003568,2.0,0.973836,32.0,1.050475,125.0,1.166153,498.0
1994-03-10,2209.000000,2217.600098,2180.800049,2184.600098,0,2175.000000,0,0.996556,0.0,0.993063,2.0,0.966994,31.0,1.042874,125.0,1.157961,497.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-27,6445.250000,6519.729980,6424.850098,6515.750000,70892100,6562.390137,1,1.008136,2.0,1.023303,4.0,1.011250,30.0,0.970034,132.0,1.142195,544.0
2022-05-30,6547.859863,6582.049805,6536.770020,6562.390137,69803300,6468.799805,0,1.003566,2.0,1.024077,4.0,1.018004,31.0,0.976968,133.0,1.150121,544.0
2022-05-31,6539.859863,6543.419922,6457.529785,6468.799805,162340000,6418.890137,0,0.992818,1.0,1.002723,4.0,1.002431,31.0,0.963110,132.0,1.133499,543.0
2022-06-01,6509.189941,6510.259766,6414.220215,6418.890137,74750900,6500.439941,1,0.996127,0.0,0.991291,3.0,0.993576,31.0,0.955792,131.0,1.124548,542.0


# Prediction du modèle amélioré
## Split la data

In [433]:
x_train, x_test, y_train, y_test = train_test_split(cac40[new_predictors],
                                                    cac40['Target'],
                                                    test_size=0.33,
                                                    random_state=0) 

## RandomForestClassifier

In [434]:
rf = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=0)
rf.fit(x_train, y_train)


RandomForestClassifier(min_samples_split=100, random_state=0)

## Scoring

In [435]:
rf.score(x_test, y_test)


0.5012647554806071

In [436]:
preds = rf.predict(x_test)
preds = pd.Series(preds, index=x_test.index)
precision_score(y_test, preds)

0.5215453194650818

Pas vraiment convaincant...
# Hyperparamètres

In [437]:
import optuna

Optuna

In [438]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100,2000)
    min_samples_split = trial.suggest_int('min_samples_split', 50, 2000, step=50)
    random_state = trial.suggest_int('random_state', 0, 42)
    
    rf = RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split, random_state=random_state)
    rf.fit(x_train, y_train)
    score = rf.score(x_test, y_test)
    return score


study = optuna.create_study(direction="maximize", study_name="optimisation_rf", storage='sqlite:///./optuna/optimisation.db', load_if_exists=True)
study.optimize(objective, n_trials=50, n_jobs=-1)
    

[32m[I 2022-06-05 00:37:58,840][0m Using an existing study with name 'optimisation_rf' instead of creating a new one.[0m

`n_jobs` argument has been deprecated in v2.7.0. This feature will be removed in v4.0.0. See https://github.com/optuna/optuna/releases/tag/v2.7.0.

[32m[I 2022-06-05 00:38:02,628][0m Trial 81 finished with value: 0.5282462057335582 and parameters: {'n_estimators': 101, 'min_samples_split': 1600, 'random_state': 19}. Best is trial 77 with value: 0.5396290050590219.[0m
[32m[I 2022-06-05 00:38:16,308][0m Trial 84 finished with value: 0.5299325463743676 and parameters: {'n_estimators': 1791, 'min_samples_split': 1600, 'random_state': 35}. Best is trial 77 with value: 0.5396290050590219.[0m
[32m[I 2022-06-05 00:38:17,552][0m Trial 82 finished with value: 0.531618887015177 and parameters: {'n_estimators': 1803, 'min_samples_split': 1400, 'random_state': 19}. Best is trial 77 with value: 0.5396290050590219.[0m
[32m[I 2022-06-05 00:38:17,758][0m Trial 79 finis

## Résultats

In [439]:
trial_optuna = study.best_trial
print(f'Meilleur score : {trial_optuna.value}')
print(f'Meilleurs hyperparamètres : {trial_optuna.params}')


Meilleur score : 0.5417369308600337
Meilleurs hyperparamètres : {'min_samples_split': 1450, 'n_estimators': 1566, 'random_state': 20}


In [440]:
optuna.visualization.plot_optimization_history(study)

## Predictions avec Optuna

In [441]:
rf = RandomForestClassifier(**trial_optuna.params)

In [442]:
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

0.5417369308600337