<a href="https://colab.research.google.com/github/VirgileH24/DataScience_Lessons/blob/main/TP_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px

import pandas_datareader.data as web
import datetime

# Import libraries Keras 
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

In [7]:
start_day = 3
start_month = 1
start_year = 2017
start_date = datetime.datetime(start_year,start_month,start_day)


end_day = 26
end_month = 2
end_year = 2021
end_date = datetime.datetime(end_year,end_month,end_day)

print(start_date,end_date)

2017-01-03 00:00:00 2021-02-26 00:00:00


In [8]:
# load data
Sanofi_data = web.DataReader("SAN.PA", 'yahoo', start_date, end_date)


In [9]:
Sanofi_data  = Sanofi_data.rename_axis('Date').reset_index()


In [10]:
plt.figure(figsize=(20,10))
px.line(Sanofi_data,x="Date", y="Close")

<Figure size 1440x720 with 0 Axes>

In [11]:
def train_test_split(base,split_date):
  train = base[base.Date <= split_date]
  test = base[base.Date > split_date]
  print("taille base d'entrainement:",train.shape)
  print("taille base de test:",test.shape)
  return train,test


split_date = datetime.datetime(2021,1,29)
df_train,df_test = train_test_split(Sanofi_data,split_date)

taille base d'entrainement: (1042, 7)
taille base de test: (20, 7)


In [12]:
# conversion de la feature Close en values pour keras 
train = df_train.iloc[:, 4:5]
training_set = df_train.iloc[:, 4:5].values

# conversion de la feature Close en values pour keras 
test = df_test.iloc[:, 4:5]
testing_set = df_test.iloc[:, 4:5].values

In [13]:
train.head()

Unnamed: 0,Close
0,78.300003
1,78.300003
2,78.269997
3,76.690002
4,77.519997


In [14]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(training_set)

In [15]:
# Creation du train set pour une prévision de la valeur de l'action au jour j 
# basée sur les 60 jours précédents (3 mois) 
X_train = []
y_train = []
# Dans X_train, les 60 jours précédents le jour i
# Dans y_train, le jour i
# training_set_scaled étant une array, il faut ajouter le numéro de colonne, soit 0 
for i in range(60, len(training_set)):
    X_train.append(training_set_scaled[i-60:i, 0])
    y_train.append(training_set_scaled[i, 0])
# transformation des listes X_train et y_train en array avec numpy    
X_train, y_train = np.array(X_train), np.array(y_train)

In [16]:
X_train.shape,y_train.shape

((982, 60), (982,))

In [17]:
X_train

array([[0.48552133, 0.48552133, 0.48455588, ..., 0.64639649, 0.63256121,
        0.65186618],
       [0.48552133, 0.48455588, 0.43371952, ..., 0.63256121, 0.65186618,
        0.66698849],
       [0.48455588, 0.43371952, 0.4604246 , ..., 0.65186618, 0.66698849,
        0.68532824],
       ...,
       [0.52027018, 0.59362946, 0.75933067, ..., 0.5871943 , 0.58848133,
        0.57464606],
       [0.59362946, 0.75933067, 0.72265115, ..., 0.58848133, 0.57464606,
        0.510296  ],
       [0.75933067, 0.72265115, 0.64703988, ..., 0.57464606, 0.510296  ,
        0.51093939]])

In [18]:
X_train = np.reshape(X_train, (X_train.shape[0],X_train.shape[1],1))
X_train.shape

(982, 60, 1)

## Partie 1 RNN

In [19]:
# Initialisation RNN avec Sequential()
regressor = Sequential()

# première couche LSTM et régularisation Dropout
# units = nbre de neurones de la couche
# return_sequences=True car dans le réseau on empile plusieurs couches LSTM
# dans input_shape, indication du train set avec le nbre de time steps (60) et le nbre de feature, ici 1 
regressor.add(LSTM(units = 50, return_sequences = True, input_shape = (X_train.shape[1], 1)))
regressor.add(Dropout(0.2))



In [20]:
# deuxième couche LSTM layer et régularisation Dropout identique couche précédente
# entrées de cette couche = sortie de la couche précédente : input_shape inutile 
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))

# troisième couche LSTM layer et régularisation Dropout
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))


regressor.add(LSTM(units = 50, return_sequences = False))
regressor.add(Dropout(0.2))

In [21]:
# couche de sortie 1 seul neurone
regressor.add(Dense(units = 1))

# Compilation RNN
# optimizer : cf doc Keras adam (ou RMSprop recommandé pour les RNN)
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

regressor.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 60, 50)            10400     
_________________________________________________________________
dropout (Dropout)            (None, 60, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 60, 50)            20200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 50)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 60, 50)            20200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 60, 50)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                2

In [22]:
regressor.fit(X_train, y_train, epochs = 50, batch_size = 32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f91e81ae7d0>

In [23]:
real_stock_price = testing_set

# prédictions du feature Close 
# concaténation des datasets train et test d'origine, au niveau des lignes avec axis = 0 (l'un au dessus de l'autre)
dataset_total = pd.concat((df_train['Close'], df_test['Close']), axis = 0)
# dans inputs on met les 60 jours précédents le jour à prédire et ceci pour tous les jours du mois à prédire :
# transformation en array pour keras
inputs = dataset_total[len(dataset_total) - len(df_test) - 60:].values
# redimensionnement nécessaire du dataset 
inputs = inputs.reshape(-1,1)
# utilisation du meme objet sc utilisée pour l'entrainement
inputs = sc.transform(inputs)

# alimentation du X_test avec les valeurs inputs correspondantes 
X_test = []
# range de 60 à 80 puisque test set de 20 lignes
for i in range(60, 80):
    X_test.append(inputs[i-60:i, 0])
# transformation en array    
X_test = np.array(X_test)
# redimensionnement en 3 dimensions avec reshape pour obtenir un X_test avec 20 lignes, 60 colonnes et 1 en troisième dimension
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
# prévisions avec fonction predict sur le modèle regressor 
predicted_stock_price = regressor.predict(X_test)
# prévisions sur l'échelle d'orignie avec transformation inverse
predicted_stock_price = sc.inverse_transform(predicted_stock_price)

In [24]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

print("le R2 est:",r2_score(real_stock_price, predicted_stock_price))
print("le mse est:",mean_squared_error(real_stock_price, predicted_stock_price))




le R2 est: 0.2762908580501685
le mse est: 1.4671398516511545


In [33]:
result = pd.DataFrame([real_stock_price,predicted_stock_price], columns=["Valeur réelle", "Valeur Predite"])

ValueError: ignored

In [69]:
df_predict = pd.DataFrame(predicted_stock_price, columns=["Prediction"],index = df_test.index)
df_true = pd.DataFrame(real_stock_price, columns=["Valeur"],index = df_test.index)
df_result = pd.concat([df_true,df_predict,df_test.Date],axis= 1)
df_result["error"] = df_result["Prediction"] - df_result["Valeur"]

In [68]:
import plotly.graph_objects as go


# Visu résultats vraies valeurs versus prévisions avec 50 époques
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_result["Date"],y=df_result["Prediction"],name = "Predictions"))
fig.add_trace(go.Scatter(x=df_result["Date"],y=df_result["Valeur"],name = "valeurs"))
fig.show()


In [70]:
px.scatter(df_result,x="Date" , y="error")

## Partie2

In [23]:
def build_LSTM():
  regressor = Sequential() 
  regressor.add(LSTM(units = 100, return_sequences = True, input_shape = (X_train.shape[1], 1)))
  regressor.add(Dropout(0.2))
  regressor.add(LSTM(units = 100, return_sequences = True))
  regressor.add(Dropout(0.2))
  regressor.add(LSTM(units = 100, return_sequences = False))
  regressor.add(Dropout(0.2))
  regressor.add(Dense(units = 1))

  regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')
  regressor.summary()

  return regressor


regressor2 = build_LSTM()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 60, 100)           40800     
_________________________________________________________________
dropout_4 (Dropout)          (None, 60, 100)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 60, 100)           80400     
_________________________________________________________________
dropout_5 (Dropout)          (None, 60, 100)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [24]:
regressor2.fit(X_train, y_train, epochs = 50, batch_size = 32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f31e50ee7d0>

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV 

LSTM_cv = KerasClassifier(build_fn = build_LSTM, batch_size = 10, epochs = 60)

parameters = {'batch_size': [16, 32, 64],
              'epochs': [10, 50, 100],}
grid_search = GridSearchCV(estimator = LSTM_cv,
                           param_grid = parameters,
                           scoring = 'neg_mean_squared_error',
                           cv = 3)
# entrainement 
grid_search = grid_search.fit(X_train, y_train)
# meilleurs résultats 
best_parameters = grid_search.best_params_
best_mse = grid_search.best_score_

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 60, 100)           40800     
_________________________________________________________________
dropout_7 (Dropout)          (None, 60, 100)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 60, 100)           80400     
_________________________________________________________________
dropout_8 (Dropout)          (None, 60, 100)           0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_9 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                


`model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).



Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_10 (LSTM)               (None, 60, 100)           40800     
_________________________________________________________________
dropout_10 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 60, 100)           80400     
_________________________________________________________________
dropout_11 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                


`model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).



Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_13 (LSTM)               (None, 60, 100)           40800     
_________________________________________________________________
dropout_13 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 60, 100)           80400     
_________________________________________________________________
dropout_14 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_15 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                


`model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).



Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_16 (LSTM)               (None, 60, 100)           40800     
_________________________________________________________________
dropout_16 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_17 (LSTM)               (None, 60, 100)           80400     
_________________________________________________________________
dropout_17 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_18 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_18 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                


`model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).



Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_19 (LSTM)               (None, 60, 100)           40800     
_________________________________________________________________
dropout_19 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 60, 100)           80400     
_________________________________________________________________
dropout_20 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_21 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_21 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                


`model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).



Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_22 (LSTM)               (None, 60, 100)           40800     
_________________________________________________________________
dropout_22 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_23 (LSTM)               (None, 60, 100)           80400     
_________________________________________________________________
dropout_23 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_24 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_24 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                


`model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).



Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_25 (LSTM)               (None, 60, 100)           40800     
_________________________________________________________________
dropout_25 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_26 (LSTM)               (None, 60, 100)           80400     
_________________________________________________________________
dropout_26 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_27 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_27 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                


`model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).



Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_28 (LSTM)               (None, 60, 100)           40800     
_________________________________________________________________
dropout_28 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_29 (LSTM)               (None, 60, 100)           80400     
_________________________________________________________________
dropout_29 (Dropout)         (None, 60, 100)           0         
_________________________________________________________________
lstm_30 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_30 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                

In [4]:
best_parameters

NameError: ignored

In [None]:
# prévisions avec fonction predict sur le modèle regressor 
predicted_stock_price2 = regressor2.predict(X_test)
# prévisions sur l'échelle d'orignie avec transformation inverse
predicted_stock_price2 = sc.inverse_transform(predicted_stock_price2)

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

print("le R2 est:",r2_score(real_stock_price, predicted_stock_price2))
print("le mse est:",mean_squared_error(real_stock_price, predicted_stock_price2))

In [None]:
plt.figure(figsize=(8,4))
plt.plot(real_stock_price, color = 'red', label = 'Valeur réelle action')
plt.plot(predicted_stock_price2, color = 'blue', label = 'Valeur prédite action')
plt.title('Prediction valeur action RNN')
plt.xlabel('Time')
plt.ylabel('Prix action')
plt.legend()
plt.show()

## Partie 3

### Volume

In [None]:
Sanofi_data.head()

In [None]:
fig = px.scatter(Sanofi_data, x="Volume", y="Close")
fig.show()

### Saisonalité

In [None]:
Sanofi_data["year"] = Sanofi_data["Date"].dt.year
Sanofi_data["month"] = Sanofi_data["Date"].dt.month
Sanofi_data["week"] = Sanofi_data["Date"].dt.weekofyear
Sanofi_data["day_week"] = Sanofi_data["Date"].dt.dayofweek
Sanofi_data["day_month"] = Sanofi_data["Date"].dt.day
Sanofi_data["day_year"] = Sanofi_data["Date"].dt.dayofyear




In [None]:
list_date = ["year","month","week","day_week","day_month","day_year"]
for date in list_date:
  result = pd.DataFrame(Sanofi_data.groupby(date)["Close"].agg("mean")).reset_index()
  fig = px.line(result, x=date, y="Close")
  fig.show()

**on trouve une saisonalité dans les jours du mois tout les 7 jours**

