# Fin du prétraitement : des inputs de même longueur et prêts pour le CNN

Pour pouvoir exécuter notre réseau de neurone (CNN), nous avons besoin de traiter encore nos données. 

En effet, notre modèle va apprendre les caractéristiques générales de plusieurs vols entre deux évenèments (WW ou SV), pour chaque avion. Cependant, pour que nous puissions effectuer un réseau de neurones, nous avons besoin d'avoir des intervalles "de même longueur", c'est à dire avec le même nombre de vols pour chaque intervalles. Pour l'instant, certains avions ont plus de 9000 vols entre deux évènements tandis que d'autres en ont très peu, et c'est pourquoi nous devons les traiter.

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
#Importation des données pré-traitées par Louise
path_df = r'C:\Users\louis\OneDrive\Documents\ENSAE\2A\Safran\pretraitement.csv'
safran=pd.read_csv(path_df ,sep=',', encoding='latin-1')

In [4]:
safran

Unnamed: 0.1,date,Unnamed: 0,engine_serial_number,engine_family,engine_series,cycles,cycles_counter,egt_margin,var_mot_1,flight_leg_hours,...,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,1,ESN_1,Engine_family_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,...,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,2,ESN_1,Engine_family_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,...,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,3,ESN_1,Engine_family_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,...,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,4,ESN_1,Engine_family_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,...,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,5,ESN_1,Engine_family_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,...,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,2911298,ESN_1369,Engine_family_1,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,...,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,2911299,ESN_1369,Engine_family_1,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,...,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,2911301,ESN_1369,Engine_family_1,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,...,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,2911302,ESN_1369,Engine_family_1,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,...,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


In [5]:
safran.columns

Index(['date', 'Unnamed: 0', 'engine_serial_number', 'engine_family',
       'engine_series', 'cycles', 'cycles_counter', 'egt_margin', 'var_mot_1',
       'flight_leg_hours', 'event_rank', 'egt_slope', 'SV_indicator',
       'SV_rank', 'Config_B_indicator', 'Config_B_rank', 'WW_indicator',
       'WW_rank', 'config_A', 'config_B', 'var_env_1', 'var_env_2',
       'var_env_3', 'var_env_4', 'var_env_5', 'Interpolate_egt_margin',
       'Interpolate_var_mot_1', 'Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', 'Interpolate_egt_slope'],
      dtype='object')

In [6]:
#Pour plus de clarté, je ne garde que les colonnes qui ont déja été travaillées et celles nécessaires au traitement des intervalles
safran_2 = safran[['date', "engine_serial_number",'engine_series', 'cycles', 'cycles_counter','Interpolate_egt_margin',
       'Interpolate_var_mot_1','event_rank', 'config_A', 'config_B','Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', 'Interpolate_egt_slope']]


In [7]:
safran_2

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,0,Config_A_3,Config_B_1,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,0,Config_A_3,Config_B_1,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,0,Config_A_3,Config_B_1,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,0,Config_A_3,Config_B_1,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


## 1) Choix de la taille de l'intervalle

On utilise la variable **event_rank** pour identifier tous les intervalles pendant lequel aucun évènement n'a été effectué.

In [8]:
safran_ER = safran_2[["engine_serial_number", "event_rank", "Interpolate_egt_slope"]]

In [9]:
# On a un .count(), donc on peut prend n'importe quelle colonne pour vérifier combien il y a de vols
safran_ER.groupby(by=["engine_serial_number", "event_rank"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,event_rank,Unnamed: 2_level_1
ESN_1,0,4548
ESN_1,1,644
ESN_1,2,468
ESN_1,3,380
ESN_1,4,2200
...,...,...
ESN_998,1,140
ESN_998,2,32
ESN_999,0,466
ESN_999,1,444


- **Intervalles avec le moins de données**


Regardons maintenant les cas où l'on a le moins de données pour un même intervalle.

In [10]:
safran_ER.groupby(by=["engine_serial_number", "event_rank"]).count().sort_values(by="Interpolate_egt_slope").head(220)

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,event_rank,Unnamed: 2_level_1
ESN_454,5,1
ESN_255,14,1
ESN_35,15,1
ESN_32,17,1
ESN_653,6,1
...,...,...
ESN_1243,1,10
ESN_181,3,10
ESN_137,7,10
ESN_346,6,10


In [11]:
saf_ER_group = safran_ER.groupby(by=["engine_serial_number", "event_rank"]).count().sort_values(by="Interpolate_egt_slope")
saf_ER_group = saf_ER_group.rename(columns = {'Interpolate_egt_slope': 'Nb_vols_entre_event'})

#On a encore engine_serial_number et Interpolate_WW_rank en index, on les enlève
saf_ER_group = saf_ER_group.reset_index()

#On réindexe, pour avoir une colonne avec les identifiants de chaque intervalle, 
#qu'on appelle id_int pour identifiant intervalle
saf_ER_group['id_int'] = saf_ER_group.index
saf_ER_group.sort_values(by="Nb_vols_entre_event")
saf_ER_group

Unnamed: 0,engine_serial_number,event_rank,Nb_vols_entre_event,id_int
0,ESN_454,5,1,0
1,ESN_255,14,1,1
2,ESN_35,15,1,2
3,ESN_32,17,1,3
4,ESN_653,6,1,4
...,...,...,...,...
8434,ESN_1,4,2200,8434
8435,ESN_19,0,2241,8435
8436,ESN_250,8,2275,8436
8437,ESN_1,10,3568,8437


In [12]:
#De cette manière, on peut visualiser les intervalles où le nombre de vols est inférieur à un certain seuil
saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 100)]

Unnamed: 0,engine_serial_number,event_rank,Nb_vols_entre_event,id_int
0,ESN_454,5,1,0
1,ESN_255,14,1,1
2,ESN_35,15,1,2
3,ESN_32,17,1,3
4,ESN_653,6,1,4
...,...,...,...,...
1818,ESN_165,12,99,1818
1819,ESN_146,4,99,1819
1820,ESN_59,2,99,1820
1821,ESN_44,2,99,1821


In [13]:
print("Le nombre d'intervalles de temps avec moins de 25 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 25)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")
print("Le nombre d'intervalles de temps avec moins de 50 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 50)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")
print("Le nombre d'intervalles de temps avec moins de 100 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 100)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")
print("Le nombre d'intervalles de temps avec moins de 150 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 150)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")

Le nombre d'intervalles de temps avec moins de 25 vols correspond à 5.18 % des intervalles
Le nombre d'intervalles de temps avec moins de 50 vols correspond à 9.93 % des intervalles
Le nombre d'intervalles de temps avec moins de 100 vols correspond à 21.6 % des intervalles
Le nombre d'intervalles de temps avec moins de 150 vols correspond à 30.93 % des intervalles


In [14]:
#On merge pour avoir l'identifiant dans la base de données
safran_complete = pd.merge(safran_2, saf_ER_group, left_on = ["engine_serial_number", "event_rank"], right_on = ["engine_serial_number", "event_rank"], how="inner")
safran_complete

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,4548,8438
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,0,Config_A_3,Config_B_1,...,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968,34,576
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,0,Config_A_3,Config_B_1,...,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968,34,576
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,0,Config_A_3,Config_B_1,...,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968,34,576
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,0,Config_A_3,Config_B_1,...,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968,34,576


### 2) L'échantillonnage avec 100 vols par intervalles

Après avoir réfléchi pour conserver à al fois beaucoup d'intervalles différents, mais également le plus de vols possibles au sein de ces intervalles, , on a choisit de conserver les intervalles avec plus de 100 vols.

In [15]:
df_mauvais_100 = safran_complete[safran_complete["Nb_vols_entre_event"].between(0, 99)] # Les vols qu'on supprime
df_keep_100 = safran_complete[~safran_complete["Nb_vols_entre_event"].between(0,99)] #Les vols qu'on garde
df_keep_100
#On conserve uniquement les intervalles où on a plus de 100 dans df_keep_100 données

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,4548,8438
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450219,2022-10-27 03:49:41,ESN_1360,Engine_series_1,254.604500,274,0.919735,0.869409,0,Config_A_3,Config_B_1,...,0.0,0.0,0.064519,-0.368762,0.238182,0.0,0.451006,-0.004884,210,3443
2450220,2022-10-27 12:43:08,ESN_1360,Engine_series_1,256.982600,275,0.686130,0.279454,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.223848,-0.311284,-0.919261,0.0,-1.056965,-0.004884,210,3443
2450221,2022-10-27 15:30:42,ESN_1360,Engine_series_1,257.729600,276,0.721306,0.346712,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.334779,-0.303992,-1.971483,0.0,0.551538,-0.004884,210,3443
2450222,2022-10-27 18:33:59,ESN_1360,Engine_series_1,258.546700,277,0.675019,0.313983,0,Config_A_3,Config_B_1,...,0.0,0.0,1.174560,-0.311284,0.132960,0.0,-2.363873,-0.004884,210,3443


In [16]:
#ATTENTION cette cellule peut prendre 2-3 minutes à s'éxécuter
df_ech_100 = df_keep_100.groupby("id_int").sample(100) #On utilise sample(100) pour avoir 100 données aléatoires par intervalle
df_ech_100

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
568985,2022-10-13 07:36:42,ESN_139,Engine_series_1,4038.203000,3690,-2.410050,-1.017128,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.560787,-0.318391,0.448627,0.0,-1.710419,-0.027761,100,1823
568972,2022-10-10 08:11:06,ESN_139,Engine_series_1,4022.806000,3676,-2.301393,-1.219844,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.484910,-0.355901,1.185182,0.0,-0.805636,-0.027761,100,1823
569047,2022-10-24 07:48:16,ESN_139,Engine_series_1,4138.000000,3757,-2.194132,-0.391638,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.394460,-0.330142,0.764293,0.0,-0.855902,-0.027761,100,1823
568982,2022-10-12 22:09:43,ESN_139,Engine_series_1,4034.904000,3687,-2.395675,-1.043947,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.589914,-0.190762,-0.919261,1.0,1.456320,-0.027761,100,1823
569034,2022-10-21 21:44:03,ESN_139,Engine_series_1,4116.076000,3743,-2.272485,-0.522906,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.373970,-0.318391,0.553849,0.0,-0.705105,-0.027761,100,1823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,2019-04-30 18:54:25,ESN_1,Engine_series_1,27.471388,24,0.551557,-0.161857,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.434365,0.193871,0.448627,0.0,0.903397,-0.029193,4548,8438
1433,2019-06-02 13:23:30,ESN_1,Engine_series_1,308.908825,313,0.318264,0.621961,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.199725,-0.109125,0.659071,0.0,-0.654839,-0.029193,4548,8438
3801,2019-06-14 10:48:29,ESN_1,Engine_series_1,404.279508,410,0.132293,0.288370,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.539523,-0.109125,0.659071,0.0,-1.459090,-0.029193,4548,8438
204,2019-05-23 06:53:45,ESN_1,Engine_series_1,216.902974,221,0.262649,0.207013,0,Config_A_1,Config_B_1,...,0.0,0.0,0.132207,0.193871,0.343405,0.0,-0.101917,-0.029193,4548,8438


In [17]:
print("En effectuant cette méthode, on aura", df_ech_100.shape[0], "lignes dans notre base de données, alors que la base contenait", safran_2.shape[0], "lignes.")

En effectuant cette méthode, on aura 661600 lignes dans notre base de données, alors que la base contenait 2450275 lignes.


In [18]:
df_ech_100.sort_values(by=["id_int", "Interpolate_egt_slope"])

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
568985,2022-10-13 07:36:42,ESN_139,Engine_series_1,4038.203000,3690,-2.410050,-1.017128,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.560787,-0.318391,0.448627,0.0,-1.710419,-0.027761,100,1823
568972,2022-10-10 08:11:06,ESN_139,Engine_series_1,4022.806000,3676,-2.301393,-1.219844,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.484910,-0.355901,1.185182,0.0,-0.805636,-0.027761,100,1823
569047,2022-10-24 07:48:16,ESN_139,Engine_series_1,4138.000000,3757,-2.194132,-0.391638,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.394460,-0.330142,0.764293,0.0,-0.855902,-0.027761,100,1823
568982,2022-10-12 22:09:43,ESN_139,Engine_series_1,4034.904000,3687,-2.395675,-1.043947,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.589914,-0.190762,-0.919261,1.0,1.456320,-0.027761,100,1823
569034,2022-10-21 21:44:03,ESN_139,Engine_series_1,4116.076000,3743,-2.272485,-0.522906,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.373970,-0.318391,0.553849,0.0,-0.705105,-0.027761,100,1823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,2019-04-30 18:54:25,ESN_1,Engine_series_1,27.471388,24,0.551557,-0.161857,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.434365,0.193871,0.448627,0.0,0.903397,-0.029193,4548,8438
1433,2019-06-02 13:23:30,ESN_1,Engine_series_1,308.908825,313,0.318264,0.621961,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.199725,-0.109125,0.659071,0.0,-0.654839,-0.029193,4548,8438
3801,2019-06-14 10:48:29,ESN_1,Engine_series_1,404.279508,410,0.132293,0.288370,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.539523,-0.109125,0.659071,0.0,-1.459090,-0.029193,4548,8438
204,2019-05-23 06:53:45,ESN_1,Engine_series_1,216.902974,221,0.262649,0.207013,0,Config_A_1,Config_B_1,...,0.0,0.0,0.132207,0.193871,0.343405,0.0,-0.101917,-0.029193,4548,8438


### 3) Les derniers traitements numériques

Enfin, nous choisissons quelles varibles éliminer : 
- **engine_family** car dans notre base de donnés, tous les moteurs d'avions sont issus de la même famille de moteur
- **date** : car les informations sur la temporalité sont contenues dans cycle_counter
- **cycle** : pour la même raison que la variable "date"
- **event_rank** : car les informations que nous souhaitions ont déjà servi pour constituer la variable id_int qui identifie tous les intervalles

In [19]:
Y = df_ech_100[["Interpolate_egt_slope", "id_int"]]
X = df_ech_100[["engine_serial_number", "engine_series",
                "cycles_counter",'config_A', 'config_B', "Interpolate_var_mot_1",
                'Interpolate_flight_leg_hours', 'Interpolate_SV_rank','Interpolate_WW_rank',
                'Interpolate_Config_B_rank', 'Interpolate_var_env_1',
                'Interpolate_var_env_2','Interpolate_var_env_3', 
                'Interpolate_var_env_4','Interpolate_var_env_5', "id_int"]] # Cas où on a besoin d'un seul dataframe

In [20]:
X

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
568985,ESN_139,Engine_series_1,3690,Config_A_2,Config_B_4,-1.017128,1.743889,1.0,8.0,6.0,-0.560787,-0.318391,0.448627,0.0,-1.710419,1823
568972,ESN_139,Engine_series_1,3676,Config_A_2,Config_B_4,-1.219844,4.332500,1.0,8.0,6.0,-0.484910,-0.355901,1.185182,0.0,-0.805636,1823
569047,ESN_139,Engine_series_1,3757,Config_A_2,Config_B_4,-0.391638,2.108333,1.0,8.0,6.0,-0.394460,-0.330142,0.764293,0.0,-0.855902,1823
568982,ESN_139,Engine_series_1,3687,Config_A_2,Config_B_4,-1.043947,1.784444,1.0,8.0,6.0,-0.589914,-0.190762,-0.919261,1.0,1.456320,1823
569034,ESN_139,Engine_series_1,3743,Config_A_2,Config_B_4,-0.522906,1.672222,1.0,8.0,6.0,-0.373970,-0.318391,0.553849,0.0,-0.705105,1823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,ESN_1,Engine_series_1,24,Config_A_1,Config_B_1,-0.161857,0.978056,0.0,0.0,0.0,-0.434365,0.193871,0.448627,0.0,0.903397,8438
1433,ESN_1,Engine_series_1,313,Config_A_1,Config_B_1,0.621961,1.209167,0.0,0.0,0.0,-0.199725,-0.109125,0.659071,0.0,-0.654839,8438
3801,ESN_1,Engine_series_1,410,Config_A_1,Config_B_1,0.288370,0.649722,0.0,0.0,0.0,-0.539523,-0.109125,0.659071,0.0,-1.459090,8438
204,ESN_1,Engine_series_1,221,Config_A_1,Config_B_1,0.207013,0.845833,0.0,0.0,0.0,0.132207,0.193871,0.343405,0.0,-0.101917,8438


In [21]:
X_float=X.copy()

In [22]:
X_float.loc[:,"engine_serial_number"] = X['engine_serial_number'].str[4:]
X_float.loc[:,'engine_serial_number'] = X['engine_serial_number'].str[4:]
X_float.loc[:,'engine_series'] = X['engine_series'].str[14:]
X_float.loc[:,'config_A'] = X['config_A'].str[9:]
X_float.loc[:,'config_B'] = X['config_B'].str[9:]
X_float= X_float.astype('float')

In [23]:
X_float

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
568985,139.0,1.0,3690.0,2.0,4.0,-1.017128,1.743889,1.0,8.0,6.0,-0.560787,-0.318391,0.448627,0.0,-1.710419,1823.0
568972,139.0,1.0,3676.0,2.0,4.0,-1.219844,4.332500,1.0,8.0,6.0,-0.484910,-0.355901,1.185182,0.0,-0.805636,1823.0
569047,139.0,1.0,3757.0,2.0,4.0,-0.391638,2.108333,1.0,8.0,6.0,-0.394460,-0.330142,0.764293,0.0,-0.855902,1823.0
568982,139.0,1.0,3687.0,2.0,4.0,-1.043947,1.784444,1.0,8.0,6.0,-0.589914,-0.190762,-0.919261,1.0,1.456320,1823.0
569034,139.0,1.0,3743.0,2.0,4.0,-0.522906,1.672222,1.0,8.0,6.0,-0.373970,-0.318391,0.553849,0.0,-0.705105,1823.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,1.0,1.0,24.0,1.0,1.0,-0.161857,0.978056,0.0,0.0,0.0,-0.434365,0.193871,0.448627,0.0,0.903397,8438.0
1433,1.0,1.0,313.0,1.0,1.0,0.621961,1.209167,0.0,0.0,0.0,-0.199725,-0.109125,0.659071,0.0,-0.654839,8438.0
3801,1.0,1.0,410.0,1.0,1.0,0.288370,0.649722,0.0,0.0,0.0,-0.539523,-0.109125,0.659071,0.0,-1.459090,8438.0
204,1.0,1.0,221.0,1.0,1.0,0.207013,0.845833,0.0,0.0,0.0,0.132207,0.193871,0.343405,0.0,-0.101917,8438.0


In [24]:
# On normalise toutes les colonnes pour ne pas biaiser l'apprentissage de notre réseau de neurones
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_scale = X_float.copy()

X_scale[['engine_serial_number','engine_series','cycles_counter',
                 'config_A','config_B',"Interpolate_var_mot_1",'Interpolate_flight_leg_hours',
                 'Interpolate_SV_rank','Interpolate_Config_B_rank',
                 'Interpolate_var_env_1','Interpolate_var_env_2',
                 'Interpolate_var_env_3','Interpolate_var_env_4','Interpolate_var_env_5']] = scaler.fit_transform(
    X_scale[['engine_serial_number','engine_series','cycles_counter',
                 'config_A','config_B',"Interpolate_var_mot_1",'Interpolate_flight_leg_hours',
                 'Interpolate_SV_rank','Interpolate_Config_B_rank',
                 'Interpolate_var_env_1','Interpolate_var_env_2',
                 'Interpolate_var_env_3','Interpolate_var_env_4','Interpolate_var_env_5']])

X_scale

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
568985,0.101545,0.0,0.481032,0.333333,1.0,0.346994,0.152601,0.333333,8.0,0.6,0.000854,0.002476,0.635294,0.000000,0.073413,1823.0
568972,0.101545,0.0,0.479207,0.333333,1.0,0.314448,0.328147,0.333333,8.0,0.6,0.002172,0.000734,0.717647,0.000000,0.109127,1823.0
569047,0.101545,0.0,0.489767,0.333333,1.0,0.447416,0.177316,0.333333,8.0,0.6,0.003743,0.001930,0.670588,0.000000,0.107143,1823.0
568982,0.101545,0.0,0.480641,0.333333,1.0,0.342688,0.155352,0.333333,8.0,0.6,0.000348,0.008403,0.482353,0.333333,0.198413,1823.0
569034,0.101545,0.0,0.487942,0.333333,1.0,0.426341,0.147741,0.333333,8.0,0.6,0.004099,0.002476,0.647059,0.000000,0.113095,1823.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,0.000000,0.0,0.003129,0.000000,0.0,0.484307,0.100667,0.000000,0.0,0.0,0.003050,0.026264,0.635294,0.000000,0.176587,8438.0
1433,0.000000,0.0,0.040803,0.000000,0.0,0.610149,0.116340,0.000000,0.0,0.0,0.007125,0.012194,0.658824,0.000000,0.115079,8438.0
3801,0.000000,0.0,0.053448,0.000000,0.0,0.556591,0.078401,0.000000,0.0,0.0,0.001224,0.012194,0.658824,0.000000,0.083333,8438.0
204,0.000000,0.0,0.028810,0.000000,0.0,0.543529,0.091700,0.000000,0.0,0.0,0.012890,0.026264,0.623529,0.000000,0.136905,8438.0


In [25]:
Y_doublon = Y.copy()

In [26]:
Y_doublon.drop_duplicates(subset = "id_int",keep = 'first', inplace=True)

In [27]:
Y_doublon

Unnamed: 0,Interpolate_egt_slope,id_int
568985,-0.027761,1823
2358436,-0.025889,1824
1910439,-0.034384,1825
225297,-0.019224,1826
2344976,-0.047450,1827
...,...,...
7399,-0.010711,8434
103056,-0.004308,8435
927264,-0.061415,8436
14895,-0.006345,8437


On a 6616 lignes, donc aucun doublon. Les bases peuvent être téléchargées.

In [None]:
#On télécharge nos base X et Y en csv
X_scale.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/X.csv')
Y.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/Y.csv')

In [33]:
# On télécharge également la base complete composée de X et de Y.
X_Y = pd.merge(X_scale,Y_doublon)

In [36]:
X_Y

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int,Interpolate_egt_slope
0,0.101545,0.0,0.481032,0.333333,1.0,0.346994,0.152601,0.333333,8.0,0.6,0.000854,0.002476,0.635294,0.000000,0.073413,1823.0,-0.027761
1,0.101545,0.0,0.479207,0.333333,1.0,0.314448,0.328147,0.333333,8.0,0.6,0.002172,0.000734,0.717647,0.000000,0.109127,1823.0,-0.027761
2,0.101545,0.0,0.489767,0.333333,1.0,0.447416,0.177316,0.333333,8.0,0.6,0.003743,0.001930,0.670588,0.000000,0.107143,1823.0,-0.027761
3,0.101545,0.0,0.480641,0.333333,1.0,0.342688,0.155352,0.333333,8.0,0.6,0.000348,0.008403,0.482353,0.333333,0.198413,1823.0,-0.027761
4,0.101545,0.0,0.487942,0.333333,1.0,0.426341,0.147741,0.333333,8.0,0.6,0.004099,0.002476,0.647059,0.000000,0.113095,1823.0,-0.027761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661595,0.000000,0.0,0.003129,0.000000,0.0,0.484307,0.100667,0.000000,0.0,0.0,0.003050,0.026264,0.635294,0.000000,0.176587,8438.0,-0.029193
661596,0.000000,0.0,0.040803,0.000000,0.0,0.610149,0.116340,0.000000,0.0,0.0,0.007125,0.012194,0.658824,0.000000,0.115079,8438.0,-0.029193
661597,0.000000,0.0,0.053448,0.000000,0.0,0.556591,0.078401,0.000000,0.0,0.0,0.001224,0.012194,0.658824,0.000000,0.083333,8438.0,-0.029193
661598,0.000000,0.0,0.028810,0.000000,0.0,0.543529,0.091700,0.000000,0.0,0.0,0.012890,0.026264,0.623529,0.000000,0.136905,8438.0,-0.029193


In [None]:
X_Y.to_csv(r'C:\Users\louis\OneDrive\Documents\ENSAE\2A\Safran\X_Y.csv')

## Prétraitement pour le CNN multihead 

On normalise de la même façon que précedemment, sauf pour les variables environnementales qui seront traitées différemment. 

In [29]:
X_float

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
568985,139.0,1.0,3690.0,2.0,4.0,-1.017128,1.743889,1.0,8.0,6.0,-0.560787,-0.318391,0.448627,0.0,-1.710419,1823.0
568972,139.0,1.0,3676.0,2.0,4.0,-1.219844,4.332500,1.0,8.0,6.0,-0.484910,-0.355901,1.185182,0.0,-0.805636,1823.0
569047,139.0,1.0,3757.0,2.0,4.0,-0.391638,2.108333,1.0,8.0,6.0,-0.394460,-0.330142,0.764293,0.0,-0.855902,1823.0
568982,139.0,1.0,3687.0,2.0,4.0,-1.043947,1.784444,1.0,8.0,6.0,-0.589914,-0.190762,-0.919261,1.0,1.456320,1823.0
569034,139.0,1.0,3743.0,2.0,4.0,-0.522906,1.672222,1.0,8.0,6.0,-0.373970,-0.318391,0.553849,0.0,-0.705105,1823.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,1.0,1.0,24.0,1.0,1.0,-0.161857,0.978056,0.0,0.0,0.0,-0.434365,0.193871,0.448627,0.0,0.903397,8438.0
1433,1.0,1.0,313.0,1.0,1.0,0.621961,1.209167,0.0,0.0,0.0,-0.199725,-0.109125,0.659071,0.0,-0.654839,8438.0
3801,1.0,1.0,410.0,1.0,1.0,0.288370,0.649722,0.0,0.0,0.0,-0.539523,-0.109125,0.659071,0.0,-1.459090,8438.0
204,1.0,1.0,221.0,1.0,1.0,0.207013,0.845833,0.0,0.0,0.0,0.132207,0.193871,0.343405,0.0,-0.101917,8438.0


In [31]:
scaler = MinMaxScaler()

X_scale_multihead = X_float.copy()

X_scale_multihead[['engine_serial_number','engine_series','cycles_counter',
                 'config_A','config_B',"Interpolate_var_mot_1",'Interpolate_flight_leg_hours',
                 'Interpolate_SV_rank','Interpolate_Config_B_rank']] = scaler.fit_transform(
    X_scale_multihead[['engine_serial_number','engine_series','cycles_counter',
                 'config_A','config_B',"Interpolate_var_mot_1",'Interpolate_flight_leg_hours',
                 'Interpolate_SV_rank','Interpolate_Config_B_rank']])

X_scale_multihead

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
568985,0.101545,0.0,0.481032,0.333333,1.0,0.346994,0.152601,0.333333,8.0,0.6,-0.560787,-0.318391,0.448627,0.0,-1.710419,1823.0
568972,0.101545,0.0,0.479207,0.333333,1.0,0.314448,0.328147,0.333333,8.0,0.6,-0.484910,-0.355901,1.185182,0.0,-0.805636,1823.0
569047,0.101545,0.0,0.489767,0.333333,1.0,0.447416,0.177316,0.333333,8.0,0.6,-0.394460,-0.330142,0.764293,0.0,-0.855902,1823.0
568982,0.101545,0.0,0.480641,0.333333,1.0,0.342688,0.155352,0.333333,8.0,0.6,-0.589914,-0.190762,-0.919261,1.0,1.456320,1823.0
569034,0.101545,0.0,0.487942,0.333333,1.0,0.426341,0.147741,0.333333,8.0,0.6,-0.373970,-0.318391,0.553849,0.0,-0.705105,1823.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,0.000000,0.0,0.003129,0.000000,0.0,0.484307,0.100667,0.000000,0.0,0.0,-0.434365,0.193871,0.448627,0.0,0.903397,8438.0
1433,0.000000,0.0,0.040803,0.000000,0.0,0.610149,0.116340,0.000000,0.0,0.0,-0.199725,-0.109125,0.659071,0.0,-0.654839,8438.0
3801,0.000000,0.0,0.053448,0.000000,0.0,0.556591,0.078401,0.000000,0.0,0.0,-0.539523,-0.109125,0.659071,0.0,-1.459090,8438.0
204,0.000000,0.0,0.028810,0.000000,0.0,0.543529,0.091700,0.000000,0.0,0.0,0.132207,0.193871,0.343405,0.0,-0.101917,8438.0


In [34]:
X_Y_multihead = pd.merge(X_scale_multihead,Y_doublon)

In [35]:
X_Y_multihead

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int,Interpolate_egt_slope
0,0.101545,0.0,0.481032,0.333333,1.0,0.346994,0.152601,0.333333,8.0,0.6,-0.560787,-0.318391,0.448627,0.0,-1.710419,1823.0,-0.027761
1,0.101545,0.0,0.479207,0.333333,1.0,0.314448,0.328147,0.333333,8.0,0.6,-0.484910,-0.355901,1.185182,0.0,-0.805636,1823.0,-0.027761
2,0.101545,0.0,0.489767,0.333333,1.0,0.447416,0.177316,0.333333,8.0,0.6,-0.394460,-0.330142,0.764293,0.0,-0.855902,1823.0,-0.027761
3,0.101545,0.0,0.480641,0.333333,1.0,0.342688,0.155352,0.333333,8.0,0.6,-0.589914,-0.190762,-0.919261,1.0,1.456320,1823.0,-0.027761
4,0.101545,0.0,0.487942,0.333333,1.0,0.426341,0.147741,0.333333,8.0,0.6,-0.373970,-0.318391,0.553849,0.0,-0.705105,1823.0,-0.027761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661595,0.000000,0.0,0.003129,0.000000,0.0,0.484307,0.100667,0.000000,0.0,0.0,-0.434365,0.193871,0.448627,0.0,0.903397,8438.0,-0.029193
661596,0.000000,0.0,0.040803,0.000000,0.0,0.610149,0.116340,0.000000,0.0,0.0,-0.199725,-0.109125,0.659071,0.0,-0.654839,8438.0,-0.029193
661597,0.000000,0.0,0.053448,0.000000,0.0,0.556591,0.078401,0.000000,0.0,0.0,-0.539523,-0.109125,0.659071,0.0,-1.459090,8438.0,-0.029193
661598,0.000000,0.0,0.028810,0.000000,0.0,0.543529,0.091700,0.000000,0.0,0.0,0.132207,0.193871,0.343405,0.0,-0.101917,8438.0,-0.029193


In [37]:
X_Y_multihead.to_csv(r'C:\Users\louis\OneDrive\Documents\ENSAE\2A\Safran\X_Y_multihead.csv')

# Préparation de l'input du CNN 

Une dernière étape de prétraitement est necessaire pour que nos données puissent entrer dans le réseau de neurones. Nous souhaitons diviser notre dataframe en 6 616 échantillons de 100 vols. De la même façon que pour une image, un échantillon est une matrice de dimensions (100,16) associée à la variable expliquée : la pente de la marge EGT. 

In [50]:
# Importation des packages 
from numpy import array, hstack

In [52]:
# On importe la base effectuée dans la partie précedente (pour ne pas relancer tout le notebook)
X_Y = pd.read_csv(r'C:\Users\louis\OneDrive\Documents\ENSAE\2A\Safran\X_Y.csv')

In [53]:
X_Y = X_Y.drop('Unnamed: 0', axis= 1)

On crée une fonction permettant de scinder un dataframe en échantillon de taille n_steps

In [55]:
def split_sequences(sequences, n_steps) : 
    X, y = list(), list()
    n_iteration = int(len(sequences)/n_steps)
    if len(sequences)%n_steps != 0 : 
        print("Attention, modulo différent de zero")
    for i in range(n_iteration): 
        seq_x, seq_y = sequences [i*n_steps :(i+1)*n_steps, :-1 ], sequences[i*n_steps ,-1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)    

In [56]:
# Application de la fonction 
X, y = split_sequences(X_Y.values, 100)

In [57]:
print(X.shape, y.shape)

(6616, 100, 16) (6616,)


In [61]:
# Pour exemple, on associe aux trois premiers échantillons leur valeur de pente de la marge EGT
for i in range(3):
    print(X[i], y[i])

[[1.01545254e-01 0.00000000e+00 4.79728849e-01 ... 0.00000000e+00
  1.84317784e-01 1.82300000e+03]
 [1.01545254e-01 0.00000000e+00 4.81814627e-01 ... 0.00000000e+00
  1.11111121e-01 1.82300000e+03]
 [1.01545254e-01 0.00000000e+00 4.86898709e-01 ... 0.00000000e+00
  9.44443767e-02 1.82300000e+03]
 ...
 [1.01545254e-01 0.00000000e+00 4.78946682e-01 ... 0.00000000e+00
  1.56746042e-01 1.82300000e+03]
 [1.01545254e-01 0.00000000e+00 4.82466432e-01 ... 0.00000000e+00
  1.26562635e-01 1.82300000e+03]
 [1.01545254e-01 0.00000000e+00 4.91461348e-01 ... 0.00000000e+00
  1.36904772e-01 1.82300000e+03]] -0.02776137
[[7.66004415e-01 0.00000000e+00 1.30621822e-01 ... 3.33333333e-01
  1.84523808e-01 1.82400000e+03]
 [7.66004415e-01 0.00000000e+00 1.38313127e-01 ... 0.00000000e+00
  1.98412701e-01 1.82400000e+03]
 [7.66004415e-01 0.00000000e+00 1.32186156e-01 ... 0.00000000e+00
  1.28968264e-01 1.82400000e+03]
 ...
 [7.66004415e-01 0.00000000e+00 1.31143267e-01 ... 0.00000000e+00
  1.52777788e-01 1.8

On échantillonne pour obtenir une base d'entrainement et de test 

In [63]:
from sklearn.model_selection import train_test_split

In [64]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2)

On teste sur un premier CNN pour vérifier :

In [65]:
# Importation des packages de deep learning 
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential 
from tensorflow.keras.layers import (Conv1D, Dense,Dropout, Flatten,MaxPooling1D)
from tensorflow.keras import optimizers
from tensorflow.keras import models

In [66]:
# define model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(100, X.shape[2])))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

In [67]:
history = model.fit(X_train, Y_train, epochs=10, validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [84]:
# On prédit la première valeur de la base de test pour exemple 
x_input = X_test[0]
x_input = x_input.reshape((1, 100, 16))

In [85]:
model.predict(x_input)

array([[-0.0851602]], dtype=float32)

In [74]:
# Comparaison avec la vrai valeur : 
Y_test[0]

-0.04304608

In [89]:
# On prédit les trois premières valeurs de la base test 
model.predict(X_test[0:3])

array([[-0.08522124],
       [ 0.04331117],
       [-0.05765624]], dtype=float32)

In [90]:
# On compare avec les vraies valeurs :
Y_test[0:3]

array([-0.04304608, -0.00672543, -0.2536079 ])