# Fin du prétraitement : des inputs de même longueur et prêts pour le CNN

Pour pouvoir exécuter notre réseau de neurone (CNN), nous avons besoin de traiter encore nos données. 

En effet, notre modèle va apprendre les caractéristiques générales de plusieurs vols entre deux évenèments (WW ou SV), pour chaque avion. Cependant, pour que nous puissions effectuer un réseau de neurones, nous avons besoin d'avoir des intervalles "de même longueur", c'est à dire avec le même nombre de vols pour chaque intervalles. Pour l'instant, certains avions ont plus de 9000 vols entre deux évènements tandis que d'autres en ont très peu, et c'est pourquoi nous devons les traiter.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
#Importation des données pré-traitées par Louise
path_df = r'D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/pretraitement.csv'
safran=pd.read_csv(path_df ,sep=',', encoding='latin-1')

In [3]:
safran

Unnamed: 0.1,date,Unnamed: 0,engine_serial_number,engine_family,engine_series,cycles,cycles_counter,egt_margin,var_mot_1,flight_leg_hours,...,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,1,ESN_1,Engine_family_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,...,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,2,ESN_1,Engine_family_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,...,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,3,ESN_1,Engine_family_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,...,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,4,ESN_1,Engine_family_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,...,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,5,ESN_1,Engine_family_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,...,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,2911298,ESN_1369,Engine_family_1,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,...,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,2911299,ESN_1369,Engine_family_1,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,...,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,2911301,ESN_1369,Engine_family_1,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,...,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,2911302,ESN_1369,Engine_family_1,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,...,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


In [4]:
safran.columns

Index(['date', 'Unnamed: 0', 'engine_serial_number', 'engine_family',
       'engine_series', 'cycles', 'cycles_counter', 'egt_margin', 'var_mot_1',
       'flight_leg_hours', 'event_rank', 'egt_slope', 'SV_indicator',
       'SV_rank', 'Config_B_indicator', 'Config_B_rank', 'WW_indicator',
       'WW_rank', 'config_A', 'config_B', 'var_env_1', 'var_env_2',
       'var_env_3', 'var_env_4', 'var_env_5', 'Interpolate_egt_margin',
       'Interpolate_var_mot_1', 'Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', 'Interpolate_egt_slope'],
      dtype='object')

In [5]:
#Pour plus de clarté, je ne garde que les colonnes qui ont déja été travaillées et celles nécessaires au traitement des intervalles
safran_2 = safran[['date', "engine_serial_number",'engine_series', 'cycles', 'cycles_counter','Interpolate_egt_margin',
       'Interpolate_var_mot_1','event_rank', 'config_A', 'config_B','Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', 'Interpolate_egt_slope']]


In [6]:
safran_2

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,0,Config_A_3,Config_B_1,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,0,Config_A_3,Config_B_1,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,0,Config_A_3,Config_B_1,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,0,Config_A_3,Config_B_1,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


## 1) Choix de la taille de l'intervalle

On utilise la variable **event_rank** pour identifier tous les intervalles pendant lequel aucun évènement n'a été effectué.

In [7]:
safran_ER = safran_2[["engine_serial_number", "event_rank", "Interpolate_egt_slope"]]

In [8]:
# On a un .count(), donc on peut prend n'importe quelle colonne pour vérifier combien il y a de vols
safran_ER.groupby(by=["engine_serial_number", "event_rank"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,event_rank,Unnamed: 2_level_1
ESN_1,0,4548
ESN_1,1,644
ESN_1,2,468
ESN_1,3,380
ESN_1,4,2200
...,...,...
ESN_998,1,140
ESN_998,2,32
ESN_999,0,466
ESN_999,1,444


- **Intervalles avec le moins de données**


Regardons maintenant les cas où l'on a le moins de données pour un même intervalle.

In [9]:
safran_ER.groupby(by=["engine_serial_number", "event_rank"]).count().sort_values(by="Interpolate_egt_slope").head(220)

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,event_rank,Unnamed: 2_level_1
ESN_454,5,1
ESN_255,14,1
ESN_35,15,1
ESN_32,17,1
ESN_653,6,1
...,...,...
ESN_1243,1,10
ESN_181,3,10
ESN_137,7,10
ESN_346,6,10


In [10]:
saf_ER_group = safran_ER.groupby(by=["engine_serial_number", "event_rank"]).count().sort_values(by="Interpolate_egt_slope")
saf_ER_group = saf_ER_group.rename(columns = {'Interpolate_egt_slope': 'Nb_vols_entre_event'})

#On a encore engine_serial_number et Interpolate_WW_rank en index, on les enlève
saf_ER_group = saf_ER_group.reset_index()

#On réindexe, pour avoir une colonne avec les identifiants de chaque intervalle, 
#qu'on appelle id_int pour identifiant intervalle
saf_ER_group['id_int'] = saf_ER_group.index
saf_ER_group.sort_values(by="Nb_vols_entre_event")
saf_ER_group

Unnamed: 0,engine_serial_number,event_rank,Nb_vols_entre_event,id_int
0,ESN_454,5,1,0
1,ESN_255,14,1,1
2,ESN_35,15,1,2
3,ESN_32,17,1,3
4,ESN_653,6,1,4
...,...,...,...,...
8434,ESN_1,4,2200,8434
8435,ESN_19,0,2241,8435
8436,ESN_250,8,2275,8436
8437,ESN_1,10,3568,8437


In [11]:
#De cette manière, on peut visualiser les intervalles où le nombre de vols est inférieur à un certain seuil
saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 100)]

Unnamed: 0,engine_serial_number,event_rank,Nb_vols_entre_event,id_int
0,ESN_454,5,1,0
1,ESN_255,14,1,1
2,ESN_35,15,1,2
3,ESN_32,17,1,3
4,ESN_653,6,1,4
...,...,...,...,...
1818,ESN_165,12,99,1818
1819,ESN_146,4,99,1819
1820,ESN_59,2,99,1820
1821,ESN_44,2,99,1821


In [12]:
print("Le nombre d'intervalles de temps avec moins de 25 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 25)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")
print("Le nombre d'intervalles de temps avec moins de 50 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 50)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")
print("Le nombre d'intervalles de temps avec moins de 100 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 100)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")
print("Le nombre d'intervalles de temps avec moins de 150 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 150)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")

Le nombre d'intervalles de temps avec moins de 25 vols correspond à 5.18 % des intervalles
Le nombre d'intervalles de temps avec moins de 50 vols correspond à 9.93 % des intervalles
Le nombre d'intervalles de temps avec moins de 100 vols correspond à 21.6 % des intervalles
Le nombre d'intervalles de temps avec moins de 150 vols correspond à 30.93 % des intervalles


In [13]:
#On merge pour avoir l'identifiant dans la base de données
safran_complete = pd.merge(safran_2, saf_ER_group, left_on = ["engine_serial_number", "event_rank"], right_on = ["engine_serial_number", "event_rank"], how="inner")
safran_complete

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,4548,8438
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,0,Config_A_3,Config_B_1,...,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968,34,576
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,0,Config_A_3,Config_B_1,...,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968,34,576
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,0,Config_A_3,Config_B_1,...,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968,34,576
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,0,Config_A_3,Config_B_1,...,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968,34,576


### 2) L'échantillonnage avec 100 vols par intervalles

Après avoir réfléchi pour conserver à al fois beaucoup d'intervalles différents, mais également le plus de vols possibles au sein de ces intervalles, , on a choisit de conserver les intervalles avec plus de 100 vols.

In [14]:
df_mauvais_100 = safran_complete[safran_complete["Nb_vols_entre_event"].between(0, 99)] # Les vols qu'on supprime
df_keep_100 = safran_complete[~safran_complete["Nb_vols_entre_event"].between(0,99)] #Les vols qu'on garde
df_keep_100
#On conserve uniquement les intervalles où on a plus de 100 dans df_keep_100 données

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,4548,8438
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450219,2022-10-27 03:49:41,ESN_1360,Engine_series_1,254.604500,274,0.919735,0.869409,0,Config_A_3,Config_B_1,...,0.0,0.0,0.064519,-0.368762,0.238182,0.0,0.451006,-0.004884,210,3443
2450220,2022-10-27 12:43:08,ESN_1360,Engine_series_1,256.982600,275,0.686130,0.279454,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.223848,-0.311284,-0.919261,0.0,-1.056965,-0.004884,210,3443
2450221,2022-10-27 15:30:42,ESN_1360,Engine_series_1,257.729600,276,0.721306,0.346712,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.334779,-0.303992,-1.971483,0.0,0.551538,-0.004884,210,3443
2450222,2022-10-27 18:33:59,ESN_1360,Engine_series_1,258.546700,277,0.675019,0.313983,0,Config_A_3,Config_B_1,...,0.0,0.0,1.174560,-0.311284,0.132960,0.0,-2.363873,-0.004884,210,3443


In [15]:
#ATTENTION cette cellule peut prendre 2-3 minutes à s'éxécuter
df_ech_100 = df_keep_100.groupby("id_int").sample(100) #On utilise sample(100) pour avoir 100 données aléatoires par intervalle
df_ech_100

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
568988,2022-10-14 06:34:37,ESN_139,Engine_series_1,4041.503000,3693,-2.444986,-0.951950,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.593696,-0.190762,-0.814039,0.0,0.350475,-0.027761,100,1823
568978,2022-10-12 03:34:53,ESN_139,Engine_series_1,4030.505000,3683,-2.367410,-1.096681,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.279419,-0.190762,-1.024483,2.0,1.456320,-0.027761,100,1823
569058,2022-10-26 09:41:22,ESN_139,Engine_series_1,4155.227000,3768,-2.126824,-0.278873,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.540540,-0.190762,-0.182706,0.0,-0.252714,-0.027761,100,1823
569006,2022-10-17 09:50:00,ESN_139,Engine_series_1,4069.094000,3713,-2.418072,-0.766817,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.479706,-0.348270,0.974738,0.0,-0.504042,-0.027761,100,1823
568994,2022-10-15 05:48:25,ESN_139,Engine_series_1,4049.202000,3700,-2.480325,-0.886022,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.565708,-0.318391,0.869515,0.0,-1.408825,-0.027761,100,1823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,2019-05-01 03:22:01,ESN_1,Engine_series_1,29.613500,26,0.711334,0.625227,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.384528,0.193871,0.343405,0.0,0.903397,-0.029193,4548,8438
1737,2019-07-09 08:10:46,ESN_1,Engine_series_1,618.140631,624,0.036045,0.727460,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.322119,0.197381,0.238183,0.0,-0.705105,-0.029193,4548,8438
2229,2019-10-04 21:51:58,ESN_1,Engine_series_1,1139.109563,1148,-1.125274,-3.005082,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.353384,-0.229253,-2.708038,0.0,1.104460,-0.029193,4548,8438
493,2019-06-26 14:12:11,ESN_1,Engine_series_1,507.081403,514,0.027476,0.859739,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.589267,-0.026220,-0.077484,0.0,-1.961747,-0.029193,4548,8438


In [16]:
print("En effectuant cette méthode, on aura", df_ech_100.shape[0], "lignes dans notre base de données, alors que la base contenait", safran_2.shape[0], "lignes.")

En effectuant cette méthode, on aura 661600 lignes dans notre base de données, alors que la base contenait 2450275 lignes.


In [17]:
df_ech_100.sort_values(by=["id_int", "Interpolate_egt_slope"])

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
568988,2022-10-14 06:34:37,ESN_139,Engine_series_1,4041.503000,3693,-2.444986,-0.951950,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.593696,-0.190762,-0.814039,0.0,0.350475,-0.027761,100,1823
568978,2022-10-12 03:34:53,ESN_139,Engine_series_1,4030.505000,3683,-2.367410,-1.096681,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.279419,-0.190762,-1.024483,2.0,1.456320,-0.027761,100,1823
569058,2022-10-26 09:41:22,ESN_139,Engine_series_1,4155.227000,3768,-2.126824,-0.278873,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.540540,-0.190762,-0.182706,0.0,-0.252714,-0.027761,100,1823
569006,2022-10-17 09:50:00,ESN_139,Engine_series_1,4069.094000,3713,-2.418072,-0.766817,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.479706,-0.348270,0.974738,0.0,-0.504042,-0.027761,100,1823
568994,2022-10-15 05:48:25,ESN_139,Engine_series_1,4049.202000,3700,-2.480325,-0.886022,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.565708,-0.318391,0.869515,0.0,-1.408825,-0.027761,100,1823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,2019-05-01 03:22:01,ESN_1,Engine_series_1,29.613500,26,0.711334,0.625227,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.384528,0.193871,0.343405,0.0,0.903397,-0.029193,4548,8438
1737,2019-07-09 08:10:46,ESN_1,Engine_series_1,618.140631,624,0.036045,0.727460,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.322119,0.197381,0.238183,0.0,-0.705105,-0.029193,4548,8438
2229,2019-10-04 21:51:58,ESN_1,Engine_series_1,1139.109563,1148,-1.125274,-3.005082,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.353384,-0.229253,-2.708038,0.0,1.104460,-0.029193,4548,8438
493,2019-06-26 14:12:11,ESN_1,Engine_series_1,507.081403,514,0.027476,0.859739,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.589267,-0.026220,-0.077484,0.0,-1.961747,-0.029193,4548,8438


### 3) Les derniers traitements numériques

Enfin, nous choisissons quelles varibles éliminer : 
- **engine_family** car dans notre base de donnés, tous les moteurs d'avions sont issus de la même famille de moteur
- **date** : car les informations sur la temporalité sont contenues dans cycle_counter
- **cycle** : pour la même raison que la variable "date"
- **event_rank** : car les informations que nous souhaitions ont déjà servi pour constituer la variable id_int qui identifie tous les intervalles

In [18]:
Y = df_ech_100[["Interpolate_egt_slope", "id_int"]]
X = df_ech_100[["engine_serial_number", "engine_series",
                "cycles_counter",'config_A', 'config_B', "Interpolate_var_mot_1",
                'Interpolate_flight_leg_hours', 'Interpolate_SV_rank','Interpolate_WW_rank',
                'Interpolate_Config_B_rank', 'Interpolate_var_env_1',
                'Interpolate_var_env_2','Interpolate_var_env_3', 
                'Interpolate_var_env_4','Interpolate_var_env_5', "id_int"]] # Cas où on a besoin d'un seul dataframe

In [19]:
X

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
568988,ESN_139,Engine_series_1,3693,Config_A_2,Config_B_4,-0.951950,1.084444,1.0,8.0,6.0,-0.593696,-0.190762,-0.814039,0.0,0.350475,1823
568978,ESN_139,Engine_series_1,3683,Config_A_2,Config_B_4,-1.096681,2.155000,1.0,8.0,6.0,-0.279419,-0.190762,-1.024483,2.0,1.456320,1823
569058,ESN_139,Engine_series_1,3768,Config_A_2,Config_B_4,-0.278873,1.764722,1.0,8.0,6.0,-0.540540,-0.190762,-0.182706,0.0,-0.252714,1823
569006,ESN_139,Engine_series_1,3713,Config_A_2,Config_B_4,-0.766817,0.912778,1.0,8.0,6.0,-0.479706,-0.348270,0.974738,0.0,-0.504042,1823
568994,ESN_139,Engine_series_1,3700,Config_A_2,Config_B_4,-0.886022,1.796111,1.0,8.0,6.0,-0.565708,-0.318391,0.869515,0.0,-1.408825,1823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,ESN_1,Engine_series_1,26,Config_A_1,Config_B_1,0.625227,0.838333,0.0,0.0,0.0,-0.384528,0.193871,0.343405,0.0,0.903397,8438
1737,ESN_1,Engine_series_1,624,Config_A_1,Config_B_1,0.727460,0.705278,0.0,0.0,0.0,-0.322119,0.197381,0.238183,0.0,-0.705105,8438
2229,ESN_1,Engine_series_1,1148,Config_A_1,Config_B_1,-3.005082,0.627500,0.0,0.0,0.0,-0.353384,-0.229253,-2.708038,0.0,1.104460,8438
493,ESN_1,Engine_series_1,514,Config_A_1,Config_B_1,0.859739,0.888056,0.0,0.0,0.0,-0.589267,-0.026220,-0.077484,0.0,-1.961747,8438


In [20]:
X_float=X.copy()

In [21]:
X_float.loc[:,"engine_serial_number"] = X['engine_serial_number'].str[4:]
X_float.loc[:,'engine_serial_number'] = X['engine_serial_number'].str[4:]
X_float.loc[:,'engine_series'] = X['engine_series'].str[14:]
X_float.loc[:,'config_A'] = X['config_A'].str[9:]
X_float.loc[:,'config_B'] = X['config_B'].str[9:]
X_float= X_float.astype('float')

In [22]:
X_float

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
568988,139.0,1.0,3693.0,2.0,4.0,-0.951950,1.084444,1.0,8.0,6.0,-0.593696,-0.190762,-0.814039,0.0,0.350475,1823.0
568978,139.0,1.0,3683.0,2.0,4.0,-1.096681,2.155000,1.0,8.0,6.0,-0.279419,-0.190762,-1.024483,2.0,1.456320,1823.0
569058,139.0,1.0,3768.0,2.0,4.0,-0.278873,1.764722,1.0,8.0,6.0,-0.540540,-0.190762,-0.182706,0.0,-0.252714,1823.0
569006,139.0,1.0,3713.0,2.0,4.0,-0.766817,0.912778,1.0,8.0,6.0,-0.479706,-0.348270,0.974738,0.0,-0.504042,1823.0
568994,139.0,1.0,3700.0,2.0,4.0,-0.886022,1.796111,1.0,8.0,6.0,-0.565708,-0.318391,0.869515,0.0,-1.408825,1823.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,1.0,1.0,26.0,1.0,1.0,0.625227,0.838333,0.0,0.0,0.0,-0.384528,0.193871,0.343405,0.0,0.903397,8438.0
1737,1.0,1.0,624.0,1.0,1.0,0.727460,0.705278,0.0,0.0,0.0,-0.322119,0.197381,0.238183,0.0,-0.705105,8438.0
2229,1.0,1.0,1148.0,1.0,1.0,-3.005082,0.627500,0.0,0.0,0.0,-0.353384,-0.229253,-2.708038,0.0,1.104460,8438.0
493,1.0,1.0,514.0,1.0,1.0,0.859739,0.888056,0.0,0.0,0.0,-0.589267,-0.026220,-0.077484,0.0,-1.961747,8438.0


In [23]:
# On normalise toutes les colonnes pour ne pas biaiser l'apprentissage de notre réseau de neurones
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_scale = X_float.copy()

X_scale[['engine_serial_number','engine_series','cycles_counter',
                 'config_A','config_B',"Interpolate_var_mot_1",'Interpolate_flight_leg_hours',
                 'Interpolate_SV_rank','Interpolate_Config_B_rank',
                 'Interpolate_var_env_1','Interpolate_var_env_2',
                 'Interpolate_var_env_3','Interpolate_var_env_4','Interpolate_var_env_5']] = scaler.fit_transform(
    X_scale[['engine_serial_number','engine_series','cycles_counter',
                 'config_A','config_B',"Interpolate_var_mot_1",'Interpolate_flight_leg_hours',
                 'Interpolate_SV_rank','Interpolate_Config_B_rank',
                 'Interpolate_var_env_1','Interpolate_var_env_2',
                 'Interpolate_var_env_3','Interpolate_var_env_4','Interpolate_var_env_5']])

X_scale

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_WW_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
568988,0.101545,0.0,0.481424,0.333333,1.0,0.352873,0.110285,0.333333,8.0,0.6,0.000283,0.008404,0.494118,0.000000,0.169935,1823.0
568978,0.101545,0.0,0.480120,0.333333,1.0,0.329934,0.184502,0.333333,8.0,0.6,0.005741,0.008404,0.470588,0.666667,0.217865,1823.0
569058,0.101545,0.0,0.491201,0.333333,1.0,0.459549,0.157446,0.333333,8.0,0.6,0.001206,0.008404,0.564706,0.000000,0.143791,1823.0
569006,0.101545,0.0,0.484031,0.333333,1.0,0.382215,0.098384,0.333333,8.0,0.6,0.002263,0.001089,0.694118,0.000000,0.132898,1823.0
568994,0.101545,0.0,0.482336,0.333333,1.0,0.363322,0.159622,0.333333,8.0,0.6,0.000769,0.002477,0.682353,0.000000,0.093682,1823.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,0.000000,0.0,0.003389,0.000000,0.0,0.602840,0.093223,0.000000,0.0,0.0,0.003916,0.026265,0.623529,0.000000,0.193900,8438.0
1737,0.000000,0.0,0.081345,0.000000,0.0,0.619043,0.083999,0.000000,0.0,0.0,0.005000,0.026428,0.611765,0.000000,0.124183,8438.0
2229,0.000000,0.0,0.149655,0.000000,0.0,0.027471,0.078607,0.000000,0.0,0.0,0.004457,0.006616,0.282353,0.000000,0.202614,8438.0
493,0.000000,0.0,0.067006,0.000000,0.0,0.640008,0.096670,0.000000,0.0,0.0,0.000360,0.016044,0.576471,0.000000,0.069717,8438.0


In [24]:
Y_doublon = Y.copy()

In [25]:
Y_doublon.drop_duplicates(subset = "id_int",keep = 'first', inplace=True)

In [26]:
Y_doublon

Unnamed: 0,Interpolate_egt_slope,id_int
568988,-0.027761,1823
2358351,-0.025889,1824
1910446,-0.034384,1825
225301,-0.019224,1826
2344999,-0.047450,1827
...,...,...
6506,-0.010711,8434
104208,-0.004308,8435
928242,-0.061415,8436
12641,-0.006345,8437


On a 6616 lignes, donc aucun doublon. Les bases peuvent être téléchargées.

In [27]:
#On télécharge nos base X et Y en csv
X_scale.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/X.csv')
Y.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/Y.csv')