# Fin du prétraitement : des input de même longueur 

Pour pouvoir exécuter notre réseau de neurone (CNN), nous avons besoin de traiter encore nos données. 

En effet, notre modèle va apprendre les caractéristiques générales de plusieurs vols entre deux Water_Washes, pour chaque avion. Cependant, pour que nous puissions effectuer un réseau de neurones, nous avons besoin d'avoir des intervalles "de même longueur, c'est à dire avec le même nombre de vols pour chaque intervalles. Pour l'instant, certains avions ont plus de 9000 vols entre deux Water-Washes tandis que d'autres en ont très peu, et c'est pourquoi nous devons les traiter.

In [65]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [66]:
#Importation des données pré-traitées
path_df = r'D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/pretraitement.csv'
safran=pd.read_csv(path_df ,sep=',', encoding='latin-1')

In [67]:
safran

Unnamed: 0.1,date,Unnamed: 0,engine_serial_number,engine_family,engine_series,cycles,cycles_counter,egt_margin,var_mot_1,flight_leg_hours,...,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,1,ESN_1,Engine_family_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,...,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,2,ESN_1,Engine_family_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,...,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,3,ESN_1,Engine_family_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,...,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,4,ESN_1,Engine_family_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,...,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,5,ESN_1,Engine_family_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,...,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,2911298,ESN_1369,Engine_family_1,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,...,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,2911299,ESN_1369,Engine_family_1,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,...,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,2911301,ESN_1369,Engine_family_1,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,...,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,2911302,ESN_1369,Engine_family_1,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,...,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


In [68]:
safran.columns

Index(['date', 'Unnamed: 0', 'engine_serial_number', 'engine_family',
       'engine_series', 'cycles', 'cycles_counter', 'egt_margin', 'var_mot_1',
       'flight_leg_hours', 'event_rank', 'egt_slope', 'SV_indicator',
       'SV_rank', 'Config_B_indicator', 'Config_B_rank', 'WW_indicator',
       'WW_rank', 'config_A', 'config_B', 'var_env_1', 'var_env_2',
       'var_env_3', 'var_env_4', 'var_env_5', 'Interpolate_egt_margin',
       'Interpolate_var_mot_1', 'Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', 'Interpolate_egt_slope'],
      dtype='object')

In [69]:
#Pour plus de clarté, je ne garde que les colonnes qui ont déja été travaillées et celles nécessaire au traitement des intervalles
safran_2 = safran[['date', "engine_serial_number",'engine_series', 'cycles', 'cycles_counter','Interpolate_egt_margin',
       'Interpolate_var_mot_1','event_rank', 'config_A', 'config_B','Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', 'Interpolate_egt_slope']]


In [70]:
safran_2

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,0,Config_A_3,Config_B_1,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,0,Config_A_3,Config_B_1,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,0,Config_A_3,Config_B_1,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,0,Config_A_3,Config_B_1,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


## 1) Choix de la taille de l'intervalle

In [71]:
#Pour plus de lisibilité, je ne conserve qu'une colonne
safran_3 = safran_2[["engine_serial_number", "Interpolate_WW_rank", "Interpolate_egt_slope"]]

In [72]:
# On a un .count(), donc on peut prend n'importe quelle colonne pour vérifier combien il y a de vols
safran_3.groupby(by=["engine_serial_number", "Interpolate_WW_rank"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,Interpolate_WW_rank,Unnamed: 2_level_1
ESN_1,0.0,9884
ESN_1,1.0,2116
ESN_1,2.0,3552
ESN_1,3.0,1000
ESN_1,4.0,2584
...,...,...
ESN_998,0.0,505
ESN_998,1.0,38
ESN_999,0.0,465
ESN_999,1.0,442


On remarque qu'on a souvent plus de données dans l'intervalle avant le premier WaterWash (Interpolate_WW_rank=0), puis un peu moins pour les intervalles suivants. 

- **Intervalles avec le moins de données**

Regardons maintenant les cas où l'on a le moins de données pour un même intervalle.

In [73]:
safran_3.groupby(by=["engine_serial_number", "Interpolate_WW_rank"]).count().sort_values(by="Interpolate_egt_slope").head(120)

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,Interpolate_WW_rank,Unnamed: 2_level_1
ESN_201,4.0,1
ESN_137,1.0,1
ESN_1073,2.0,2
ESN_164,3.0,2
ESN_1043,2.0,2
...,...,...
ESN_766,5.0,29
ESN_765,5.0,29
ESN_1078,2.0,29
ESN_300,8.0,29


In [74]:
saf_group = safran_3.groupby(by=["engine_serial_number", "Interpolate_WW_rank"]).count().sort_values(by="Interpolate_egt_slope")
saf_group = saf_group.rename(columns = {'Interpolate_egt_slope': 'Nb_vols_entre_WW'})

#On a encore engine_serial_number et Interpolate_WW_rank en index, on les enlève
saf_group = saf_group.reset_index()

#On réindexe, pour avoir une colonne avec les identifiants de chaque intervalle, 
#qu'on appelle id_int pour identifiant intervalle
saf_group['id_int'] = saf_group.index
saf_group.sort_values(by="Nb_vols_entre_WW")
saf_group


Unnamed: 0,engine_serial_number,Interpolate_WW_rank,Nb_vols_entre_WW,id_int
0,ESN_201,4.0,1,0
1,ESN_137,1.0,1,1
2,ESN_1073,2.0,2,2
3,ESN_164,3.0,2,3
4,ESN_1043,2.0,2,4
...,...,...,...,...
6345,ESN_5,0.0,2672,6345
6346,ESN_15,0.0,2676,6346
6347,ESN_10,0.0,2827,6347
6348,ESN_1,2.0,3552,6348


In [75]:
saf_group.dtypes

engine_serial_number     object
Interpolate_WW_rank     float64
Nb_vols_entre_WW          int64
id_int                    int64
dtype: object

In [76]:
#De cette manière, on peut visualiser les intervalles où le nombre de vols est inférieur à un certain seuil
saf_group[(saf_group.Nb_vols_entre_WW <= 25)]

Unnamed: 0,engine_serial_number,Interpolate_WW_rank,Nb_vols_entre_WW,id_int
0,ESN_201,4.0,1,0
1,ESN_137,1.0,1,1
2,ESN_1073,2.0,2,2
3,ESN_164,3.0,2,3
4,ESN_1043,2.0,2,4
...,...,...,...,...
101,ESN_941,0.0,24,101
102,ESN_824,2.0,24,102
103,ESN_680,2.0,24,103
104,ESN_1311,1.0,24,104


In [77]:
#test 25
print("Le nombre d'intervalles de temps avec moins de 25 vols correspond à", round((saf_group[(saf_group.Nb_vols_entre_WW < 25)].shape[0]/saf_group.shape[0])*100,2), "% de nos données")
print("Le nombre d'intervalles de temps avec moins de 50 vols correspond à", round((saf_group[(saf_group.Nb_vols_entre_WW < 50)].shape[0]/saf_group.shape[0])*100,2), "% de nos données")
print("Le nombre d'intervalles de temps avec moins de 100 vols correspond à", round((saf_group[(saf_group.Nb_vols_entre_WW < 100)].shape[0]/saf_group.shape[0])*100,2), "% de nos données")
print("Le nombre d'intervalles de temps avec moins de 150 vols correspond à", round((saf_group[(saf_group.Nb_vols_entre_WW < 150)].shape[0]/saf_group.shape[0])*100,2), "% de nos données")

Le nombre d'intervalles de temps avec moins de 25 vols correspond à 1.67 % de nos données
Le nombre d'intervalles de temps avec moins de 50 vols correspond à 3.23 % de nos données
Le nombre d'intervalles de temps avec moins de 100 vols correspond à 7.86 % de nos données
Le nombre d'intervalles de temps avec moins de 150 vols correspond à 14.17 % de nos données


Arbitrairement, je choisis de partir avec 25 vols pour perdre le moins d'intervalles possibles, mais cette valeur peut être revue à la hausse plus tard : 


## 2) L'échantillonnage

In [78]:
saf_group.sort_values(by="Nb_vols_entre_WW")

Unnamed: 0,engine_serial_number,Interpolate_WW_rank,Nb_vols_entre_WW,id_int
0,ESN_201,4.0,1,0
1,ESN_137,1.0,1,1
2,ESN_1073,2.0,2,2
3,ESN_164,3.0,2,3
4,ESN_1043,2.0,2,4
...,...,...,...,...
6345,ESN_5,0.0,2672,6345
6346,ESN_15,0.0,2676,6346
6347,ESN_10,0.0,2827,6347
6348,ESN_1,2.0,3552,6348


In [79]:
safran_2_int = pd.merge(safran_2, saf_group, left_on = ["engine_serial_number", "Interpolate_WW_rank"], right_on = ["engine_serial_number", "Interpolate_WW_rank"], how="inner")
safran_2_int

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,9884,6349
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,0,Config_A_3,Config_B_1,...,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968,34,137
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,0,Config_A_3,Config_B_1,...,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968,34,137
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,0,Config_A_3,Config_B_1,...,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968,34,137
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,0,Config_A_3,Config_B_1,...,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968,34,137


### A) 25 vols mimimum par intervalles

In [80]:
#Je garde cette cellule pour l'instant, mais on peut voir après pour la supprimer, et effectuer tout en une seule étape

df_mauvais = safran_2_int[safran_2_int["Nb_vols_entre_WW"].between(0, 25)] # Les vols qu'on supprime
df_keep= safran_2_int[~safran_2_int["Nb_vols_entre_WW"].between(0,25)] #Les vols qu'on garde
df_keep
#On conserve uniquement les intervalles où on a plus de 25 données

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,9884,6349
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,0,Config_A_3,Config_B_1,...,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968,34,137
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,0,Config_A_3,Config_B_1,...,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968,34,137
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,0,Config_A_3,Config_B_1,...,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968,34,137
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,0,Config_A_3,Config_B_1,...,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968,34,137


In [81]:
#ATTENTION cette cellule peut prendre 2-3 minutes à s'éxécuter
df_ech = df_keep.groupby("id_int").sample(25) #On utilise sample(25) pour avoir 25 données aléatoire par intervalle
df_ech

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
2416407,2022-10-20 02:43:10,ESN_1170,Engine_series_1,372.019300,386,0.307502,0.947934,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.593356,-0.266357,-1.819222,0.0,0.416712,0.003373,26,106
2416426,2022-10-27 05:10:43,ESN_1170,Engine_series_1,398.118700,413,0.019561,0.470322,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.489473,-0.351626,-1.186014,1.0,0.619466,0.003373,26,106
2416406,2022-10-19 12:19:56,ESN_1170,Engine_series_1,369.792800,384,0.371915,0.734473,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.502017,-0.324236,-1.234928,0.0,0.853132,0.003373,26,106
2416402,2022-10-18 02:40:08,ESN_1170,Engine_series_1,364.634700,380,0.428537,0.916208,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.503016,-0.324236,-1.129705,1.0,1.154726,0.003373,26,106
2416405,2022-10-19 06:33:44,ESN_1170,Engine_series_1,368.908700,383,0.261940,0.462902,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.477602,-0.355809,-0.077484,0.0,0.652069,0.003373,26,106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,2019-05-27 19:24:31,ESN_1,Engine_series_1,258.071179,264,0.426081,0.612023,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.509403,0.273855,0.448627,0.0,0.400740,-0.029193,9884,6349
7536,2019-05-14 19:27:10,ESN_1,Engine_series_1,145.913066,137,0.616776,0.662348,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.548929,0.234348,0.764293,0.0,-1.810950,-0.029193,9884,6349
3016,2019-07-02 12:16:52,ESN_1,Engine_series_1,559.202216,566,-0.003554,0.941109,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.484570,0.197381,-0.498372,0.0,0.853132,-0.029193,9884,6349
3606,2019-10-15 14:01:39,ESN_1,Engine_series_1,1191.787913,1195,-0.590962,0.548731,0,Config_A_1,Config_B_1,...,0.0,0.0,0.233896,0.044857,-0.603595,0.0,0.048881,-0.029193,9884,6349


In [82]:
print("On s'attend à avoir une base de données avec",25*(6350-106),"lignes")
print("Notre base de données a", df_ech.shape[0],"lignes")
if df_ech.shape[0] == 25*(6350-106):
    print("C'est génial, youpi!!!")

On s'attend à avoir une base de données avec 156100 lignes
Notre base de données a 156100 lignes
C'est génial, youpi!!!


Remarque pour améliorer l'échantillonnage : 
on pourrait faire de la data_augmentation, ça peut demander un peu plus de temps, mais ça permettrait de garder plus de données. Dans mon notebook, on ne garde que 25 données par intervalles, si on veut, on peut facilement en garder 100. Pour en garder plus, il faudrait après regarder les intervalles où il n'y a pas assez de données et en ajouter artificiellement, comme ça on pourrait quand même nous en servir. 
Qu'en dis tu? 
Bisous! 

### B) 50 vols minimum par intervalles
- si on choisit 50 vols min par intervalles, ça donne ça : 
    

In [83]:
#Je garde cette cellule pour l'instant, mais on peut voir après pour la supprimer, et effectuer tout en une seule étape

df_mauvais_50 = safran_2_int[safran_2_int["Nb_vols_entre_WW"].between(0, 50)] # Les vols qu'on supprime
df_keep_50= safran_2_int[~safran_2_int["Nb_vols_entre_WW"].between(0,50)] #Les vols qu'on garde
df_keep_50
#On conserve uniquement les intervalles où on a plus de 50 données

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,9884,6349
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450219,2022-10-27 03:49:41,ESN_1360,Engine_series_1,254.604500,274,0.919735,0.869409,0,Config_A_3,Config_B_1,...,0.0,0.0,0.064519,-0.368762,0.238182,0.0,0.451006,-0.004884,210,1407
2450220,2022-10-27 12:43:08,ESN_1360,Engine_series_1,256.982600,275,0.686130,0.279454,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.223848,-0.311284,-0.919261,0.0,-1.056965,-0.004884,210,1407
2450221,2022-10-27 15:30:42,ESN_1360,Engine_series_1,257.729600,276,0.721306,0.346712,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.334779,-0.303992,-1.971483,0.0,0.551538,-0.004884,210,1407
2450222,2022-10-27 18:33:59,ESN_1360,Engine_series_1,258.546700,277,0.675019,0.313983,0,Config_A_3,Config_B_1,...,0.0,0.0,1.174560,-0.311284,0.132960,0.0,-2.363873,-0.004884,210,1407


In [84]:
#ATTENTION cette cellule peut prendre 2-3 minutes à s'éxécuter
df_ech = df_keep_50.groupby("id_int").sample(50) #On utilise sample(50) pour avoir 50 données aléatoire par intervalle
df_ech

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
1935163,2022-10-20 11:19:20,ESN_670,Engine_series_1,1911.302000,2058,1.045185,-1.372593,4,Config_A_2,Config_B_1,...,0.0,4.0,0.448250,-0.368522,-0.287928,0.0,1.154726,-0.038316,51,207
1935179,2022-10-23 18:29:11,ESN_670,Engine_series_1,1926.943000,2076,0.899707,0.056497,4,Config_A_2,Config_B_1,...,0.0,4.0,-0.435395,-0.367416,-1.550594,0.0,-0.403511,-0.038316,51,207
1935165,2022-10-20 21:27:48,ESN_670,Engine_series_1,1913.308000,2060,0.666871,0.935665,4,Config_A_2,Config_B_1,...,0.0,4.0,-0.512188,-0.057779,0.132960,0.0,-2.062279,-0.038316,51,207
1935164,2022-10-20 19:00:49,ESN_670,Engine_series_1,1912.821000,2059,0.764588,0.053129,4,Config_A_2,Config_B_1,...,0.0,4.0,-0.498151,-0.337710,-0.603595,0.0,-1.760685,-0.038316,51,207
1935148,2022-10-16 17:34:15,ESN_670,Engine_series_1,1893.568000,2041,0.643510,-0.133035,4,Config_A_2,Config_B_1,...,0.0,4.0,-0.184085,-0.367416,-2.287149,0.0,-2.112544,-0.038316,51,207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,2019-07-26 06:16:41,ESN_1,Engine_series_1,763.382699,779,-0.277881,0.556260,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.151135,1.704354,-0.926756,0.0,0.734558,-0.029193,9884,6349
3785,2019-11-19 16:11:13,ESN_1,Engine_series_1,1370.242719,1383,-0.089444,-1.042651,2,Config_A_1,Config_B_1,...,0.0,0.0,-0.483478,-0.300094,-1.129705,1.0,1.154726,0.002829,9884,6349
4886,2020-10-20 18:25:09,ESN_1,Engine_series_1,2530.174651,2543,-1.107183,-2.928952,7,Config_A_1,Config_B_1,...,4.0,0.0,1.962384,-0.076742,-1.917572,0.0,-0.042963,-0.015459,9884,6349
1982,2020-03-15 04:28:03,ESN_1,Engine_series_1,2065.845194,2075,-0.751055,0.670474,4,Config_A_1,Config_B_3,...,2.0,0.0,-0.406206,-0.034603,-0.498372,0.0,0.903397,-0.010711,9884,6349


In [85]:
print("On s'attend à avoir une base de données avec",50*(6350-saf_group[(saf_group.Nb_vols_entre_WW <= 50)].shape[0]),"lignes")
print("Notre base de données a", df_ech.shape[0],"lignes")
if df_ech.shape[0] == 50*(6350-saf_group[(saf_group.Nb_vols_entre_WW <= 50)].shape[0]):
    print("C'est génial, youpi!!!")

On s'attend à avoir une base de données avec 307150 lignes
Notre base de données a 307150 lignes
C'est génial, youpi!!!


### C) 100 vols minimum

In [86]:
#Je garde cette cellule pour l'instant, mais on peut voir après pour la supprimer, et effectuer tout en une seule étape

df_mauvais_100 = safran_2_int[safran_2_int["Nb_vols_entre_WW"].between(0, 99)] # Les vols qu'on supprime
df_keep_100 = safran_2_int[~safran_2_int["Nb_vols_entre_WW"].between(0,99)] #Les vols qu'on garde
df_keep_100
#On conserve uniquement les intervalles où on a plus de 50 données

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,9884,6349
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450219,2022-10-27 03:49:41,ESN_1360,Engine_series_1,254.604500,274,0.919735,0.869409,0,Config_A_3,Config_B_1,...,0.0,0.0,0.064519,-0.368762,0.238182,0.0,0.451006,-0.004884,210,1407
2450220,2022-10-27 12:43:08,ESN_1360,Engine_series_1,256.982600,275,0.686130,0.279454,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.223848,-0.311284,-0.919261,0.0,-1.056965,-0.004884,210,1407
2450221,2022-10-27 15:30:42,ESN_1360,Engine_series_1,257.729600,276,0.721306,0.346712,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.334779,-0.303992,-1.971483,0.0,0.551538,-0.004884,210,1407
2450222,2022-10-27 18:33:59,ESN_1360,Engine_series_1,258.546700,277,0.675019,0.313983,0,Config_A_3,Config_B_1,...,0.0,0.0,1.174560,-0.311284,0.132960,0.0,-2.363873,-0.004884,210,1407


In [87]:
#ATTENTION cette cellule peut prendre 2-3 minutes à s'éxécuter
df_ech_100 = df_keep_100.groupby("id_int").sample(100) #On utilise sample(50) pour avoir 50 données aléatoire par intervalle
df_ech_100

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
2357406,2022-10-08 06:02:40,ESN_1041,Engine_series_1,938.451900,1006,0.656563,-2.899534,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.466907,-0.367285,-2.497593,0.0,0.752600,-0.024441,100,499
2357468,2022-10-22 21:10:11,ESN_1041,Engine_series_1,1004.052000,1070,0.472446,-1.521410,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.355456,-0.366347,-2.392371,0.0,0.300209,-0.024441,100,499
2357424,2022-10-12 20:23:00,ESN_1041,Engine_series_1,959.067500,1025,0.510964,-2.938187,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.244090,-0.365223,-2.076705,0.0,0.601803,-0.024441,100,499
2357433,2022-10-15 03:15:35,ESN_1041,Engine_series_1,969.320600,1034,0.361577,-0.523503,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.534512,-0.364451,-0.919261,0.0,-0.755371,-0.024441,100,499
2357442,2022-10-17 02:32:30,ESN_1041,Engine_series_1,978.152300,1043,0.579307,0.633514,2,Config_A_2,Config_B_1,...,0.0,2.0,-0.558358,-0.356625,-2.076705,0.0,-1.408825,-0.024441,100,499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571,2020-01-17 19:15:34,ESN_1,Engine_series_1,1634.472838,1653,-0.935702,0.260941,4,Config_A_1,Config_B_3,...,2.0,0.0,-0.550612,0.496403,-1.388643,0.0,-0.252714,-0.010711,9884,6349
1946,2020-03-10 16:29:05,ESN_1,Engine_series_1,2029.232280,2039,-0.729590,-0.305142,4,Config_A_1,Config_B_3,...,2.0,0.0,-0.301863,-0.034603,0.343405,0.0,-0.001385,-0.010711,9884,6349
5646,2019-07-20 10:12:34,ESN_1,Engine_series_1,715.041343,731,-0.008572,0.180601,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.363389,0.197381,-0.182706,0.0,0.099146,-0.029193,9884,6349
9681,2020-09-29 08:50:31,ESN_1,Engine_series_1,2367.954109,2377,-0.561099,-0.153711,7,Config_A_1,Config_B_1,...,4.0,0.0,1.232935,0.616492,-0.498372,0.0,-0.855902,-0.015459,9884,6349


In [88]:
print("On s'attend à avoir une base de données avec",100*(6350-saf_group[(saf_group.Nb_vols_entre_WW < 100)].shape[0]),"lignes")
print("Notre base de données a", df_ech_100.shape[0],"lignes")
print("On supprime",round(saf_group[(saf_group.Nb_vols_entre_WW < 100)].shape[0]*100/saf_group.shape[0],2),"% des intervalles")
if df_ech_100.shape[0] == 100*(6350-saf_group[(saf_group.Nb_vols_entre_WW < 100)].shape[0]):
    print("C'est bon, on a autant de ligne que souhaité!")

On s'attend à avoir une base de données avec 585100 lignes
Notre base de données a 585100 lignes
On supprime 7.86 % des intervalles
C'est bon, on a autant de ligne que souhaité!


## 3) Derniers traitements numériques

In [89]:
# Cas 1, où l'on souhaite un unique dataframe pour X
Y = df_ech_100[["Interpolate_egt_slope", "id_int"]]
X = df_ech_100[["engine_serial_number", "engine_series","cycles_counter",'config_A', 'config_B', 'Interpolate_flight_leg_hours', 'event_rank',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', "id_int"]] # Cas où on a besoin d'un seul dataframe

In [90]:
X

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_flight_leg_hours,event_rank,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
2357406,ESN_1041,Engine_series_1,1006,Config_A_2,Config_B_1,1.953056,2,0.0,0.0,-0.466907,-0.367285,-2.497593,0.0,0.752600,499
2357468,ESN_1041,Engine_series_1,1070,Config_A_2,Config_B_1,2.988056,2,0.0,0.0,-0.355456,-0.366347,-2.392371,0.0,0.300209,499
2357424,ESN_1041,Engine_series_1,1025,Config_A_2,Config_B_1,3.186667,2,0.0,0.0,-0.244090,-0.365223,-2.076705,0.0,0.601803,499
2357433,ESN_1041,Engine_series_1,1034,Config_A_2,Config_B_1,2.523333,2,0.0,0.0,-0.534512,-0.364451,-0.919261,0.0,-0.755371,499
2357442,ESN_1041,Engine_series_1,1043,Config_A_2,Config_B_1,1.644167,2,0.0,0.0,-0.558358,-0.356625,-2.076705,0.0,-1.408825,499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571,ESN_1,Engine_series_1,1653,Config_A_1,Config_B_3,1.584167,4,0.0,2.0,-0.550612,0.496403,-1.388643,0.0,-0.252714,6349
1946,ESN_1,Engine_series_1,2039,Config_A_1,Config_B_3,0.940556,4,0.0,2.0,-0.301863,-0.034603,0.343405,0.0,-0.001385,6349
5646,ESN_1,Engine_series_1,731,Config_A_1,Config_B_1,0.811111,0,0.0,0.0,-0.363389,0.197381,-0.182706,0.0,0.099146,6349
9681,ESN_1,Engine_series_1,2377,Config_A_1,Config_B_1,1.120278,7,1.0,4.0,1.232935,0.616492,-0.498372,0.0,-0.855902,6349


In [91]:
X['engine_serial_number'] = X['engine_serial_number'].str[4:]
X['engine_series'] = X['engine_series'].str[14:]
X['config_A'] = X['config_A'].str[9:]
X['config_B'] = X['config_B'].str[9:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [92]:
X

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_flight_leg_hours,event_rank,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
2357406,1041,1,1006,2,1,1.953056,2,0.0,0.0,-0.466907,-0.367285,-2.497593,0.0,0.752600,499
2357468,1041,1,1070,2,1,2.988056,2,0.0,0.0,-0.355456,-0.366347,-2.392371,0.0,0.300209,499
2357424,1041,1,1025,2,1,3.186667,2,0.0,0.0,-0.244090,-0.365223,-2.076705,0.0,0.601803,499
2357433,1041,1,1034,2,1,2.523333,2,0.0,0.0,-0.534512,-0.364451,-0.919261,0.0,-0.755371,499
2357442,1041,1,1043,2,1,1.644167,2,0.0,0.0,-0.558358,-0.356625,-2.076705,0.0,-1.408825,499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571,1,1,1653,1,3,1.584167,4,0.0,2.0,-0.550612,0.496403,-1.388643,0.0,-0.252714,6349
1946,1,1,2039,1,3,0.940556,4,0.0,2.0,-0.301863,-0.034603,0.343405,0.0,-0.001385,6349
5646,1,1,731,1,1,0.811111,0,0.0,0.0,-0.363389,0.197381,-0.182706,0.0,0.099146,6349
9681,1,1,2377,1,1,1.120278,7,1.0,4.0,1.232935,0.616492,-0.498372,0.0,-0.855902,6349


In [93]:
print(X.max())
print(X.min())

engine_serial_number                  999
engine_series                           7
cycles_counter                       7655
config_A                                5
config_B                                4
Interpolate_flight_leg_hours     14.23972
event_rank                             25
Interpolate_SV_rank                   3.0
Interpolate_Config_B_rank            10.0
Interpolate_var_env_1            56.96611
Interpolate_var_env_2            21.16301
Interpolate_var_env_3            3.710513
Interpolate_var_env_4                 3.0
Interpolate_var_env_5           21.763662
id_int                               6349
dtype: object
engine_serial_number                   1
engine_series                          1
cycles_counter                         0
config_A                               1
config_B                               1
Interpolate_flight_leg_hours   -0.506389
event_rank                             0
Interpolate_SV_rank                  0.0
Interpolate_Config_B_rank   

In [94]:
# On normalise toutes les colonnes pour ne pas biaiser l'apprentissage de notre réseau de neurones
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_scale = X.copy()

X_scale[['engine_serial_number','engine_series','cycles_counter',
                 'config_A','config_B','Interpolate_flight_leg_hours',
                 'event_rank','Interpolate_SV_rank','Interpolate_Config_B_rank',
                 'Interpolate_var_env_1','Interpolate_var_env_2',
                 'Interpolate_var_env_3','Interpolate_var_env_4','Interpolate_var_env_5']] = scaler.fit_transform(
    X_scale[['engine_serial_number','engine_series','cycles_counter',
                 'config_A','config_B','Interpolate_flight_leg_hours',
                 'event_rank','Interpolate_SV_rank','Interpolate_Config_B_rank',
                 'Interpolate_var_env_1','Interpolate_var_env_2',
                 'Interpolate_var_env_3','Interpolate_var_env_4','Interpolate_var_env_5']])

X_scale

Unnamed: 0,engine_serial_number,engine_series,cycles_counter,config_A,config_B,Interpolate_flight_leg_hours,event_rank,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int
2357406,0.765269,0.0,0.131417,0.25,0.000000,0.166786,0.08,0.000000,0.0,0.002485,0.000206,0.305882,0.000000,0.170635,499
2357468,0.765269,0.0,0.139778,0.25,0.000000,0.236974,0.08,0.000000,0.0,0.004421,0.000250,0.317647,0.000000,0.152778,499
2357424,0.765269,0.0,0.133899,0.25,0.000000,0.250443,0.08,0.000000,0.0,0.006355,0.000302,0.352941,0.000000,0.164683,499
2357433,0.765269,0.0,0.135075,0.25,0.000000,0.205459,0.08,0.000000,0.0,0.001311,0.000338,0.482353,0.000000,0.111111,499
2357442,0.765269,0.0,0.136251,0.25,0.000000,0.145839,0.08,0.000000,0.0,0.000896,0.000701,0.352941,0.000000,0.085317,499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571,0.000000,0.0,0.215937,0.00,0.666667,0.141770,0.16,0.000000,0.2,0.001031,0.040313,0.429872,0.000000,0.130952,6349
1946,0.000000,0.0,0.266362,0.00,0.666667,0.098124,0.16,0.000000,0.2,0.005351,0.015655,0.623529,0.000000,0.140873,6349
5646,0.000000,0.0,0.095493,0.00,0.000000,0.089346,0.00,0.000000,0.0,0.004283,0.026428,0.564706,0.000000,0.144841,6349
9681,0.000000,0.0,0.310516,0.00,0.000000,0.110312,0.28,0.333333,0.4,0.032008,0.045890,0.529412,0.000000,0.107143,6349


In [95]:
#On télécharge nos base X et Y en csv
X_scale.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/X.csv')
Y.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/Y.csv')

Liste des variables pour lesquelles on hésite :
    - var_mot_1 :
    - date :?
    - SV rank : demander confirmation
    - nombre de WW effectués : double corrélation avec Y 
    

# **BROUILLON EN DESSOUS**

In [96]:
X_and_Y = df_ech_100[["engine_serial_number", "engine_series","cycles_counter",
                      'config_A', 'config_B', 'Interpolate_flight_leg_hours', 'event_rank',
                      'Interpolate_SV_rank', 'Interpolate_Config_B_rank', 'Interpolate_var_env_1',
                      'Interpolate_var_env_2',       'Interpolate_var_env_3', 'Interpolate_var_env_4',
                      'Interpolate_var_env_5', "Interpolate_egt_slope", "id_int"]]
X_and_Y.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/X_and_Y.csv')

Pour garder des intervalles avec 100 données différentes, tout en utilisant quand même les 499 intervalles qui comportent moins de 100 vols, on a choisit de faire de la data augmentation pour ces modèles. 

In [None]:
df_mauvais_100