# Fin du prétraitement : des entrées de même longueur

Pour pouvoir entraîner notre réseau de neurones (CNN), nous avons encore besoin de traiter nos données. 

En effet, notre modèle va apprendre les caractéristiques générales de plusieurs vols entre deux Water-Washes, pour chaque avion. Cependant, pour pouvoir entraîner un réseau de neurones, nous avons besoin d'intervalles « de même longueur », c'est-à-dire avec le même nombre de vols pour chaque intervalle. Pour l'instant, certains avions ont plus de 9000 vols entre deux Water-Washes tandis que d'autres en ont très peu ; c'est pourquoi nous devons les traiter.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the pre-processed data (CSV) into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
path_df = r'D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/pretraitement.csv'
safran = pd.read_csv(filepath_or_buffer=path_df, sep=',', encoding='latin-1')

In [3]:
# Display the loaded table (the rendering of large frames is truncated by pandas).
safran

Unnamed: 0.1,date,Unnamed: 0,engine_serial_number,engine_family,engine_series,cycles,cycles_counter,egt_margin,var_mot_1,flight_leg_hours,...,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,1,ESN_1,Engine_family_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,...,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,2,ESN_1,Engine_family_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,...,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,3,ESN_1,Engine_family_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,...,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,4,ESN_1,Engine_family_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,...,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,5,ESN_1,Engine_family_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,...,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,2911298,ESN_1369,Engine_family_1,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,...,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,2911299,ESN_1369,Engine_family_1,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,...,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,2911301,ESN_1369,Engine_family_1,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,...,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,2911302,ESN_1369,Engine_family_1,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,...,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


In [4]:
# List the column names available in the loaded table.
safran.columns

Index(['date', 'Unnamed: 0', 'engine_serial_number', 'engine_family',
       'engine_series', 'cycles', 'cycles_counter', 'egt_margin', 'var_mot_1',
       'flight_leg_hours', 'event_rank', 'egt_slope', 'SV_indicator',
       'SV_rank', 'Config_B_indicator', 'Config_B_rank', 'WW_indicator',
       'WW_rank', 'config_A', 'config_B', 'var_env_1', 'var_env_2',
       'var_env_3', 'var_env_4', 'var_env_5', 'Interpolate_egt_margin',
       'Interpolate_var_mot_1', 'Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', 'Interpolate_egt_slope'],
      dtype='object')

In [29]:
# For clarity, keep only the columns that were already processed upstream and those
# needed to build the intervals.
colonnes_utiles = [
    'date', 'id_vol', "engine_serial_number", 'engine_series', 'cycles', 'cycles_counter',
    'Interpolate_egt_margin', 'Interpolate_var_mot_1', 'Interpolate_flight_leg_hours',
    'Interpolate_SV_rank', 'Interpolate_Config_B_rank', 'Interpolate_WW_rank',
    'Interpolate_var_env_1', 'Interpolate_var_env_2', 'Interpolate_var_env_3',
    'Interpolate_var_env_4', 'Interpolate_var_env_5', 'Interpolate_egt_slope',
]

# 'Unnamed: 0' is renamed to 'id_vol' and used as the per-flight identifier.
safran = safran.rename(columns={'Unnamed: 0': 'id_vol'})
safran_2 = safran[colonnes_utiles]


In [30]:
# Display the reduced table (selected columns only).
safran_2

Unnamed: 0,date,id_vol,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,1,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,2,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,3,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,4,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,5,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,2911298,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,2911299,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,2911301,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,2911302,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


## 1) Choix de la taille de l'intervalle

In [31]:
# For readability, keep only the two grouping keys (engine, WW rank) and a single
# value column to count on.
colonnes_intervalle = ["engine_serial_number", "Interpolate_WW_rank", "Interpolate_egt_slope"]
safran_3 = safran_2.loc[:, colonnes_intervalle]

In [32]:
# count() gives the number of non-null values per group, so any remaining column can be
# used to count the flights of each (engine, WW rank) interval
# (assuming that column has no missing values — TODO confirm).
safran_3.groupby(["engine_serial_number", "Interpolate_WW_rank"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,Interpolate_WW_rank,Unnamed: 2_level_1
ESN_1,0.0,9884
ESN_1,1.0,2116
ESN_1,2.0,3552
ESN_1,3.0,1000
ESN_1,4.0,2584
...,...,...
ESN_998,0.0,505
ESN_998,1.0,38
ESN_999,0.0,465
ESN_999,1.0,442


On remarque qu'on a souvent plus de données dans l'intervalle avant le premier WaterWash (Interpolate_WW_rank=0), puis un peu moins pour les intervalles suivants. 

- **Intervalles avec le moins de données**

Regardons maintenant les cas où l'on a le moins de données pour un même intervalle.

In [33]:
# Show the 120 (engine, WW rank) intervals that contain the fewest flights.
(
    safran_3
    .groupby(["engine_serial_number", "Interpolate_WW_rank"])
    .count()
    .sort_values("Interpolate_egt_slope")
    .head(120)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,Interpolate_WW_rank,Unnamed: 2_level_1
ESN_201,4.0,1
ESN_137,1.0,1
ESN_1073,2.0,2
ESN_164,3.0,2
ESN_1043,2.0,2
...,...,...
ESN_766,5.0,29
ESN_765,5.0,29
ESN_1078,2.0,29
ESN_300,8.0,29


In [34]:
# Build one row per (engine, WW rank) interval with its number of flights,
# ordered from the shortest interval to the longest.
# NOTE(review): count() counts the non-null 'Interpolate_egt_slope' values; this equals
# the number of flights only if that column has no missing values — confirm.
saf_group = (
    safran_3
    .groupby(by=["engine_serial_number", "Interpolate_WW_rank"])
    .count()
    .sort_values(by="Interpolate_egt_slope")
    .rename(columns={'Interpolate_egt_slope': 'Nb_vols_entre_WW'})
    # The grouping keys are still in the index at this point: turn them back into columns.
    .reset_index()
)

# reset_index() leaves a fresh 0..n-1 index that follows the ascending flight-count
# order; it is copied into 'id_int' to serve as the interval identifier.
saf_group['id_int'] = saf_group.index

# No additional sort is needed here: the frame is already ordered by flight count, and a
# bare sort_values() call would only return a copy without modifying saf_group.
saf_group


Unnamed: 0,engine_serial_number,Interpolate_WW_rank,Nb_vols_entre_WW,id_int
0,ESN_201,4.0,1,0
1,ESN_137,1.0,1,1
2,ESN_1073,2.0,2,2
3,ESN_164,3.0,2,3
4,ESN_1043,2.0,2,4
...,...,...,...,...
6345,ESN_5,0.0,2672,6345
6346,ESN_15,0.0,2676,6346
6347,ESN_10,0.0,2827,6347
6348,ESN_1,2.0,3552,6348


In [35]:
# Check the column types of the per-interval summary table.
saf_group.dtypes

engine_serial_number     object
Interpolate_WW_rank     float64
Nb_vols_entre_WW          int64
id_int                    int64
dtype: object

In [36]:
# Inspect the intervals whose number of flights is at or below a given threshold (25 here).
saf_group.loc[saf_group["Nb_vols_entre_WW"] <= 25]

Unnamed: 0,engine_serial_number,Interpolate_WW_rank,Nb_vols_entre_WW,id_int
0,ESN_201,4.0,1,0
1,ESN_137,1.0,1,1
2,ESN_1073,2.0,2,2
3,ESN_164,3.0,2,3
4,ESN_1043,2.0,2,4
...,...,...,...,...
101,ESN_941,0.0,24,101
102,ESN_824,2.0,24,102
103,ESN_680,2.0,24,103
104,ESN_1311,1.0,24,104


In [37]:
# For several candidate minimum lengths, report the share of intervals that contain
# strictly fewer flights than the threshold (i.e. the intervals that would be lost).
# A single loop replaces one hand-copied print per threshold; the printed text is unchanged.
n_intervalles = saf_group.shape[0]
for seuil in (25, 50, 100, 150):
    n_trop_courts = saf_group[(saf_group.Nb_vols_entre_WW < seuil)].shape[0]
    print(f"Le nombre d'intervalles de temps avec moins de {seuil} vols correspond à",
          round((n_trop_courts/n_intervalles)*100,2), "% de nos données")

Le nombre d'intervalles de temps avec moins de 25 vols correspond à 1.67 % de nos données
Le nombre d'intervalles de temps avec moins de 50 vols correspond à 3.23 % de nos données
Le nombre d'intervalles de temps avec moins de 100 vols correspond à 7.86 % de nos données
Le nombre d'intervalles de temps avec moins de 150 vols correspond à 14.17 % de nos données


Arbitrairement, je choisis de partir avec 25 vols par intervalle pour perdre le moins d'intervalles possible, mais cette valeur pourra être revue à la hausse plus tard.


## 2) L'échantillonnage

In [38]:
# Display the per-interval summary ordered by number of flights
# (sort_values returns a sorted copy; saf_group itself is not modified).
saf_group.sort_values(by="Nb_vols_entre_WW")

Unnamed: 0,engine_serial_number,Interpolate_WW_rank,Nb_vols_entre_WW,id_int
0,ESN_201,4.0,1,0
1,ESN_137,1.0,1,1
2,ESN_1073,2.0,2,2
3,ESN_164,3.0,2,3
4,ESN_1043,2.0,2,4
...,...,...,...,...
6345,ESN_5,0.0,2672,6345
6346,ESN_15,0.0,2676,6346
6347,ESN_10,0.0,2827,6347
6348,ESN_1,2.0,3552,6348


In [39]:
# Attach to every flight the size ('Nb_vols_entre_WW') and identifier ('id_int') of the
# interval it belongs to, matching on the (engine, WW rank) pair.
cles_intervalle = ["engine_serial_number", "Interpolate_WW_rank"]
safran_2_int = safran_2.merge(saf_group, on=cles_intervalle, how="inner")
safran_2_int

Unnamed: 0,date,id_vol,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
0,2019-04-29 06:29:58,1,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
1,2019-04-29 08:10:00,2,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
2,2019-04-29 09:55:00,3,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,9884,6349
3,2019-04-29 11:36:53,4,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
4,2019-04-30 04:28:40,5,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,2911298,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968,34,137
2450271,2022-10-26 11:36:39,2911299,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968,34,137
2450272,2022-10-27 03:55:34,2911301,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968,34,137
2450273,2022-10-27 07:33:12,2911302,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968,34,137


### A) 25 vols minimum par intervalle

In [40]:
# Kept as a separate step for now; it could later be merged with the sampling step.
#
# Split the flights according to the size of their interval. between() is inclusive on
# both ends, so flights from intervals with 0 to 25 flights (25 included) are set aside
# and only intervals with more than 25 flights are kept.
# NOTE(review): intervals with exactly 25 flights could also provide 25 samples — confirm
# whether excluding them is intended.
est_trop_court_25 = safran_2_int["Nb_vols_entre_WW"].between(0, 25)
df_mauvais = safran_2_int[est_trop_court_25]   # flights that are dropped
df_keep = safran_2_int[~est_trop_court_25]     # flights that are kept
df_keep

Unnamed: 0,date,id_vol,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
0,2019-04-29 06:29:58,1,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
1,2019-04-29 08:10:00,2,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
2,2019-04-29 09:55:00,3,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,9884,6349
3,2019-04-29 11:36:53,4,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
4,2019-04-30 04:28:40,5,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,2911298,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968,34,137
2450271,2022-10-26 11:36:39,2911299,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968,34,137
2450272,2022-10-27 03:55:34,2911301,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968,34,137
2450273,2022-10-27 07:33:12,2911302,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968,34,137


In [41]:
# WARNING: this cell can take 2-3 minutes to run.
# Draw 25 flights at random from every kept interval so that all intervals end up with
# the same number of rows.
# random_state fixes the draw so that re-running the notebook yields the same sample.
df_ech = df_keep.groupby("id_int").sample(25, random_state=42)
df_ech

Unnamed: 0,date,id_vol,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
2416411,2022-10-20 19:21:42,2832807,ESN_1170,Engine_series_1,374.547400,390,0.560847,-0.943725,3.970833,0.0,0.0,2.0,-0.459548,-0.364009,0.027738,0.0,1.154726,0.003373,26,106
2416419,2022-10-24 23:48:43,2832818,ESN_1170,Engine_series_1,389.963100,401,-0.027260,0.948283,1.225000,0.0,0.0,2.0,-0.448419,-0.211525,-1.292272,0.0,0.601803,0.003373,26,106
2416423,2022-10-26 03:06:26,2832823,ESN_1170,Engine_series_1,394.145400,406,0.181678,0.908483,1.746944,0.0,0.0,2.0,-0.609686,-0.355470,-2.544050,0.0,0.836155,0.003373,26,106
2416404,2022-10-19 02:51:41,2832799,ESN_1170,Engine_series_1,368.341700,382,0.411357,0.607712,1.986389,0.0,0.0,2.0,-0.552820,-0.324236,-1.234928,0.0,0.853132,0.003373,26,106
2416416,2022-10-23 11:43:06,2832815,ESN_1170,Engine_series_1,384.408500,398,0.159919,0.593845,1.682778,0.0,0.0,2.0,-0.604862,-0.235055,-1.971483,0.0,-1.006699,0.003373,26,106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4263,2020-02-19 18:47:17,1863,ESN_1,Engine_series_1,1868.064900,1876,-0.673243,0.643185,0.816389,0.0,2.0,0.0,-0.537207,0.344733,-0.287928,0.0,-0.252714,-0.010711,9884,6349
1778,2020-02-18 04:39:46,1849,ESN_1,Engine_series_1,1855.561449,1862,-0.600742,0.292900,0.659167,0.0,2.0,0.0,-0.573250,0.344733,-0.708817,0.0,0.601803,-0.010711,9884,6349
6824,2020-03-01 18:52:00,1958,ESN_1,Engine_series_1,1956.298025,1971,-0.683740,0.396109,0.872500,0.0,2.0,0.0,-0.318618,0.443280,-0.044691,0.0,0.573741,-0.010711,9884,6349
1267,2019-11-10 13:09:08,1322,ESN_1,Engine_series_1,1322.980762,1335,-0.065855,0.206447,2.282778,0.0,0.0,0.0,-0.205915,-0.171105,-1.445372,0.0,0.249943,-0.012429,9884,6349


In [42]:
# Sanity check: each kept interval must contribute exactly 25 sampled flights.
# The number of kept intervals is derived from saf_group (total number of intervals minus
# those with at most 25 flights) rather than typed in by hand, so the check remains valid
# when the input data changes.
n_intervalles_gardes = saf_group.shape[0] - saf_group[(saf_group.Nb_vols_entre_WW <= 25)].shape[0]
n_lignes_attendues = 25*n_intervalles_gardes
print("On s'attend à avoir une base de données avec",n_lignes_attendues,"lignes")
print("Notre base de données a", df_ech.shape[0],"lignes")
if df_ech.shape[0] == n_lignes_attendues:
    print("C'est génial, youpi!!!")

On s'attend à avoir une base de données avec 156100 lignes
Notre base de données a 156100 lignes
C'est génial, youpi!!!


Remarque pour améliorer l'échantillonnage : 
on pourrait faire de la data augmentation : cela peut demander un peu plus de temps, mais cela permettrait de garder plus de données. Dans mon notebook, on ne garde que 25 données par intervalle ; si on veut, on peut facilement en garder 100. Pour en garder plus, il faudrait ensuite repérer les intervalles où il n'y a pas assez de données et en ajouter artificiellement ; de cette façon, on pourrait quand même s'en servir. 
Qu'en dis-tu ? 
Bisous! 

### B) 50 vols minimum par intervalle
- si on choisit 50 vols min par intervalles, ça donne ça : 
    

In [43]:
# Kept as a separate step for now; it could later be merged with the sampling step.
#
# Same split as for the 25-flight case, with a threshold of 50. between() is inclusive on
# both ends, so only intervals with more than 50 flights are kept.
# NOTE(review): intervals with exactly 50 flights could also provide 50 samples — confirm
# whether excluding them is intended.
est_trop_court_50 = safran_2_int["Nb_vols_entre_WW"].between(0, 50)
df_mauvais_50 = safran_2_int[est_trop_court_50]   # flights that are dropped
df_keep_50 = safran_2_int[~est_trop_court_50]     # flights that are kept
df_keep_50

Unnamed: 0,date,id_vol,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
0,2019-04-29 06:29:58,1,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
1,2019-04-29 08:10:00,2,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
2,2019-04-29 09:55:00,3,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,9884,6349
3,2019-04-29 11:36:53,4,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
4,2019-04-30 04:28:40,5,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450219,2022-10-27 03:49:41,2910474,ESN_1360,Engine_series_1,254.604500,274,0.919735,0.869409,1.854167,0.0,0.0,0.0,0.064519,-0.368762,0.238182,0.0,0.451006,-0.004884,210,1407
2450220,2022-10-27 12:43:08,2910475,ESN_1360,Engine_series_1,256.982600,275,0.686130,0.279454,1.973611,0.0,0.0,0.0,-0.223848,-0.311284,-0.919261,0.0,-1.056965,-0.004884,210,1407
2450221,2022-10-27 15:30:42,2910476,ESN_1360,Engine_series_1,257.729600,276,0.721306,0.346712,1.688056,0.0,0.0,0.0,-0.334779,-0.303992,-1.971483,0.0,0.551538,-0.004884,210,1407
2450222,2022-10-27 18:33:59,2910477,ESN_1360,Engine_series_1,258.546700,277,0.675019,0.313983,1.695556,0.0,0.0,0.0,1.174560,-0.311284,0.132960,0.0,-2.363873,-0.004884,210,1407


In [44]:
# WARNING: this cell can take 2-3 minutes to run.
# Draw 50 flights at random from every kept interval.
# random_state fixes the draw so that re-running the notebook yields the same sample.
# NOTE: this reuses the name df_ech, so it replaces any earlier sample stored under it.
df_ech = df_keep_50.groupby("id_int").sample(50, random_state=42)
df_ech

Unnamed: 0,date,id_vol,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
1935160,2022-10-19 18:34:02,2175415,ESN_670,Engine_series_1,1907.991000,2055,0.726972,0.495573,1.903056,0.0,0.0,4.0,-0.455667,-0.363668,-2.076705,0.0,-1.207762,-0.038316,51,207
1935147,2022-10-16 13:32:26,2175400,ESN_670,Engine_series_1,1892.772000,2040,0.826093,0.078292,2.429167,0.0,0.0,4.0,-0.322545,-0.368522,0.132960,0.0,0.551538,-0.038316,51,207
1935182,2022-10-24 19:07:34,2175441,ESN_670,Engine_series_1,1931.813000,2081,0.166856,-0.022420,2.578333,0.0,0.0,4.0,-0.282127,-0.365772,-2.602815,0.0,0.702335,-0.038316,51,207
1935177,2022-10-23 12:46:52,2175433,ESN_670,Engine_series_1,1925.816000,2073,0.815145,0.585787,1.103611,0.0,0.0,4.0,-0.486213,-0.367801,-0.708817,0.0,0.048881,-0.038316,51,207
1935163,2022-10-20 11:19:20,2175418,ESN_670,Engine_series_1,1911.302000,2058,1.045185,-1.372593,4.999167,0.0,0.0,4.0,0.448250,-0.368522,-0.287928,0.0,1.154726,-0.038316,51,207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,2019-07-27 16:14:19,779,ESN_1,Engine_series_1,775.208119,792,-0.270082,0.495902,1.310000,0.0,0.0,0.0,0.181906,0.197381,-1.024483,0.0,0.551538,-0.029193,9884,6349
8079,2019-07-16 10:59:37,680,ESN_1,Engine_series_1,680.359232,693,-0.261371,0.177900,0.639444,0.0,0.0,0.0,-0.308185,0.197381,-0.603595,0.0,-1.006699,-0.029193,9884,6349
4866,2020-10-18 05:47:11,2508,ESN_1,Engine_series_1,2511.021238,2521,-0.461634,0.393220,1.465278,1.0,4.0,0.0,-0.130263,-0.214442,-0.919261,0.0,-0.906168,-0.015459,9884,6349
3951,2019-12-28 16:02:06,1544,ESN_1,Engine_series_1,1539.840187,1557,-0.422299,0.180514,0.846944,0.0,1.0,0.0,-0.428849,1.127458,-0.393150,0.0,0.300209,-0.178054,9884,6349


In [45]:
# Sanity check: each kept interval must contribute exactly 50 sampled flights.
# Both the total number of intervals and the number of discarded ones (at most 50 flights)
# are read from saf_group instead of relying on a hand-typed total.
n_intervalles_gardes = saf_group.shape[0] - saf_group[(saf_group.Nb_vols_entre_WW <= 50)].shape[0]
n_lignes_attendues = 50*n_intervalles_gardes
print("On s'attend à avoir une base de données avec",n_lignes_attendues,"lignes")
print("Notre base de données a", df_ech.shape[0],"lignes")
if df_ech.shape[0] == n_lignes_attendues:
    print("C'est génial, youpi!!!")

On s'attend à avoir une base de données avec 307150 lignes
Notre base de données a 307150 lignes
C'est génial, youpi!!!


### C) 100 vols minimum

In [46]:
# Kept as a separate step for now; it could later be merged with the sampling step.
#
# Split the flights with a threshold of 100: between(0, 99) is inclusive on both ends, so
# flights from intervals with at most 99 flights are set aside and only intervals with at
# least 100 flights are kept.
est_trop_court_100 = safran_2_int["Nb_vols_entre_WW"].between(0, 99)
df_mauvais_100 = safran_2_int[est_trop_court_100]   # flights that are dropped
df_keep_100 = safran_2_int[~est_trop_court_100]     # flights that are kept
df_keep_100

Unnamed: 0,date,id_vol,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
0,2019-04-29 06:29:58,1,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
1,2019-04-29 08:10:00,2,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
2,2019-04-29 09:55:00,3,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,9884,6349
3,2019-04-29 11:36:53,4,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,9884,6349
4,2019-04-30 04:28:40,5,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,9884,6349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450219,2022-10-27 03:49:41,2910474,ESN_1360,Engine_series_1,254.604500,274,0.919735,0.869409,1.854167,0.0,0.0,0.0,0.064519,-0.368762,0.238182,0.0,0.451006,-0.004884,210,1407
2450220,2022-10-27 12:43:08,2910475,ESN_1360,Engine_series_1,256.982600,275,0.686130,0.279454,1.973611,0.0,0.0,0.0,-0.223848,-0.311284,-0.919261,0.0,-1.056965,-0.004884,210,1407
2450221,2022-10-27 15:30:42,2910476,ESN_1360,Engine_series_1,257.729600,276,0.721306,0.346712,1.688056,0.0,0.0,0.0,-0.334779,-0.303992,-1.971483,0.0,0.551538,-0.004884,210,1407
2450222,2022-10-27 18:33:59,2910477,ESN_1360,Engine_series_1,258.546700,277,0.675019,0.313983,1.695556,0.0,0.0,0.0,1.174560,-0.311284,0.132960,0.0,-2.363873,-0.004884,210,1407


In [47]:
# WARNING: this cell can take 2-3 minutes to run.
# Draw 100 flights at random from every kept interval.
# random_state fixes the draw so that re-running the notebook yields the same sample,
# and therefore the same files when this sample is exported.
df_ech_100 = df_keep_100.groupby("id_int").sample(100, random_state=42)
df_ech_100

Unnamed: 0,date,id_vol,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
2357414,2022-10-10 15:52:23,2737332,ESN_1041,Engine_series_1,949.256200,1015,0.664362,-1.933435,2.185278,0.0,0.0,2.0,0.001609,-0.367439,-1.550594,0.0,0.350475,-0.024441,100,499
2357459,2022-10-20 23:35:46,2737377,ESN_1041,Engine_series_1,995.537500,1060,0.439520,-2.939349,2.698611,0.0,0.0,2.0,-0.512229,-0.364451,-0.287928,0.0,-1.961747,-0.024441,100,499
2357468,2022-10-22 21:10:11,2737387,ESN_1041,Engine_series_1,1004.052000,1070,0.472446,-1.521410,2.988056,0.0,0.0,2.0,-0.355456,-0.366347,-2.392371,0.0,0.300209,-0.024441,100,499
2357450,2022-10-18 18:59:55,2737368,ESN_1041,Engine_series_1,985.710800,1051,0.203914,1.052000,0.811111,0.0,0.0,2.0,-0.413768,-0.337710,-0.498373,0.0,-1.760685,-0.024441,100,499
2357446,2022-10-17 20:56:35,2737364,ESN_1041,Engine_series_1,981.591700,1047,0.573475,-0.642301,1.270833,0.0,0.0,2.0,-0.558607,-0.057779,-0.919261,1.0,1.003929,-0.024441,100,499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4647,2020-04-12 06:23:42,2266,ESN_1,Engine_series_1,2248.694246,2279,-1.425646,0.475491,0.846944,0.0,4.0,0.0,0.361568,0.017431,0.238183,0.0,0.451006,-0.323653,9884,6349
2535,2019-05-08 11:17:23,65,ESN_1,Engine_series_1,92.154056,78,0.634190,0.279600,0.577222,0.0,0.0,0.0,-0.503053,0.193871,1.606070,0.0,-1.609887,-0.029193,9884,6349
4828,2020-10-12 18:05:56,2467,ESN_1,Engine_series_1,2469.426396,2480,-0.439295,0.206563,1.378333,1.0,4.0,0.0,-0.548564,-0.245680,-1.234928,0.0,0.350475,-0.015459,9884,6349
4905,2020-10-23 18:14:44,2553,ESN_1,Engine_series_1,2552.859819,2566,-0.520648,0.480065,0.809722,1.0,4.0,0.0,-0.591061,-0.245680,-1.761038,0.0,0.048881,-0.015459,9884,6349


In [48]:
# Sanity check: each kept interval must contribute exactly 100 sampled flights.
# The total number of intervals and the number of discarded ones (fewer than 100 flights)
# are both read from saf_group instead of relying on a hand-typed total.
n_intervalles = saf_group.shape[0]
n_trop_courts = saf_group[(saf_group.Nb_vols_entre_WW < 100)].shape[0]
n_lignes_attendues = 100*(n_intervalles-n_trop_courts)
print("On s'attend à avoir une base de données avec",n_lignes_attendues,"lignes")
print("Notre base de données a", df_ech_100.shape[0],"lignes")
print("On supprime",n_trop_courts*100/n_intervalles,"% des intervalles")
if df_ech_100.shape[0] == n_lignes_attendues:
    print("C'est bon, on a autant de ligne que souhaité!")

On s'attend à avoir une base de données avec 585100 lignes
Notre base de données a 585100 lignes
On supprime 7.858267716535433 % des intervalles
C'est bon, on a autant de ligne que souhaité!


In [49]:
# Case 1: a single DataFrame for X (and one for Y), covering all intervals.
colonnes_explicatives = [
    'Interpolate_var_mot_1', 'Interpolate_flight_leg_hours',
    'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
    'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
    'Interpolate_var_env_3', 'Interpolate_var_env_4',
    'Interpolate_var_env_5',
]

# Y holds the 'Interpolate_egt_slope' column; both frames also carry the flight and
# interval identifiers.
Y = df_ech_100[["id_vol", "Interpolate_egt_slope", "id_int"]]
X = df_ech_100[["id_vol", *colonnes_explicatives, "id_int"]]

# Write both tables to CSV (the DataFrame index is written as the first column).
# NOTE(review): absolute, machine-specific output paths.
Y.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/Y.csv')
X.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/X.csv')

In [50]:
# Same content as X and Y, but gathered in a single table: identifiers, the explanatory
# columns and 'Interpolate_egt_slope'.
colonnes_X_and_Y = [
    "id_vol",
    'Interpolate_var_mot_1', 'Interpolate_flight_leg_hours',
    'Interpolate_SV_rank', 'Interpolate_Config_B_rank', 'Interpolate_WW_rank',
    'Interpolate_var_env_1', 'Interpolate_var_env_2', 'Interpolate_var_env_3',
    'Interpolate_var_env_4', 'Interpolate_var_env_5',
    "Interpolate_egt_slope",
    "id_int",
]
X_and_Y = df_ech_100[colonnes_X_and_Y]
# NOTE(review): absolute, machine-specific output path.
X_and_Y.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/X_and_Y.csv')

In [None]:
# Case 2: one DataFrame per interval.
# Only run this cell if the per-interval layout is really needed: it prints one frame per
# interval (and exporting them would create one file per interval).
df_ech_100_cut = df_ech_100[['Interpolate_var_mot_1', 'Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', "id_int"]]

# Each group is taken from df_ech_100_cut itself. Selecting rows of another frame
# (e.g. df_ech) with a boolean mask built on df_ech_100_cut does not work, because the
# two frames do not share the same index.
# sort=False keeps the intervals in their order of first appearance.
for i, X_int in df_ech_100_cut.groupby("id_int", sort=False):
    print(X_int)
    # TODO: export X_int to its own CSV file, with the interval id `i` in the file name
    # so that successive intervals do not overwrite each other.

# **BROUILLON EN DESSOUS**

Pour garder des intervalles avec 100 données différentes, tout en utilisant quand même les 499 intervalles qui comportent moins de 100 vols, on a choisi de faire de la data augmentation pour ces intervalles.

In [45]:
# Display df_mauvais_100 (bare last expression, rendered by the notebook).
# Presumably the intervals with fewer than 100 flights, built in an earlier cell — TODO confirm.
df_mauvais_100

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
28647,2022-10-22 03:17:54,ESN_2,Engine_series_1,6675.176971,7660,0.019131,0.870992,1.234167,2.0,6.0,7.0,-0.501230,-0.335819,-1.340150,0.0,-0.152182,-0.016202,32,123
28648,2022-10-22 05:15:48,ESN_2,Engine_series_1,6675.705197,7661,-0.146212,0.696096,1.436667,2.0,6.0,7.0,-0.474284,-0.356649,-1.340150,0.0,-0.956433,-0.016202,32,123
28649,2022-10-22 07:55:48,ESN_2,Engine_series_1,6676.422043,7662,-0.000742,-0.097141,2.588333,2.0,6.0,7.0,-0.429894,-0.335819,-1.129705,0.0,-0.554308,-0.016202,32,123
28650,2022-10-22 11:23:26,ESN_2,Engine_series_1,6677.352300,7663,-0.013258,-0.353523,2.309722,2.0,6.0,7.0,-0.448197,-0.321669,-1.129705,0.0,0.099146,-0.016202,32,123
28651,2022-10-22 15:28:17,ESN_2,Engine_series_1,6678.449298,7664,-0.010209,0.523306,0.633889,2.0,6.0,7.0,-0.493353,-0.335819,-0.919261,0.0,-0.101917,-0.016202,32,123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968,34,137
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968,34,137
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968,34,137
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968,34,137


In [65]:
# Data augmentation draft: pad every interval of df_mauvais_100 up to 100 flights with
# slightly perturbed copies of its own flights. The result goes into df_chelou;
# df_mauvais_100 itself is left untouched so the cell can be re-run safely.
NB_VOLS_CIBLE = 100        # target number of flights per interval
P_PERTURBATION = 0.01      # maximum relative perturbation applied to a copied value
rng_augmentation = np.random.default_rng(0)  # fixed seed: same synthetic rows on every run

# Columns that receive noise. Every other column (ranks, slope, identifiers, dates)
# is copied unchanged from the source flight.
# NOTE(review): this column choice is a modelling assumption — confirm.
colonnes_a_perturber = ['Interpolate_egt_margin', 'Interpolate_var_mot_1',
       'Interpolate_flight_leg_hours', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4', 'Interpolate_var_env_5']

lignes_synthetiques = []
for i in df_mauvais_100['id_int'].unique():
    vols_intervalle = df_mauvais_100[df_mauvais_100.id_int == i]
    nb_valeur_a_ajouter = NB_VOLS_CIBLE - vols_intervalle["Nb_vols_entre_WW"].count()

    print("Le nombre de valeur à ajouter pour l'intervalle numéro",i, "est de", nb_valeur_a_ajouter)
    if nb_valeur_a_ajouter <= 0 or vols_intervalle.empty:
        continue

    # Draw the source flights with replacement; sorting keeps the copies in the same
    # relative order as the flights they come from.
    positions = np.sort(rng_augmentation.integers(0, len(vols_intervalle), size=nb_valeur_a_ajouter))
    copies = vols_intervalle.iloc[positions].copy()
    # One independent factor (1 + u), u ~ U(-p, p), per copied value.
    bruit = 1 + rng_augmentation.uniform(-P_PERTURBATION, P_PERTURBATION,
                                         size=(nb_valeur_a_ajouter, len(colonnes_a_perturber)))
    copies[colonnes_a_perturber] = copies[colonnes_a_perturber].to_numpy() * bruit
    lignes_synthetiques.append(copies)

# Single concat (original flights first, synthetic flights after); the index is reset
# because the synthetic rows would otherwise reuse the labels of their source rows.
df_chelou = pd.concat([df_mauvais_100, *lignes_synthetiques], ignore_index=True)
   

Le nombre de valeur à ajouter pour l'intervalle numéro 123 est de 68
Le nombre de valeur à ajouter pour l'intervalle numéro 170 est de 59
Le nombre de valeur à ajouter pour l'intervalle numéro 356 est de 19
Le nombre de valeur à ajouter pour l'intervalle numéro 468 est de 6
Le nombre de valeur à ajouter pour l'intervalle numéro 80 est de 83
Le nombre de valeur à ajouter pour l'intervalle numéro 447 est de 8
Le nombre de valeur à ajouter pour l'intervalle numéro 62 est de 86
Le nombre de valeur à ajouter pour l'intervalle numéro 398 est de 12
Le nombre de valeur à ajouter pour l'intervalle numéro 85 est de 83
Le nombre de valeur à ajouter pour l'intervalle numéro 389 est de 13
Le nombre de valeur à ajouter pour l'intervalle numéro 186 est de 56
Le nombre de valeur à ajouter pour l'intervalle numéro 48 est de 89
Le nombre de valeur à ajouter pour l'intervalle numéro 154 est de 64
Le nombre de valeur à ajouter pour l'intervalle numéro 196 est de 52
Le nombre de valeur à ajouter pour l'int

Le nombre de valeur à ajouter pour l'intervalle numéro 488 est de 3
Le nombre de valeur à ajouter pour l'intervalle numéro 381 est de 16
Le nombre de valeur à ajouter pour l'intervalle numéro 405 est de 11
Le nombre de valeur à ajouter pour l'intervalle numéro 308 est de 27
Le nombre de valeur à ajouter pour l'intervalle numéro 163 est de 61
Le nombre de valeur à ajouter pour l'intervalle numéro 435 est de 9
Le nombre de valeur à ajouter pour l'intervalle numéro 107 est de 74
Le nombre de valeur à ajouter pour l'intervalle numéro 439 est de 8
Le nombre de valeur à ajouter pour l'intervalle numéro 218 est de 47
Le nombre de valeur à ajouter pour l'intervalle numéro 395 est de 12
Le nombre de valeur à ajouter pour l'intervalle numéro 385 est de 14
Le nombre de valeur à ajouter pour l'intervalle numéro 264 est de 37
Le nombre de valeur à ajouter pour l'intervalle numéro 146 est de 65
Le nombre de valeur à ajouter pour l'intervalle numéro 256 est de 39
Le nombre de valeur à ajouter pour l'

Le nombre de valeur à ajouter pour l'intervalle numéro 33 est de 93
Le nombre de valeur à ajouter pour l'intervalle numéro 73 est de 85
Le nombre de valeur à ajouter pour l'intervalle numéro 34 est de 93
Le nombre de valeur à ajouter pour l'intervalle numéro 152 est de 64
Le nombre de valeur à ajouter pour l'intervalle numéro 37 est de 93
Le nombre de valeur à ajouter pour l'intervalle numéro 287 est de 31
Le nombre de valeur à ajouter pour l'intervalle numéro 288 est de 31
Le nombre de valeur à ajouter pour l'intervalle numéro 440 est de 8
Le nombre de valeur à ajouter pour l'intervalle numéro 301 est de 28
Le nombre de valeur à ajouter pour l'intervalle numéro 300 est de 28
Le nombre de valeur à ajouter pour l'intervalle numéro 161 est de 62
Le nombre de valeur à ajouter pour l'intervalle numéro 27 est de 94
Le nombre de valeur à ajouter pour l'intervalle numéro 241 est de 43
Le nombre de valeur à ajouter pour l'intervalle numéro 228 est de 44
Le nombre de valeur à ajouter pour l'int

In [68]:
# Display df_mauvais_100 again, after the augmentation draft cell above.
df_mauvais_100

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW,id_int
28647,2022-10-22 03:17:54,ESN_2,Engine_series_1,6675.176971,7660,0.019131,0.870992,1.234167,2.0,6.0,7.0,-0.501230,-0.335819,-1.340150,0.0,-0.152182,-0.016202,32,123
28648,2022-10-22 05:15:48,ESN_2,Engine_series_1,6675.705197,7661,-0.146212,0.696096,1.436667,2.0,6.0,7.0,-0.474284,-0.356649,-1.340150,0.0,-0.956433,-0.016202,32,123
28649,2022-10-22 07:55:48,ESN_2,Engine_series_1,6676.422043,7662,-0.000742,-0.097141,2.588333,2.0,6.0,7.0,-0.429894,-0.335819,-1.129705,0.0,-0.554308,-0.016202,32,123
28650,2022-10-22 11:23:26,ESN_2,Engine_series_1,6677.352300,7663,-0.013258,-0.353523,2.309722,2.0,6.0,7.0,-0.448197,-0.321669,-1.129705,0.0,0.099146,-0.016202,32,123
28651,2022-10-22 15:28:17,ESN_2,Engine_series_1,6678.449298,7664,-0.010209,0.523306,0.633889,2.0,6.0,7.0,-0.493353,-0.335819,-0.919261,0.0,-0.101917,-0.016202,32,123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968,34,137
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968,34,137
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968,34,137
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968,34,137


In [None]:
def data_augmentation(df, p=0.01, nb_vols=100, colonnes=None, seed=None):
    """
    Pad every interval of `df` up to `nb_vols` flights with noisy copies of its own flights.

    Rows are grouped by `id_int`. For each group holding fewer than `nb_vols` rows, the
    missing rows are drawn with replacement among the rows of that group, and every value
    of the `colonnes` columns is multiplied by (1 + u), with u drawn uniformly in [-p, p].
    All other columns are copied unchanged. Groups that already hold `nb_vols` rows or
    more are left as they are.

    The value of p matters a lot: it is the relative size of the modification.

    NOTE(review): a later cell of this notebook defines another function with the same
    name (the original version written for trading signals); when the cells are run in
    order, that later definition replaces this one.

    :param df: DataFrame with an `id_int` column. It is not modified.
    :param p: maximum relative perturbation applied to a copied value.
    :param nb_vols: target number of rows per interval.
    :param colonnes: names of the columns to perturb. Defaults to the continuous
        `Interpolate_*` measurements listed below that are present in `df`; rank columns
        and the slope are left unperturbed (modelling assumption — confirm).
    :param seed: seed forwarded to `numpy.random.default_rng`; None gives a fresh stream.

    :return: DataFrame like `df` but with more rows: the rows of `df` first, then the
        synthetic rows. The index is reset.
    """
    if colonnes is None:
        colonnes_par_defaut = ('Interpolate_egt_margin', 'Interpolate_var_mot_1',
                               'Interpolate_flight_leg_hours', 'Interpolate_var_env_1',
                               'Interpolate_var_env_2', 'Interpolate_var_env_3',
                               'Interpolate_var_env_4', 'Interpolate_var_env_5')
        colonnes = [c for c in colonnes_par_defaut if c in df.columns]
    else:
        colonnes = list(colonnes)

    rng = np.random.default_rng(seed)
    lignes_synthetiques = []
    for _, vols in df.groupby('id_int', sort=False):
        manquants = nb_vols - len(vols)
        if manquants <= 0:
            continue
        # Sorting the drawn positions keeps the copies in the same relative order as
        # the flights they come from.
        positions = np.sort(rng.integers(0, len(vols), size=manquants))
        copies = vols.iloc[positions].copy()
        if colonnes:
            bruit = 1 + rng.uniform(-p, p, size=(manquants, len(colonnes)))
            copies[colonnes] = copies[colonnes].to_numpy() * bruit
        lignes_synthetiques.append(copies)

    # Single concat instead of growing the frame row by row.
    return pd.concat([df, *lignes_synthetiques], ignore_index=True)

In [None]:
# TODO: per-interval processing still to be written; `pass` keeps the cell runnable
# (a `for` statement without a body cannot be parsed).
for j in df_mauvais_100['id_int'].unique():
    pass

In [None]:
# Original
def data_augmentation(df,n=3,p=0.01):
  """
  Oversample the buy/sell rows of a trading-signal DataFrame with noisy copies.

  The value of p matters a lot: it is the relative size of the modification
  (accuracy = 70% with p=10%, 90% with p=1%).
  The goal is to multiply the number of buy and sell rows by about 4: every row of df
  whose signal is sell or buy yields n extra rows (3 by default), each a small
  perturbation of the initial row. The function is meant to be applied to data whose
  "Signal" column holds 0 for sell, 1 for hold and 2 for buy.

  The seven indicators are read by position (columns 0 to 6 of df) and written under the
  names MACD, RSI, STO_K, D, 20d-50d, momentum and capm; "Signal" is copied unchanged.
  The n copies generated from one source row all share the same index label.

  NOTE(review): `uniform` and `end` are not defined in this function; they must exist at
  notebook level (presumably `random.uniform` and a timestamp) — confirm before running.
  NOTE(review): an earlier cell defines a draft function with the same name; running this
  cell rebinds `data_augmentation` to this version.

  :param df: DataFrame with the indicator columns and a "Signal" column. Not modified.
  :param n: number of perturbed copies generated per buy/sell row.
  :param p: relative size of the perturbation.

  :return: DataFrame     like df but with more rows
  """
  d = pd.Timedelta('1 day')
  nouvelles_lignes = []
  i=0
  for index, row in df.iterrows():
      if row["Signal"]==0 or row["Signal"]==2:
          for j in range(n):
              # .iloc makes the positional access explicit: plain row[k] on a
              # label-indexed Series is deprecated in recent pandas versions.
              nouvelles_lignes.append(pd.DataFrame({"MACD" : row.iloc[0]*(1+uniform(-p,p)),
                                        "RSI" : row.iloc[1]*(1+uniform(-p,p)),
                                        "STO_K" : row.iloc[2]*(1+uniform(-p,p)),
                                        "D" : row.iloc[3]*(1+uniform(-p,p)),
                                        "20d-50d" :row.iloc[4]*(1+uniform(-p,p)),
                                        "momentum" : row.iloc[5]*(1+uniform(-p,p)),
                                        "capm" : row.iloc[6]*(1+uniform(-p,p)),
                                        "Signal" : row["Signal"]
                                        },index = [end + i*d]))
          i+=n
  if not nouvelles_lignes:
      return df.copy()
  # Single concat instead of re-copying the growing frame at every new row.
  return pd.concat([df, *nouvelles_lignes])

In [52]:
# Number of non-null values of each column, per interval (`id_int`).
df_mauvais_100.groupby("id_int").count()

Unnamed: 0_level_0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_WW
id_int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
4,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99
495,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99
496,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99
497,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99
