# Regroupement des données par un pas de temps

In [1]:
import pandas as pd
import numpy as np

#### Chargement des données et encodage one-hot de CBN

In [2]:
# Load the pre-processed records; the code below relies on a 'date' column
# and a 'CBN' column being present in the file.
data = pd.read_csv("data/data_processed.csv")

# Parse all timestamps in a single vectorized call rather than constructing
# one pd.Timestamp per row through a Python-level lambda.
data['date'] = pd.to_datetime(data['date'])

# Make CBN categorical before encoding it.
data['CBN'] = pd.Categorical(data['CBN'])

# Build the one-hot encoding of CBN (columns are named 'CBN_<category>').
dfDummies_cbn = pd.get_dummies(data['CBN'], prefix='CBN')

# Append the one-hot columns to the data, then remove the original CBN column.
data = pd.concat([data, dfDummies_cbn], axis=1).drop(columns=['CBN'])

# Order the rows by "date" and renumber them from 0. drop=True discards the
# old index directly instead of materialising it as an 'index' column that
# must be deleted afterwards (which would also delete a genuine data column
# if the file ever contained one named 'index').
data = data.sort_values(by='date').reset_index(drop=True)

In [3]:
# Preview the first rows of the date-sorted, one-hot encoded frame.
data.head()

Unnamed: 0,date,CBN_0 02,CBN_0 03,CBN_0 04,CBN_0 05,CBN_0 06,CBN_0 07,CBN_0 08,CBN_0 09,CBN_0 10,...,CBN_X 06,CBN_X 07,CBN_X 08,CBN_X 09,CBN_X 10,CBN_X 11,CBN_X 12,CBN_Y,CBN_Z,CBN_j 04
0,2018-07-21 04:52:46,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2018-07-21 04:52:49,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2018-07-21 04:52:57,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018-07-21 04:53:01,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2018-07-21 04:53:04,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Groupement des données

In [11]:
# Parameter: width of the time window used to aggregate the input rows.
step_group = "90min"

# Bucket the rows into consecutive windows of `step_group` on the "date"
# column, sum the remaining columns inside each window, and turn the window
# start back into a regular "date" column.
grouped_data = (
    data
    .groupby(pd.Grouper(key="date", freq=step_group))
    .sum()
    .reset_index()
)

In [12]:
# Preview the first aggregated time windows.
grouped_data.head()

Unnamed: 0,date,CBN_0 02,CBN_0 03,CBN_0 04,CBN_0 05,CBN_0 06,CBN_0 07,CBN_0 08,CBN_0 09,CBN_0 10,...,CBN_X 06,CBN_X 07,CBN_X 08,CBN_X 09,CBN_X 10,CBN_X 11,CBN_X 12,CBN_Y,CBN_Z,CBN_j 04
0,2018-07-21 04:30:00,0.0,0.0,0.0,31.0,0.0,16.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-07-21 06:00:00,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-07-21 07:30:00,0.0,1.0,0.0,31.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018-07-21 09:00:00,0.0,0.0,0.0,84.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-07-21 10:30:00,3.0,3.0,15.0,17.0,16.0,12.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,92.0,82.0,0.0


#### Ajout des variables jour de la semaine, mois et heure

In [13]:
# Add calendar features (month, weekday, hour) derived from each window's
# "date" value, one-hot encoded.
#
# The vectorized .dt accessors replace the per-row Python lambdas. Each
# feature is cast to a categorical so that get_dummies emits one column per
# observed value, named '<prefix>_<value>' (weekday uses Monday=0).
# Because the features are fed straight into get_dummies, no temporary
# 'month' / 'weekday' / 'hour' columns have to be added to the frame and
# dropped again afterwards.
dfDummies_month = pd.get_dummies(
    grouped_data['date'].dt.month.astype('category'), prefix='month'
)
dfDummies_weekday = pd.get_dummies(
    grouped_data['date'].dt.weekday.astype('category'), prefix='weekday'
)
dfDummies_hour = pd.get_dummies(
    grouped_data['date'].dt.hour.astype('category'), prefix='hour'
)

# Append the one-hot columns to the grouped data (weekday, then month, then hour).
grouped_data = pd.concat(
    [grouped_data, dfDummies_weekday, dfDummies_month, dfDummies_hour],
    axis=1,
)

In [14]:
# Write the grouped frame to disk without the row index. The output path
# embeds step_group and carries no file extension.
grouped_data.to_csv("data/data_processed_{}".format(step_group), index=False)

### Test

In [15]:
# Read the exported file back and display its first rows as a visual sanity
# check (nothing is asserted here).
test_df = pd.read_csv("data/data_processed_{}".format(step_group))

test_df.head()

Unnamed: 0,date,CBN_0 02,CBN_0 03,CBN_0 04,CBN_0 05,CBN_0 06,CBN_0 07,CBN_0 08,CBN_0 09,CBN_0 10,...,hour_9,hour_10,hour_12,hour_13,hour_15,hour_16,hour_18,hour_19,hour_21,hour_22
0,2018-07-21 04:30:00,0.0,0.0,0.0,31.0,0.0,16.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2018-07-21 06:00:00,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2018-07-21 07:30:00,0.0,1.0,0.0,31.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2018-07-21 09:00:00,0.0,0.0,0.0,84.0,0.0,4.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,2018-07-21 10:30:00,3.0,3.0,15.0,17.0,16.0,12.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
