In [1]:
import numpy as np
import pandas as pd
import datetime

# Data Loading

In [2]:
# Read the six input tables from the working directory, in a fixed order:
# per-client daily records (CDC_*), Linky/EDF 2018 info tables and Pmax tables,
# each in an "RP" and an "RS" variant (labelled 0 and 1 respectively later on).
(
    CDC_RP,
    CDC_RS,
    infos_linky_edf_2018_RP,
    infos_linky_edf_2018_RS,
    Pmax_RP,
    Pmax_RS,
) = map(
    pd.read_csv,
    [
        'CDC_RP.csv',
        'CDC_RS.csv',
        'infos_linky_edf_2018_RP.csv',
        'infos_linky_edf_2018_RS.csv',
        'Pmax_RP.csv',
        'Pmax_RS.csv',
    ],
)

# Data Processing

In [3]:
# Number of daily readings a client must have to be kept (one full year).
N_DAYS = 365


def _select_full_year(cdc, n_days=N_DAYS):
    """Parse dates, keep clients with exactly `n_days` readings, sort them.

    Parameters
    ----------
    cdc : pandas.DataFrame
        Daily records with at least the columns 'pom' (client id),
        'date_msr' (day-first date string) and 'conso_jour'.
    n_days : int
        Required number of non-null 'date_msr' values per client.

    Returns
    -------
    stats : pandas.DataFrame
        Per-client non-null counts of every column (indexed by 'pom').
    keep : numpy.ndarray
        Client ids whose 'date_msr' count equals `n_days`.
    curves : pandas.DataFrame
        Rows of the kept clients, ordered by client and then by date.
    """
    # The date is parsed from *this* frame's own 'date_msr' column; the
    # previous code parsed the RS dates from the RP frame by mistake.
    cdc = cdc.assign(date=pd.to_datetime(cdc['date_msr'], dayfirst=True))
    stats = cdc.groupby('pom').count()
    keep = stats.loc[stats.date_msr == n_days].index.values
    # A plain two-key sort gives the same client-then-date ordering as
    # groupby(...).apply(sort_values) but keeps 'pom' as an ordinary column,
    # so the cell can be re-run and does not depend on how apply() treats
    # grouping columns.
    curves = cdc.loc[cdc.pom.isin(keep)].sort_values(['pom', 'date'])
    return stats, keep, curves


stats_primary, keep_primary, CDC_RP = _select_full_year(CDC_RP)
stats_secondary, keep_secondary, CDC_RS = _select_full_year(CDC_RS)

# Split by client: one row of N_DAYS consecutive values per client.
# reshape() raises if the row count is not an exact multiple of N_DAYS,
# instead of silently producing uneven chunks, and handles an empty frame.
x_RP = CDC_RP.conso_jour.to_numpy().reshape(-1, N_DAYS)
x_RS = CDC_RS.conso_jour.to_numpy().reshape(-1, N_DAYS)

# Concatenate and save: RP rows first (label 0), then RS rows (label 1).
x = np.concatenate((x_RP, x_RS))
y = np.concatenate((np.zeros(len(x_RP)), np.ones(len(x_RS)))).astype('int')
# unique() keeps order of appearance, which after the sort matches the rows of x.
clients = np.concatenate((CDC_RP.pom.unique(), CDC_RS.pom.unique()))

np.save('conso.npy', x)
np.save('labels.npy', y)
np.save('clients.npy', clients)

In [7]:
def extract_features(X, timestamp):
    """Compute 18 summary statistics for every row of ``X``.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array; each row is one series and each column one time step.
    timestamp : sequence of datetime-like
        One entry per column of ``X``; converted with ``pd.DatetimeIndex``.

    Returns
    -------
    feats : numpy.ndarray
        Float array of shape ``(X.shape[0], 18)``.
    feature_list : list of str
        Column names of ``feats``: overall mean/std/max/min, the mean over
        columns whose weekday is >= 5 ('mean_we') and < 5 ('mean_wd'), then
        'mean_month_0' .. 'mean_month_11' holding the means over calendar
        months 1 .. 12 (the name index is the month number minus one).
    """
    day_index = pd.DatetimeIndex(timestamp)
    weekday = day_index.weekday
    month = day_index.month

    base_names = ['mean', 'std', 'max', 'min', 'mean_we', 'mean_wd']
    month_names = ['mean_month_%i'%(k) for k in range(12)]
    feature_list = base_names + month_names

    # Build the columns in the same order as feature_list.
    columns = [
        X.mean(axis=1),
        X.std(axis=1),
        X.max(axis=1),
        X.min(axis=1),
        np.mean(X[:, weekday >= 5], axis=1),
        np.mean(X[:, weekday < 5], axis=1),
    ]
    for month_number in range(1, 13):
        columns.append(np.mean(X[:, month == month_number], axis=1))

    # Pre-allocated float output so every feature is stored as float64.
    feats = np.zeros((X.shape[0], len(feature_list)))
    for position, values in enumerate(columns):
        feats[:, position] = values

    return feats, feature_list

In [8]:
# Calendar of 365 consecutive days starting on 2021-01-01, one per column of `x`.
# NOTE(review): these dates are hard-coded rather than taken from the parsed
# 'date' column; the weekday and month features depend on them, so confirm
# that 2021 is the year the measurements actually cover.
timestamp = [
    datetime.datetime(2021, 1, 1) + offset * datetime.timedelta(days=1)
    for offset in range(365)
]
feats, feature_list = extract_features(X=x, timestamp=timestamp)

In [9]:
# Persist the feature matrix and its column names next to the notebook.
np.save(file='feats.npy', arr=feats)
np.save(file='feature_list.npy', arr=feature_list)