In [None]:
import os
import warnings                                  # `do not disturbe` mode
warnings.filterwarnings('ignore')

import numpy as np                               # vectors and matrices
import pandas as pd                              # tables and data manipulations
import matplotlib.pyplot as plt                  # plots
                         # more plots

from dateutil.relativedelta import relativedelta # working with dates with style
from scipy.optimize import minimize              # for function minimization

import statsmodels.formula.api as smf            # statistics and econometrics
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
from statsmodels.tsa.seasonal import seasonal_decompose

from itertools import product                    # some useful functions
from tqdm import tqdm_notebook

import seaborn as sns
sns.set_style(
    style='whitegrid', 
    rc={'axes.facecolor': '.95', 'grid.color': '.95'}
)

import tensorflow as tf
tf.random.set_seed(2)

from tensorflow import keras

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, Flatten, GRU

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

%matplotlib inline
# Default size and resolution applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100

# Fixed seeds for reproducibility.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
# here presumably only affects child processes — confirm this is the intent.
os.environ['PYTHONHASHSEED'] = str(2)
np.random.seed(2)

In [None]:
# Load the raw measurements; the path is relative to the notebook's working directory.
df = pd.read_csv('output.csv')
# Preview the first rows.
df.head()

In [None]:
# Preview the last rows of the raw frame.
df.tail()

In [None]:
# Find missing data: timestamps of the regular 5-minute grid that never appear in `time`.
expected_stamps = pd.date_range('2015-01-06 15:00:00', '2015-12-30 00:25:00', freq='5Min')
observed_stamps = pd.to_datetime(df["time"])
expected_stamps.difference(observed_stamps)

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Parse the timestamp strings and promote them to the frame's index.
df = df.assign(time=pd.to_datetime(df["time"])).set_index("time")
df.head()

In [None]:
# Remove the sensor columns that are not used in the rest of the notebook.
unused_columns = [
    'Toffice_reference',
    'humidity',
    'detected_motions',
    'occupancy',
    'office_CO2_concentratio',
    'door',
]
df = df.drop(columns=unused_columns)

In [None]:
# Preview the frame after the column drop.
df.head()

In [None]:
# Clamp negative target readings to zero.
# Only the `label` column is written: assigning through a bare boolean row mask
# (`df[mask] = 0`) would overwrite every column of the matching rows.
df.loc[df['label'] < 0, 'label'] = 0

In [None]:
# Show any rows whose label is still negative (presumably expected to be empty here).
df[df['label']<0]

In [None]:
# Keep two granularities of the series: the raw frame and an hourly aggregate.
# NOTE: `data_per_5min` is the very same object as `df` (no copy is made).
data_per_5min = df
# Bucket by hour and sum the values inside each bucket.
data_per_Hour = df.resample('H').sum()
# Compare the shapes of both frames.
data_per_5min.shape, data_per_Hour.shape

In [None]:
# One 50-bin histogram per column of the raw frame.
df.hist(bins=50)
plt.tight_layout()

In [None]:
# Box plot of the target column to expose its spread and outliers.
fig, ax = plt.subplots(figsize=(20, 15))
sns.boxplot(data=df, y='label', ax=ax)



In [None]:
# Compare the raw and the hourly series on one sample day, one panel per granularity.
# The two axes are created up front so no unused axes is left behind the panels.
fig, axes = plt.subplots(2, 1, figsize=(32, 20))

# Top panel: raw readings.
axes[0].plot(data_per_5min.loc['2015-02-01']['label'])
axes[0].set_title('per 5 min for random days')

# Bottom panel: hourly totals. Rows are selected with `.loc`; selecting rows by date
# string through plain `[]` is not supported by recent pandas releases.
axes[1].plot(data_per_Hour.loc['2015-02-01']['label'])
axes[1].set_title('per hour for a random days')

# The layout must be adjusted before the figure is rendered to have any effect.
fig.tight_layout()
plt.show()

In [None]:
# fig = plt.figure(figsize=(20, 15))
# ax1 = fig.add_subplot(211)
# sns.boxplot(data=data, x='month', y='label', ax=ax1)
# ax2 = fig.add_subplot(212)
# sns.boxplot(data=data, x='weekday', y='label', ax=ax2)
# # ax3 = fig.add_subplot(313)
# # sns.boxplot(data=df, x='holiday', y='label', ax=ax3)
# plt.show()

In [None]:
# Continue with the hourly frame (taken as a full-range slice of `data_per_Hour`).
data = data_per_Hour[:]

In [None]:
# Autocorrelation and partial autocorrelation plots of the hourly target.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(data['label'])
plot_pacf(data['label'])

In [None]:
# Preview the hourly frame.
data.head()

# Add simple calendar features

In [None]:
import holidays

def add_time_features(df):
    """Attach calendar columns derived from the frame's datetime index.

    Adds ``month``, ``weekday`` and ``hour`` (in that order) by reading the
    attributes of the same name from ``df.index``. A ``year`` column is
    deliberately not added.

    The frame is modified in place and the same object is returned.
    """
    stamps = df.index
    for part in ("month", "weekday", "hour"):
        df[part] = getattr(stamps, part)
    return df

def add_holiday_features(df):
    """Flag rows whose index timestamp is contained in ``holidays.France()``.

    Adds an integer ``holiday`` column (1 when the timestamp is found in the
    calendar, 0 otherwise). The frame is modified in place and returned.
    """
    fr_calendar = holidays.France()

    def _is_holiday(stamp):
        return stamp in fr_calendar

    stamps = pd.Series(df.index, index=df.index)
    df["holiday"] = stamps.apply(_is_holiday)
    df["holiday"] = df["holiday"].astype(int)
    return df


def add_all_features(df, target_col="conso_global"):
    """Return a copy of ``df`` enriched with the calendar features.

    Only ``add_time_features`` is applied; the holiday features are currently
    switched off. ``target_col`` is accepted but not used by this function.
    The caller's frame is left untouched because the work is done on a copy.
    """
    enriched = add_time_features(df.copy())
    return enriched

In [None]:
# Rebind `data` to a copy of the hourly frame carrying the calendar columns.
data = add_all_features(data)

In [None]:
# Preview the frame with the new calendar columns.
data.head()

## Importing the calendar

In [None]:
from icalendar import Calendar, Event
from datetime import datetime

In [None]:
# evenement = []
# debut = []
# fin =[]


# g = open('stephane_stephane.ploix@gmail.com.ics','rb')
# gcal = Calendar.from_ical(g.read().decode())
# for component in gcal.walk():
#     if component.name == "VEVENT":

#         evenement.append(str((component.get('summary'))))
#         if len(str(component.get('dtstart').dt)) >12:
#             debut.append(datetime.strptime(str(component.get('dtstart').dt)[:-6],'%Y-%m-%d %H:%M:%S'))
#         else:
#             debut.append(datetime.strptime(str(component.get('dtstart').dt), '%Y-%m-%d'))
#         if component.get('dtend') is not None:
#             fin.append(component.get('dtend').dt)
#         else:
#             fin.append("Nan")

# g.close()

# calendrier = pd.DataFrame({'evenement': evenement,'debut':debut,'fin':fin})
# calendrier['debut'] =pd.to_datetime(calendrier.debut)
# calendrier.sort_values(['debut'], inplace=True)
# calendrier = calendrier.set_index(calendrier['debut'])
# calendrier = calendrier['2015-01-04':'2015-12-31']


# plt.figure(figsize=(20,9))
# calendrier.evenement.value_counts()[0:100].plot.bar()
# plt.show()

# label = []
# for k in calendrier.index:
#     if "point" in calendrier['evenement'].loc[str(k)]:
#         label.append(2)
#     else:
#         label.append(1)

# calendrier['label']=label
# print(calendrier.head())



# cal = []
# nom = []
# for k in data['label']:
#     cal.append(0)
#     nom.append("None")

# data['calendrier'] = cal
# data['nom']=nom

# calendrier.drop_duplicates(subset ="debut",
#                      keep = False, inplace = True)
# print(calendrier[calendrier.index.duplicated()])
# print("fin test")

# for k in calendrier.index:
#     if k in data.index:
#         data['calendrier'].loc[str(k)] = calendrier['label'].loc[str(k)]
#         data['nom'].loc[str(k)] = calendrier['evenement'].loc[str(k)]

# # plt.figure(figsize=(20,9))
# # data['label'].plot()
# # data['calendrier'].plot()
# # plt.show()

In [None]:
# Preview the feature frame again (the calendar import above is commented out).
data.head()

In [None]:
# data.drop(['nom'], axis =1, inplace=True)

In [None]:
# fig = plt.figure(figsize=(20, 15))
# ax1 = fig.add_subplot(211)
# sns.boxplot(data=data, x='calendrier', y='label', ax=ax1)
# plt.show()

In [None]:
# Scratch check: np.arange(1) holds the single value 0.
import numpy as np 
np.arange(1)

# Window: 1 h

In [None]:
# LEADS
# Calendar values of the upcoming step(s): each base column is shifted backwards in time,
# so row t carries the value of row t + (lead + 1).
leads = np.arange(1)

hour_leads, weekday_leads, month_leads = (
    [f"{base}_lead_{lead+1}" for lead in leads]
    for base in ("hour", "weekday", "month")
)
# cal_leads = [f"cal_lead_{lead+1}" for lead in leads ]

lead_plan = (
    ("hour", hour_leads),
    ("weekday", weekday_leads),
    ("month", month_leads),
    # ("calendrier", cal_leads),
)
for base, lead_names in lead_plan:
    for lead, lead_name in zip(leads, lead_names):
        data[lead_name] = data[base].shift(-(lead + 1))
    


In [None]:
#LAGS
# Past values of the target and of the calendar columns, for lags 1 to 24 rows.
lags = np.arange(1, 25)

lag_cols = [f"label_lag_{lag}" for lag in lags]
hour_lags = [f"hour_lag_{lag}" for lag in lags]
weekday_lags = [f"weekday_lag_{lag}" for lag in lags]
month_lags = [f"month_lag_{lag}" for lag in lags]
# cal_lags = [f"cal_lag_{lag}" for lag in lags ]

lag_plan = (
    ("label", lag_cols),
    ("hour", hour_lags),
    ("weekday", weekday_lags),
    ("month", month_lags),
    # ("calendrier", cal_lags),
)
for base, lag_names in lag_plan:
    for lag, lag_name in zip(lags, lag_names):
        data[lag_name] = data[base].shift(lag)
    


        

In [None]:
#Rolling
# Rolling mean and standard deviation of every lagged target column, for each window size.
wins = [3, 6, 12, 24, 36, 48]
for win, (lag, lag_col) in product(wins, zip(lags, lag_cols)):
    rolled = data[lag_col].rolling(win)
    data[f"rmean_{lag}_{win}"] = rolled.mean()
    data[f"rstd_{lag}_{win}"] = rolled.std()
    # rolling max / min variants are intentionally disabled

In [None]:
# Discard the two most recent target lags.
data = data.drop(columns=["label_lag_1", "label_lag_2"])

In [None]:
# The current-step target is called `label_lead_0` from here on.
data = data.rename(columns={"label": "label_lead_0"})

In [None]:
# Inspect the last rows before rows with missing values are removed.
data.tail()

In [None]:
# Keep only rows where every column has a value.
data = data.dropna()

In [None]:
# Inspect the last rows after the incomplete rows were removed.
data.tail()

In [None]:
#Scaling
# Fit one MinMaxScaler on the target and apply the same transform to every column
# derived from it (remaining target lags, rolling means, rolling standard deviations).
# Plain NumPy arrays are passed to the scaler so the differing column names do not
# trip scikit-learn's feature-name consistency check.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the train/test
# split further down, so test-period values influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
data['label_lead_0'] = scaler.fit_transform(data[['label_lead_0']].to_numpy()).ravel()

# label_lag_1 and label_lag_2 were dropped earlier, so only lags 3..24 remain.
lag_cols = [f"label_lag_{lag}" for lag in lags[2:]]

# Rolling statistics exist for every lag in `lags` (including 1 and 2), so they are
# enumerated from `lags` itself; zipping with the shortened `lag_cols` would silently
# skip the last two lags.
scaled_cols = list(lag_cols)
for win in wins:
    for lag in lags:
        scaled_cols.append(f"rmean_{lag}_{win}")
        scaled_cols.append(f"rstd_{lag}_{win}")

for col in scaled_cols:
    data[col] = scaler.transform(data[[col]].to_numpy()).ravel()

In [None]:
# Display the scaled feature frame.
data

In [None]:
# Scratch arithmetic: 75 % of 8506 (presumably the row count of `data` — confirm).
8506  *0.75

In [None]:
# Chronological split: the first 75 % of the rows train the model, the rest is the test set.
# The boundary is derived from the frame length instead of a hard-coded row number
# (for 8506 rows this yields 6380).
TRAIN_FRACTION = 0.75
split_idx = int(np.ceil(len(data) * TRAIN_FRACTION))

train_data = data.iloc[:split_idx]
test_data = data.iloc[split_idx:]

train_data.shape, test_data.shape

In [None]:
# Target column(s); everything else in the frame is used as a predictor.
lead_cols = ['label_lead_0']

In [None]:
# Separate predictors (all non-target columns) from targets for both partitions.
x_train_data, y_train_data = train_data.drop(columns=lead_cols), train_data[lead_cols]
x_test_data, y_test_data = test_data.drop(columns=lead_cols), test_data[lead_cols]

In [None]:
# Display the training targets.
y_train_data

In [None]:
import lightgbm as lgbm
from sklearn import metrics
from sklearn import model_selection

In [None]:
# For every target column, keep the predictors whose correlation with the target on the
# training partition is above `threshold`. A threshold of -2 lies below any possible
# correlation, so only predictors with an undefined (NaN) correlation are filtered out.
threshold = -2
features = []
train_predictors = train_data.drop(lead_cols, axis=1)
for col in y_train_data.columns:
    corr_with_target = train_predictors.corrwith(train_data[col])
    kept = corr_with_target[corr_with_target > threshold]
    features.append(kept.index.tolist())

In [None]:
# Hyper-parameters handed to the LightGBM regressor in the training cell.
param = dict(
    objective='regression',
    is_unbalance=True,
    learning_rate=0.024398663784197132,
    max_depth=5,
    num_leaves=213,
    min_child_samples=41,
)


In [None]:
from tqdm import tqdm

# Train one LightGBM regressor per target column with K-fold cross-validation.
# For each fold the model is refitted with early stopping on the held-out part, and its
# test-set predictions are accumulated; the stored prediction is the mean over all folds.
#
# Outputs:
#   result  - DataFrame with one `pred_<i>` column per target (fold-averaged test predictions,
#             on the same scale as the target column the model was trained on)
#   models  - dict mapping target column name to the model fitted on the LAST fold
#
# NOTE(review): the folds are shuffled although the rows are a time series with lagged
# features, so validation rows can sit between training rows — confirm this is intended.
result = pd.DataFrame()
models = dict()
n_splits = 6

for counter, col in enumerate(tqdm(y_train_data.columns)):

    pred_col = f'pred_{counter}'
    used_features = features[counter]

    x_train_local = x_train_data.loc[:, used_features]
    y_train_local = y_train_data.loc[:, col]
    x_test_local = x_test_data.loc[:, used_features]

    model = lgbm.LGBMRegressor(
        learning_rate=param["learning_rate"],
        max_depth=param["max_depth"],
        num_leaves=param["num_leaves"],
        objective=param["objective"],
        is_unbalance=param["is_unbalance"],
        min_child_samples=param['min_child_samples'],
    )

    cv = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=42)

    val_scores = [0] * n_splits
    test_pred_sum = np.zeros(y_test_data.shape[0])

    for i, (fit_idx, val_idx) in enumerate(cv.split(x_train_local, y_train_local)):

        X_fit = x_train_local.iloc[fit_idx]
        y_fit = y_train_local.iloc[fit_idx]
        X_val = x_train_local.iloc[val_idx]
        y_val = y_train_local.iloc[val_idx]

        # NOTE(review): `early_stopping_rounds` and `verbose` are fit() keyword arguments of
        # older LightGBM releases; newer releases expect callbacks instead — check the
        # installed version before upgrading.
        model.fit(
            X_fit,
            y_fit,
            eval_set=[(X_fit, y_fit), (X_val, y_val)],
            eval_names=('fit', 'val'),
            eval_metric='l2',
            early_stopping_rounds=200,
            feature_name=X_fit.columns.tolist(),
            verbose=False
        )

        # Square root of the best validation L2 (MSE), i.e. the RMSE on the model's target scale.
        val_scores[i] = np.sqrt(model.best_score_['val']['l2'])
        test_pred_sum += model.predict(x_test_local, num_iteration=model.best_iteration_)

        print('Fold {} RMSE: {:.5f}'.format(i+1, val_scores[i]))

    # Average over the folds. No inverse log transform is applied: the target was never
    # log1p-transformed, so expm1 would distort the predictions.
    result[pred_col] = test_pred_sum / n_splits

    val_mean = np.mean(val_scores)
    val_std = np.std(val_scores)

    print('Local RMSE: {:.5f} (±{:.5f})'.format(val_mean, val_std))
    models[col] = model

In [None]:
# Display the test-set predictions.
result

In [None]:
# Collapse targets and predictions into flat 1-D arrays for plotting and scoring.
final_test = y_test_data.to_numpy().flatten()
final_resut = result.to_numpy().flatten()

In [None]:
# Overlay targets and predictions for a fixed slice of the test period.
fig, ax = plt.subplots(figsize=(18, 5))

shown = slice(330, 500)
ax.plot(final_test[shown], marker='.', label='Target')
ax.plot(final_resut[shown], marker='.', label='Predictions')
ax.set_title('Elec consumption')
ax.set_xlabel('Time(h)')
ax.set_ylabel('Elec consump (khw)')
ax.legend()
plt.show()

In [None]:
# Score the predictions against the test targets as stored in `final_test`.
# The model evaluated here is the LightGBM regressor, and the second figure is the
# coefficient of determination (R2), not an accuracy.
rmse = np.sqrt(mean_squared_error(final_test, final_resut))
r2 = r2_score(final_test, final_resut)
print("The RMSE for the LightGBM model is: %f" % rmse)
print("The R2 score of the LightGBM model is: %f" % r2)