## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import os
from statsmodels.tsa.seasonal import seasonal_decompose
# from sklearn import datasets, svm, preprocessing
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error, classification_report
# from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

## Data loading

In [None]:
# Load the pre-processed dataset; "DateTime" is parsed as timestamps and used as the index.
dataset_path = os.getcwd() + "/pre_processing/dataset_traite.csv"
df = pd.read_csv(
    dataset_path,
    sep=',',
    parse_dates=["DateTime"],
    index_col=['DateTime'],
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics of every column.
df.describe()

In [None]:
# Pairwise correlation between the columns.
df.corr()

## Data viz

In [None]:
# Keep only the global active power readings whose timestamp falls in 2007.
recorded_in_2007 = df.index.year == 2007
active_power_df = df.loc[recorded_in_2007, ['Global_active_power']]
active_power_df

#### Average global active power (in kilowatt) per week in 2007

In [None]:
# Weekly mean of the 2007 global active power readings.
weekly_avg = active_power_df.resample("W").mean()
# The section heading gives the unit as kilowatt, so the plot title uses the same unit.
weekly_avg.plot(figsize=(12,6), title="Average global active power (in kilowatt) per week in 2007")

#### Moving average of the daily average global active power (in kilowatt) in 2007

In [None]:
# Daily mean of the 2007 readings, smoothed with a simple moving average.
daily_avg = active_power_df.resample("D").mean()

sma_length = 15  # rolling window size, in rows (one row per day after the resample)
daily_power = daily_avg["Global_active_power"]
sma = daily_power.rolling(window=sma_length).mean()
sma.plot(figsize=(12,6))
# sma

In [None]:
# Monthly means of every column; missing values are replaced by the column mean
# before decomposing the global active power series.
monthly_means = df.copy().resample('M').mean()
data2 = monthly_means.fillna(monthly_means.mean())
seasonal_decompose(data2["Global_active_power"]).plot()

In [None]:
# One stacked subplot per selected column, showing its monthly mean.
data = df.copy()
cols = [0, 1, 3, 4, 5, 6]
# Resample once up front: the monthly means are identical for every subplot.
monthly_values = data.resample('M').mean().values
plt.figure(figsize=(20, 10))
for i, col in enumerate(cols, start=1):
    plt.subplot(len(cols), 1, i)
    plt.plot(monthly_values[:, col])
    plt.title(data.columns[col] + ' data resample over month for mean', y=0.75, loc='left')
plt.show()

In [None]:
# Plot the global and sub-metering values on the same figure, rescaled to [0, 1]
# so that the curves (and the average temperature) are comparable.

linestyle_str = ['solid','dotted','dashed','-.','-','--',':','dashdot']

data = df.copy()
data['Date'] = pd.to_datetime(data.index.date)
temperatures = pd.read_csv("pre_processing/temperatures.csv", parse_dates=['Date'], index_col='Date')
# The average temperature is the midpoint of the extremes, (max + min) / 2.
# The previous (max - min) / 2 computed half of the daily range instead.
temperatures['avg_t'] = (temperatures['max_t'] + temperatures['min_t'])/2
data = data.join(temperatures, how="left", on='Date')
data = data.drop(columns=['Date','max_t','min_t'])
data = data.resample('M').mean()

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data)
data[data.columns] = scaler.transform(data[data.columns])
plt.figure(figsize=(30, 15))
cols=[0, 1, 2, 3, 4, 5, 6, 7]
for col in cols:
    # Each column gets its own line style so the curves stay distinguishable.
    plt.plot(data.values[:, col],linestyle=linestyle_str[col])
plt.legend(data.columns,loc=4, bbox_to_anchor=(0.5, 0., 0.5, 0.5),fontsize="large")
plt.show()

In [None]:
import seaborn as sns

# Correlation heatmaps of the dataset averaged at three time resolutions.
fig, axs = plt.subplots(1, 3, figsize=(30, 8))

dfm = df.resample('M').mean()
dfd = df.resample('D').mean()
dfh = df.resample('H').mean()

panels = (
    (dfm, 'Monthly resampling'),
    (dfd, 'Day resampling'),
    (dfh, 'Hourly resampling'),
)
for ax, (resampled, panel_title) in zip(axs, panels):
    sns.heatmap(resampled.corr(), vmin=-1, vmax=1, annot=True, ax=ax)
    ax.set_title(panel_title, size=12)

fig.tight_layout()
fig.show()

In [None]:
# Histogram of each of the first seven columns on a 2x4 grid; the unused last cell is removed.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(30,15))
fig.delaxes(axes[1,3])
for i in range(7):
    row, column = divmod(i, 4)
    column_name = df.columns[i]
    df[column_name].plot.hist(bins=100, alpha=1, ax=axes[row][column])
    axes[row][column].set_title(column_name)

#### Sub metering 1 (kitchen) per hour of the day

In [None]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime

# School holidays for 2007, zone 'C'; the dictionary keys are matched against reading dates below.
d = SchoolHolidayDates()
dict_holidays = d.holidays_for_year_and_zone(2007, 'C')
holidays = dict_holidays.keys()

fig_sm1, axs = plt.subplots(2,2,figsize=(20, 15))

sub1_df = df[['Sub_metering_1']].reset_index()

# Row filters shared by the four panels.
stamps = sub1_df["DateTime"]
in_2007 = stamps.dt.year == 2007
on_weekend = stamps.dt.weekday.isin([5,6])
on_holiday = stamps.dt.date.isin(holidays)

sub1_df_all = sub1_df.loc[in_2007].set_index('DateTime')
sub1_df_bd = sub1_df.loc[in_2007 & ~on_weekend & ~on_holiday].set_index('DateTime')
sub1_df_we = sub1_df.loc[in_2007 & on_weekend].set_index('DateTime')
sub1_df_hd = sub1_df.loc[in_2007 & on_holiday].set_index('DateTime')

# One bar chart per subset: mean Sub_metering_1 for each hour of the day.
panels = (
    (axs[0,0], sub1_df_all, "Sub metering 1 per hour of a day"),
    (axs[0,1], sub1_df_bd, "Sub metering 1 per hour on a business day"),
    (axs[1,0], sub1_df_we, "Sub metering 1 per hour on the weekend"),
    (axs[1,1], sub1_df_hd, "Sub metering 1 per hour of a vacation day"),
)
for ax, subset, panel_title in panels:
    subset.groupby(subset.index.hour)["Sub_metering_1"].mean().plot(kind='bar', ax=ax)
    ax.set_title(panel_title)

fig_sm1.show()

# add curves

In [None]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime

curves = ["Sub metering 1 per hour of a day","Sub metering 1 per hour on a business day","Sub metering 1 per hour on the weekend","Sub metering 1 per hour of a vacation day"]
d = SchoolHolidayDates()
dict_holidays = d.holidays_for_year_and_zone(2007, 'C')
holidays = dict_holidays.keys()

plt.figure(figsize=(30, 15))

sub1_df = df[['Sub_metering_1']].reset_index()

# Row filters shared by the four curves.
stamps = sub1_df["DateTime"]
in_2007 = stamps.dt.year == 2007
on_weekend = stamps.dt.weekday.isin([5,6])
on_holiday = stamps.dt.date.isin(holidays)

sub1_df_all = sub1_df.loc[in_2007].set_index('DateTime')
sub1_df_bd = sub1_df.loc[in_2007 & ~on_weekend & ~on_holiday].set_index('DateTime')
sub1_df_we = sub1_df.loc[in_2007 & on_weekend].set_index('DateTime')
sub1_df_hd = sub1_df.loc[in_2007 & on_holiday].set_index('DateTime')

# Hourly mean of each subset, drawn with its own line style (same order as `curves`).
styled_subsets = (
    (sub1_df_all, '-'),
    (sub1_df_bd, '--'),
    (sub1_df_we, '-.'),
    (sub1_df_hd, ':'),
)
for subset, line_style in styled_subsets:
    plt.plot(subset.groupby(subset.index.hour)["Sub_metering_1"].mean(),linestyle=line_style)

plt.legend(curves,loc=2,fontsize="large")

plt.show()

# add curves

In [None]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime

curves = ["Sub metering 1 during a day","Sub metering 1 during a business day","Sub metering 1 during the weekend","Sub metering 1 during a vacation day"]
d = SchoolHolidayDates()
dict_holidays = d.holidays_for_year_and_zone(2007, 'C')
holidays = dict_holidays.keys()

plt.figure(figsize=(30, 15))

sub1_df = df[['Sub_metering_1']].reset_index()

# Row filters shared by the four curves.
stamps = sub1_df["DateTime"]
in_2007 = stamps.dt.year == 2007
on_weekend = stamps.dt.weekday.isin([5,6])
on_holiday = stamps.dt.date.isin(holidays)

sub1_df_all = sub1_df.loc[in_2007].set_index('DateTime')
sub1_df_bd = sub1_df.loc[in_2007 & ~on_weekend & ~on_holiday].set_index('DateTime')
sub1_df_we = sub1_df.loc[in_2007 & on_weekend].set_index('DateTime')
sub1_df_hd = sub1_df.loc[in_2007 & on_holiday].set_index('DateTime')

# One label per minute of the day (00:00 ... 23:59), used as the x axis of every curve.
minute_of_day = pd.date_range("00:00","23:59", freq="1min").time

# The per-minute profiles live in their own variable so that the sub1_df_* frames keep their
# DatetimeIndex: the later "per minute" cells read `.index.hour` / `.index.minute` on them.
for subset in (sub1_df_all, sub1_df_bd, sub1_df_we, sub1_df_hd):
    minute_profile = subset.groupby([subset.index.hour, subset.index.minute]).mean()
    minute_profile = minute_profile.set_index(minute_of_day)
    plt.plot(minute_profile)

# `curves` was defined but never drawn; label the curves like the sub-metering 2 and 3 cells do.
plt.legend(curves,loc=2,fontsize="large")

plt.show()

#### Sub metering 2 per hour

In [None]:
# Mean Sub_metering_2 per hour of the day for four subsets of the 2007 readings.
fig_sm2, axs = plt.subplots(2,2,figsize=(20, 15))

sub2_df = df[['Sub_metering_2']].reset_index()

# Row filters shared by the four panels (`holidays` comes from an earlier cell).
stamps = sub2_df["DateTime"]
in_2007 = stamps.dt.year == 2007
on_weekend = stamps.dt.weekday.isin([5,6])
on_holiday = stamps.dt.date.isin(holidays)

sub2_df_all = sub2_df.loc[in_2007].set_index('DateTime')
sub2_df_bd = sub2_df.loc[in_2007 & ~on_weekend & ~on_holiday].set_index('DateTime')
sub2_df_we = sub2_df.loc[in_2007 & on_weekend].set_index('DateTime')
sub2_df_hd = sub2_df.loc[in_2007 & on_holiday].set_index('DateTime')

panels = (
    (axs[0,0], sub2_df_all, "Sub metering 2 per hour of a day"),
    (axs[0,1], sub2_df_bd, "Sub metering 2 per hour on a business day"),
    (axs[1,0], sub2_df_we, "Sub metering 2 per hour on the weekend"),
    (axs[1,1], sub2_df_hd, "Sub metering 2 per hour of a vacation day"),
)
for ax, subset, panel_title in panels:
    subset.groupby(subset.index.hour)["Sub_metering_2"].mean().plot(kind='bar', ax=ax)
    ax.set_title(panel_title)

fig_sm2.show()

In [None]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime

curves = ["Sub metering 2 per hour of a day","Sub metering 2 per hour on a business day","Sub metering 2 per hour on the weekend","Sub metering 2 per hour of a vacation day"]
d = SchoolHolidayDates()
dict_holidays = d.holidays_for_year_and_zone(2007, 'C')
holidays = dict_holidays.keys()

plt.figure(figsize=(30, 15))

sub2_df = df[['Sub_metering_2']].reset_index()

# Row filters shared by the four curves.
stamps = sub2_df["DateTime"]
in_2007 = stamps.dt.year == 2007
on_weekend = stamps.dt.weekday.isin([5,6])
on_holiday = stamps.dt.date.isin(holidays)

sub2_df_all = sub2_df.loc[in_2007].set_index('DateTime')
sub2_df_bd = sub2_df.loc[in_2007 & ~on_weekend & ~on_holiday].set_index('DateTime')
sub2_df_we = sub2_df.loc[in_2007 & on_weekend].set_index('DateTime')
sub2_df_hd = sub2_df.loc[in_2007 & on_holiday].set_index('DateTime')

# Hourly mean of each subset, drawn with its own line style (same order as `curves`).
styled_subsets = (
    (sub2_df_all, '-'),
    (sub2_df_bd, '--'),
    (sub2_df_we, '-.'),
    (sub2_df_hd, ':'),
)
for subset, line_style in styled_subsets:
    plt.plot(subset.groupby(subset.index.hour)["Sub_metering_2"].mean(),linestyle=line_style)

plt.legend(curves,loc=2,fontsize="large")

plt.show()

# add curves

In [None]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime

curves = ["Sub metering 2 during a day","Sub metering 2 during a business day","Sub metering 2 during the weekend","Sub metering 2 during a vacation day"]
d = SchoolHolidayDates()
dict_holidays = d.holidays_for_year_and_zone(2007, 'C')
holidays = dict_holidays.keys()

plt.figure(figsize=(30, 15))

sub2_df = df[['Sub_metering_2']].reset_index()


def _minute_profile(frame):
    """Average `frame` per (hour, minute) and index the result by time of day."""
    grouped = frame.groupby([frame.index.hour, frame.index.minute]).mean()
    return grouped.set_index(pd.date_range("00:00","23:59", freq="1min").time)


# Row filters shared by the four curves.
stamps = sub2_df["DateTime"]
in_2007 = stamps.dt.year == 2007
on_weekend = stamps.dt.weekday.isin([5,6])
on_holiday = stamps.dt.date.isin(holidays)

sub2_df_all = _minute_profile(sub2_df.loc[in_2007].set_index('DateTime'))
plt.plot(sub2_df_all)

sub2_df_bd = _minute_profile(sub2_df.loc[in_2007 & ~on_weekend & ~on_holiday].set_index('DateTime'))
plt.plot(sub2_df_bd)

sub2_df_we = _minute_profile(sub2_df.loc[in_2007 & on_weekend].set_index('DateTime'))
plt.plot(sub2_df_we)

sub2_df_hd = _minute_profile(sub2_df.loc[in_2007 & on_holiday].set_index('DateTime'))
plt.plot(sub2_df_hd)

plt.legend(curves,loc=2,fontsize="large")

plt.show()

#### Sub metering 3 per hour

In [None]:
# Mean Sub_metering_3 per hour of the day for four subsets of the 2007 readings.
fig_sm3, axs = plt.subplots(2,2,figsize=(20, 15))

sub3_df = df[['Sub_metering_3']].reset_index()

# Row filters shared by the four panels (`holidays` comes from an earlier cell).
stamps = sub3_df["DateTime"]
in_2007 = stamps.dt.year == 2007
on_weekend = stamps.dt.weekday.isin([5,6])
on_holiday = stamps.dt.date.isin(holidays)

sub3_df_all = sub3_df.loc[in_2007].set_index('DateTime')
sub3_df_bd = sub3_df.loc[in_2007 & ~on_weekend & ~on_holiday].set_index('DateTime')
sub3_df_we = sub3_df.loc[in_2007 & on_weekend].set_index('DateTime')
sub3_df_hd = sub3_df.loc[in_2007 & on_holiday].set_index('DateTime')

panels = (
    (axs[0,0], sub3_df_all, "Sub metering 3 per hour of a day"),
    (axs[0,1], sub3_df_bd, "Sub metering 3 per hour on a business day"),
    (axs[1,0], sub3_df_we, "Sub metering 3 per hour on the weekend"),
    (axs[1,1], sub3_df_hd, "Sub metering 3 per hour of a vacation day"),
)
for ax, subset, panel_title in panels:
    subset.groupby(subset.index.hour)["Sub_metering_3"].mean().plot(kind='bar', ax=ax)
    ax.set_title(panel_title)

fig_sm3.show()

In [None]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime

curves = ["Sub metering 3 per hour of a day","Sub metering 3 per hour on a business day","Sub metering 3 per hour on the weekend","Sub metering 3 per hour of a vacation day"]
d = SchoolHolidayDates()
dict_holidays = d.holidays_for_year_and_zone(2007, 'C')
holidays = dict_holidays.keys()

plt.figure(figsize=(30, 15))

sub3_df = df[['Sub_metering_3']].reset_index()

# Row filters shared by the four curves.
stamps = sub3_df["DateTime"]
in_2007 = stamps.dt.year == 2007
on_weekend = stamps.dt.weekday.isin([5,6])
on_holiday = stamps.dt.date.isin(holidays)

sub3_df_all = sub3_df.loc[in_2007].set_index('DateTime')
sub3_df_bd = sub3_df.loc[in_2007 & ~on_weekend & ~on_holiday].set_index('DateTime')
sub3_df_we = sub3_df.loc[in_2007 & on_weekend].set_index('DateTime')
sub3_df_hd = sub3_df.loc[in_2007 & on_holiday].set_index('DateTime')

# Hourly mean of each subset, drawn with its own line style (same order as `curves`).
styled_subsets = (
    (sub3_df_all, '-'),
    (sub3_df_bd, '--'),
    (sub3_df_we, '-.'),
    (sub3_df_hd, ':'),
)
for subset, line_style in styled_subsets:
    plt.plot(subset.groupby(subset.index.hour)["Sub_metering_3"].mean(),linestyle=line_style)

plt.legend(curves,loc=2,fontsize="large")

plt.show()

# add curves

In [None]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime

curves = ["Sub metering 3 during a day","Sub metering 3 during a business day","Sub metering 3 during the weekend","Sub metering 3 during a vacation day"]
d = SchoolHolidayDates()
dict_holidays = d.holidays_for_year_and_zone(2007, 'C')
holidays = dict_holidays.keys()

plt.figure(figsize=(30, 15))

sub3_df = df[['Sub_metering_3']].reset_index()


def _minute_profile(frame):
    """Average `frame` per (hour, minute) and index the result by time of day."""
    grouped = frame.groupby([frame.index.hour, frame.index.minute]).mean()
    return grouped.set_index(pd.date_range("00:00","23:59", freq="1min").time)


# Row filters shared by the four curves.
stamps = sub3_df["DateTime"]
in_2007 = stamps.dt.year == 2007
on_weekend = stamps.dt.weekday.isin([5,6])
on_holiday = stamps.dt.date.isin(holidays)

sub3_df_all = _minute_profile(sub3_df.loc[in_2007].set_index('DateTime'))
plt.plot(sub3_df_all)

sub3_df_bd = _minute_profile(sub3_df.loc[in_2007 & ~on_weekend & ~on_holiday].set_index('DateTime'))
plt.plot(sub3_df_bd)

sub3_df_we = _minute_profile(sub3_df.loc[in_2007 & on_weekend].set_index('DateTime'))
plt.plot(sub3_df_we)

sub3_df_hd = _minute_profile(sub3_df.loc[in_2007 & on_holiday].set_index('DateTime'))
plt.plot(sub3_df_hd)

plt.legend(curves,loc=2,fontsize="large")

plt.show()

# add curves

In [None]:
from vacances_scolaires_france import SchoolHolidayDates
import datetime
import math

# NOTE(review): `math` and `curves` are not used in this cell, and only the "all days"
# curve is plotted — this looks like an unfinished copy of the previous cell.
curves = ["Sub metering 3 per hour of a day","Sub metering 3 per hour on a business day","Sub metering 3 per hour on the weekend","Sub metering 3 per hour of a vacation day"]
d = SchoolHolidayDates()
dict_holidays = d.holidays_for_year_and_zone(2007, 'C')
holidays = dict_holidays.keys()

plt.figure(figsize=(30, 15))

sub3_df = df[['Sub_metering_3']].reset_index()

# Mean Sub_metering_3 for each minute of the day over the 2007 readings, indexed by time of day.
readings_2007 = sub3_df.loc[sub3_df["DateTime"].dt.year == 2007].set_index('DateTime')
per_minute = readings_2007.groupby([readings_2007.index.hour, readings_2007.index.minute]).mean()
sub3_df_all = per_minute.set_index(pd.date_range("00:00","23:59", freq="1min").time)
plt.plot(sub3_df_all)

#### Sub metering 1 per minute

In [None]:
# Bar chart of the mean Sub_metering_1 per minute for hour 8, computed from `sub1_df_all`.
fig, axs = plt.subplots(figsize=(20, 4))
axs.set_ylim([0, 4.5])
per_minute_mean = sub1_df_all.groupby([sub1_df_all.index.hour, sub1_df_all.index.minute])["Sub_metering_1"].mean()
per_minute_mean[8].plot(kind='bar', rot=0, ax=axs)

#### Sub metering 1 per minute on business days

In [None]:
# Bar chart of the mean Sub_metering_1 per minute for hour 8, computed from `sub1_df_bd`.
fig, axs = plt.subplots(figsize=(20, 4))
axs.set_ylim([0, 4.5])
per_minute_mean = sub1_df_bd.groupby([sub1_df_bd.index.hour, sub1_df_bd.index.minute])["Sub_metering_1"].mean()
per_minute_mean[8].plot(kind='bar', rot=0, ax=axs)

#### Sub metering 1 per minute on weekends

In [None]:
# Bar chart of the mean Sub_metering_1 per minute for hour 8, computed from `sub1_df_we`.
fig, axs = plt.subplots(figsize=(20, 4))
axs.set_ylim([0, 4.5])
per_minute_mean = sub1_df_we.groupby([sub1_df_we.index.hour, sub1_df_we.index.minute])["Sub_metering_1"].mean()
per_minute_mean[8].plot(kind='bar', rot=0, ax=axs)

#### Sub metering 1 per minute on holidays

In [None]:
# Bar chart of the mean Sub_metering_1 per minute for hour 8, computed from `sub1_df_hd`.
fig, axs = plt.subplots(figsize=(20, 4))
axs.set_ylim([0, 4.5])
per_minute_mean = sub1_df_hd.groupby([sub1_df_hd.index.hour, sub1_df_hd.index.minute])["Sub_metering_1"].mean()
per_minute_mean[8].plot(kind='bar', rot=0, ax=axs)

In [None]:
import matplotlib

matplotlib.style.use('default')
# Mean Sub_metering_1 from `sub1_df_bd` per (hour, minute), reshaped so that rows are
# minutes and columns are hours, then drawn as stacked bars.
bd_minute_means = sub1_df_bd.groupby([sub1_df_bd.index.hour, sub1_df_bd.index.minute])["Sub_metering_1"].mean()
group2 = bd_minute_means.unstack().transpose()
group2.plot(kind="bar", stacked=True,figsize=(20,20))


#### work in progress

In [None]:
# Business-day Sub_metering_1 readings of March 2007, indexed by timestamp.
# The previous code masked `sub1_df_bd` with a boolean Series built from `sub1_df`, whose
# index does not align with it, and then evaluated an undefined name (`cumul`).
stamps = sub1_df["DateTime"]
is_business_day = ~stamps.dt.weekday.isin([5, 6]) & ~stamps.dt.date.isin(holidays)
in_march_2007 = (stamps.dt.year == 2007) & (stamps.dt.month == 3)
sub1_df_bd_march = sub1_df.loc[in_march_2007 & is_business_day].set_index('DateTime')
# TODO: the cumulative consumption ("cumul") is not implemented yet.

# fig, axs = plt.subplots(figsize=(20, 4))
# axs.set_ylim([0, 13])
# sub1_df_bd.groupby([sub1_df_bd.index.hour, sub1_df_bd.index.minute])["Sub_metering_1"].mean()[7].plot(kind='bar', rot=0, ax=axs)

Interpretation of the plot :
A sharp increase in power consumption around 8 am implies that the family wakes up around 8 on weekdays.

#### Sub metering 1 per hour of the weekend days

### partie de theophile

In [None]:
# Daily means of every column for the 2008 readings.
# `df` is loaded with "DateTime" as its index, so the previous df["DateTime"] lookup raised
# KeyError; a frame that still carries a "DateTime" column is supported as well.
stamped = df.set_index("DateTime") if "DateTime" in df.columns else df
df_copy2 = stamped.loc[stamped.index.year == 2008].copy()
daily_avg = df_copy2.resample("D").mean()


In [None]:
# 2008 temperatures, re-indexed positionally onto the index of `daily_avg`
# (assumes both contain the same days in the same order — TODO confirm).
temp_2008 = pd.read_csv("pre_processing/temperatures_2008.csv", sep=',', parse_dates=["Date"])
temp_2008 = temp_2008.set_index(daily_avg.index).drop(columns=['Date'])
temp_2008.head()

In [None]:
# Show both shapes, one per line, to compare the row counts.
for frame in (daily_avg, temp_2008):
    print(frame.shape)

In [None]:
# Copy the temperature extremes next to the daily power means.
for target_column, source_column in (('min_temp', 'min_t'), ('max_temp', 'max_t')):
    daily_avg[target_column] = temp_2008[source_column]
daily_avg.head(100)

In [None]:
# Degree-1 fit of the daily mean global active power against the mid-range temperature.
mid_range_sum = daily_avg['max_temp'] + daily_avg['min_temp']
x = mid_range_sum / 2
y = daily_avg['Global_active_power']

coefficients = np.polyfit(x, y, 1)
m, b = coefficients
print(f'Global electic consumption = {m}*x + {b}')

# Scatter of the daily points, then the fitted line on top.
plt.plot(x, y, linestyle='None', marker='o')
plt.plot(x, m*x+b)

In [None]:
# Degree-1 fit, against the mid-range temperature, of the global active power (scaled by
# 1000/60) minus the three sub-meterings.
mid_range_sum = daily_avg['max_temp'] + daily_avg['min_temp']
x = mid_range_sum / 2
global_energy = daily_avg['Global_active_power']*1000/60
y = global_energy-daily_avg['Sub_metering_1']-daily_avg['Sub_metering_2']-daily_avg['Sub_metering_3']

coefficients = np.polyfit(x, y, 1)
m, b = coefficients
print(f'active energy of non measured electic devices = {m}*x + {b}')

# Scatter of the daily points, then the fitted line on top.
plt.plot(x, y, linestyle='None', marker='o')
plt.plot(x, m*x+b)

In [None]:
# Degree-1 fit, against the mid-range temperature, of the global active power (scaled by
# 1000/60) minus sub-meterings 1 and 2 (sub-metering 3 stays included).
mid_range_sum = daily_avg['max_temp'] + daily_avg['min_temp']
x = mid_range_sum / 2
global_energy = daily_avg['Global_active_power']*1000/60
y = global_energy-daily_avg['Sub_metering_1']-daily_avg['Sub_metering_2']

coefficients = np.polyfit(x, y, 1)
m, b = coefficients
print(f'active energy of non measured electic devices + sub3= {m}*x + {b}')

# Scatter of the daily points, then the fitted line on top.
plt.plot(x, y, linestyle='None', marker='o')
plt.plot(x, m*x+b)

In [None]:
def subplot_coord(x,y,param="all"):
    """Return (row, column) index tuples for addressing a grid of subplots.

    Args:
        x: Last row index (inclusive) for "all" and "column"; the fixed row for "line".
        y: Last column index (inclusive) for "all" and "line"; the fixed column for "column".
        param: "all" for every cell of rows 0..x and columns 0..y (row-major order),
            "line" for row ``x`` across columns 0..y, "column" for column ``y``
            across rows 0..x.

    Returns:
        A list of (row, column) tuples; empty when ``param`` is not one of the
        values above.
    """
    if param=="all":
        return [(i,j) for i in range(x+1) for j in range(y+1)]
    if param=="line":
        return [(x,j) for j in range(y+1)]
    # This branch used to repeat the "line" test and could never run.
    if param=="column":
        return [(i,y) for i in range(x+1)]
    return []

In [None]:
# Hourly mean profile of every variable, comparing business days, weekends and holidays.
fig, axis = plt.subplots(2,4,figsize=(20, 10))
fig.delaxes(axis[1,3])
indexes = [(0,0),(0,1),(0,2),(0,3),(1,0),(1,1),(1,2)]

# `df` is loaded with "DateTime" as its index, so expose it as a column before filtering;
# the previous df["DateTime"] lookups raised KeyError and variables.remove raised ValueError.
frame = df if 'DateTime' in df.columns else df.reset_index()
variables = [name for name in frame.columns if name != 'DateTime']

stamps = frame["DateTime"]
on_weekend = stamps.dt.weekday.isin([5, 6])
# NOTE(review): `holidays` comes from an earlier cell and was built for 2007 only, while
# no year filter is applied here — confirm this is intended.
on_holiday = stamps.dt.date.isin(holidays)
data_bd = frame.loc[~on_weekend & ~on_holiday]
data_we = frame.loc[on_weekend]
data_hd = frame.loc[on_holiday]

for (row, column), variable in zip(indexes, variables):
    ax = axis[row, column]
    # Plot order matches the legend labels below: business days, weekends, holidays.
    for subset in (data_bd, data_we, data_hd):
        ax.plot(subset.groupby(subset['DateTime'].dt.hour)[variable].mean())
    ax.set_title(variable)

fig.tight_layout(pad=3.0)
fig.legend(['Business days', 'Weekends', 'Holidays'],loc=4, bbox_to_anchor=(0.5, 0., 0.5, 0.5),fontsize="large")
fig.show()

In [None]:
# Grid of 24 rows (hours) x 7 columns (variables): per-minute mean profile of every
# variable within each hour, comparing business days, weekends and holidays.
fig, axis = plt.subplots(24,7,figsize=(25, 72))

# `df` is loaded with "DateTime" as its index, so expose it as a column before filtering;
# the previous df["DateTime"] lookups raised KeyError and variables.remove raised ValueError.
frame = df if 'DateTime' in df.columns else df.reset_index()
variables = [name for name in frame.columns if name != 'DateTime']

stamps = frame["DateTime"]
on_weekend = stamps.dt.weekday.isin([5, 6])
# NOTE(review): `holidays` comes from an earlier cell and was built for 2007 only, while
# no year filter is applied here — confirm this is intended.
on_holiday = stamps.dt.date.isin(holidays)
data_bd = frame.loc[~on_weekend & ~on_holiday]
data_we = frame.loc[on_weekend]
data_hd = frame.loc[on_holiday]

for i in range(24):
    # Restrict each day category to hour `i` once, instead of once per variable.
    hour_bd = data_bd.loc[data_bd["DateTime"].dt.hour == i]
    hour_we = data_we.loc[data_we["DateTime"].dt.hour == i]
    hour_hd = data_hd.loc[data_hd["DateTime"].dt.hour == i]
    indexes = subplot_coord(i,6,"line")
    for (row, column), variable in zip(indexes, variables):
        ax = axis[row, column]
        # Plot order matches the legend labels below: business days, weekends, holidays.
        for subset in (hour_bd, hour_we, hour_hd):
            ax.plot(subset.groupby(subset['DateTime'].dt.minute)[variable].mean())
        if row == 0:
            # Variable names label the columns on the first row only.
            ax.set_title(variable, fontsize=10)
        if column == 0:
            # The hour range labels each row on its first column.
            ax.set_title("{0}h-{1}h:".format(i,i+1), loc="left", fontsize=12)

fig.tight_layout(pad=3.0)
fig.legend(['Business days', 'Weekends', 'Holidays'],loc=4, bbox_to_anchor=(0.5, 0., 0.5, 0.5),fontsize="large")
fig.show()