In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# NOTE: the loop variables `dirname` and `filename` are reused after this loop by the
# data-reading cell (pd.read_csv), so they must keep these names.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration

### Data reading

In [None]:
# Load the dataset into `df`. `dirname` and `filename` are whatever the os.walk loop
# in the first cell left behind, i.e. the last file that loop visited.
# NOTE(review): this only loads the intended CSV when it is the last file walked --
# confirm, or build the path explicitly.
df = pd.read_csv(os.path.join(dirname, filename), low_memory=False)
# Show the first rows as the cell output.
df.head()

In [None]:
# Display the column labels of the freshly loaded frame.
df.columns

### Data Preprocessing

In [None]:
# Rename columns to remove spaces and the kW unit.
# `col[:-5]` drops the last 5 characters of every column whose name contains 'kW'
# (presumably a ' [kW]' suffix -- TODO confirm against the raw header).
df.columns = [col[:-5].replace(' ', '_') if 'kW' in col else col for col in df.columns]

# Drop rows with nan values
df = df.dropna()

# "use"/"House_overall" and "gen"/"Solar" are the same, so keep only one column of each pair.
# (Non in-place drop: the result is a new frame, no hidden mutation.)
df = df.drop(columns=['House_overall', 'Solar'])

# Drop rows whose cloudCover value is the literal string 'cloudCover' (bug in sensors)
# and convert the column to numeric. The explicit .copy() makes the filtered frame
# independent of its parent, so the assignment below does not trigger
# pandas' SettingWithCopyWarning.
df = df[df['cloudCover'] != 'cloudCover'].copy()
df['cloudCover'] = pd.to_numeric(df['cloudCover'])

# Create columns that regroup kitchens and furnaces
df['kitchen'] = df['Kitchen_12'] + df['Kitchen_14'] + df['Kitchen_38']
df['Furnace'] = df['Furnace_1'] + df['Furnace_2']

# Replace the "time" column (a unix timestamp) by a 1-minute DatetimeIndex that starts
# at the first timestamp. gmtime (UTC) is used instead of localtime so the resulting
# index does not depend on the timezone of the machine running the notebook.
import time
start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(int(df['time'].iloc[0])))
# pd.date_range already returns a DatetimeIndex, so no extra conversion is needed.
time_index = pd.date_range(start_time, periods=len(df), freq='min')
df = df.set_index(time_index)
df = df.drop(columns=['time'])

### Data Analysis

In [None]:
# Display the (rows, columns) shape of `df` at this point of the notebook.
df.shape

In [None]:
# Display the current column labels of `df`.
df.columns

In [None]:
def func(s):
    """Return `s` with its first character lower-cased.

    Any falsy input (empty string, None) yields the empty string ''.
    """
    return s[:1].lower() + s[1:] if s else ''

In [None]:
# Partition the column labels by dtype: columns stored as object ('O') are
# treated as categorical, every other column as numeric.
cols = list(df.columns)
categ_cols = [name for name in cols if df[name].dtype == 'O']
num_cols = [name for name in cols if name not in categ_cols]
print('categ_cols : ', categ_cols)
print('num_cols : ', num_cols)

In [None]:
# Let's remove rows with values that appear less than a certain percentage %

def remove_less_percent(col, percent, data=None):
    """Keep only the rows whose `col` value is frequent enough.

    Parameters
    ----------
    col : str
        Name of the column whose value frequencies are inspected.
    percent : float
        Minimum share (fraction of rows, e.g. 0.05 for 5%) a value must reach
        to be kept. A share exactly equal to `percent` is kept.
    data : pandas.DataFrame, optional
        Frame to filter. Defaults to the notebook-level `df`, which keeps the
        original two-argument call working.

    Returns
    -------
    pandas.DataFrame
        The rows of the frame whose `col` value has a share >= `percent`.
        The input frame is not modified.
    """
    frame = df if data is None else data
    # Relative frequency of each distinct value (missing values are not counted).
    shares = frame[col].value_counts(normalize=True)
    keys_to_conserve = shares[shares >= percent].index
    return frame[frame[col].isin(keys_to_conserve)]

# Print the row count before filtering, then after each categorical filter.
print(len(df))
for _categ_col in ('summary', 'icon'):
    # Drop the rows whose category covers less than 5% of the remaining rows.
    df = remove_less_percent(_categ_col, 0.05)
    print(len(df))

In [None]:
# plot bars of unique values of categorical columns

def plot_bars(col, data=None):
    """Draw a bar chart of the relative frequency of each value of `col`.

    Parameters
    ----------
    col : str
        Name of the (categorical) column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level `df`,
        so the original one-argument call keeps working.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    # Local import: the notebook-level matplotlib import only happens in a later cell.
    import matplotlib.pyplot as plt

    frame = df if data is None else data
    # Share of rows per distinct value (fractions that sum to 1).
    shares = frame[col].value_counts(normalize=True)

    fig, ax = plt.subplots(figsize=(14, 8), dpi=80)
    # An empty column would otherwise end in a bar() call without data; show empty axes instead.
    if not shares.empty:
        ax.bar(list(shares.index), shares.to_list())
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(f"Share of rows per '{col}' value")
    ax.set_xlabel(col)
    ax.set_ylabel('share of rows')
    plt.show()

In [None]:
# Bar chart of the relative frequency of each 'icon' value.
plot_bars('icon')

In [None]:
# Bar chart of the relative frequency of each 'summary' value.
plot_bars('summary')

In [None]:
# Line plot of the daily mean of the 'use' column.
df['use'].resample(rule='D').mean().plot(figsize=(25,5))

In [None]:
# Line plot of the daily mean of the 'temperature' column.
df['temperature'].resample(rule='D').mean().plot(figsize=(25,5))

In [None]:
# Line plot of the daily mean of the 'cloudCover' column.
df['cloudCover'].resample(rule='D').mean().plot(figsize=(25,5))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Monthly totals of the 'gen' and 'use' columns, drawn as one line per column.
# NOTE(review): recent pandas releases deprecate the 'M' resample alias in favour of 'ME' --
# check the installed version before changing it.
general_energy_cols = ['gen', 'use']
general_energy_per_month = df[general_energy_cols].resample('M').sum() # for energy we use sum to calculate overall consumption in period

# Open a wide figure; the seaborn call below draws onto it.
plt.figure(figsize=(20,10))

sns.lineplot(data=general_energy_per_month, dashes=False)

In [None]:
# Room-level columns; 'kitchen' is the aggregate column created during preprocessing.
rooms_energy_cols = ['Home_office', 'Wine_cellar','Garage_door',
                       'kitchen', 'Barn', 'Well','Living_room']

# Monthly mean of each room column.
# NOTE(review): recent pandas releases deprecate the 'M' resample alias in favour of 'ME' --
# check the installed version before changing it.
rooms_energy_per_month = df[rooms_energy_cols].resample('M').mean()   

# Open a wide figure; the seaborn call below draws one line per room onto it.
plt.figure(figsize=(20,8))

sns.lineplot(data=rooms_energy_per_month, dashes=False)

* The energy consumption of kitchen, garage and the well remained almost the same throughout the year
* There's seasonality of energy consumption in other parts of the house :
    * A clear spike in September in the energy consumed by the wine cellar and the home office
    * A clear downtrend in the summer for the barn energy consumption

In [None]:
# Appliance columns; 'Furnace' is the aggregate column created during preprocessing.
equipements_cols = ['Microwave', 'Dishwasher', 'Furnace', 'Fridge'] 

# Monthly mean of each appliance column.
# NOTE(review): recent pandas releases deprecate the 'M' resample alias in favour of 'ME' --
# check the installed version before changing it.
equipements_energy_per_month = df[equipements_cols].resample('M').mean()   

# Open a wide figure; the seaborn call below draws one line per appliance onto it.
plt.figure(figsize=(25,8))

sns.lineplot(data= equipements_energy_per_month, dashes=False)

The usage of the furnace decreases in the summer

In [None]:
# Weather columns; this list is reused by the correlation heatmap cell that follows.
weather_columns = ['temperature','humidity', 'visibility', 'apparentTemperature', 
                   'windSpeed', 'dewPoint']

# Monthly mean of each weather column.
# NOTE(review): recent pandas releases deprecate the 'M' resample alias in favour of 'ME' --
# check the installed version before changing it.
weather_per_month = df[weather_columns].resample('M').mean()   

# Open a wide figure; the seaborn call below draws one line per weather variable onto it.
plt.figure(figsize=(25,8))

sns.lineplot(data=weather_per_month, dashes=False)

In [None]:
# Pairwise correlation of the weather columns, rendered as an annotated heatmap
# on a colour scale fixed to [-1, 1] and centred on 0.
corr = df[weather_columns].corr()
fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(corr, annot=True, vmin=-1.0, vmax=1.0, center=0, ax=ax)
ax.set_title('Correlation of Weather Information', size=20)
plt.show()

----

# Modeling : What are we trying to solve ?

* Case 1 Change Detection : Detecting excessive energy consumption in advance and preventing increase in usage fees.
* Case 2 Predict Future Consumption : Predicting future energy consumption and generation by utilizing weather information and optimizing energy supply.

    [**Inspired by kohei-mu**](https://www.kaggle.com/koheimuramatsu/change-detection-forecasting-in-smart-home#6.-Modeling)

#### Case 1 : Change detection

The change point is the point at which the trends in time series data change over time.
Outliers indicate a momentary abnormal condition (a rapid decrease or increase), while change points mean that the abnormal condition persists: the series does not return to its original state.

Let's use the ChangeFinder algorithm.

ChangeFinder is an algorithm used to detect change points.
ChangeFinder uses the log-likelihood based on the SDAR (Sequentially Discounting AR) algorithm to calculate the change score.
SDAR algorithm introduces a discounting parameter into the AR algorithm to reduce the influence of past data, so that even non-stationary time series data can be learned robustly.

ChangeFinder has two steps of model training:
* Training STEP1
Train a time series model at each data point using the SDAR algorithm
Based on the trained time series model, calculate the likelihood that the data points at the next time point will appear
Calculate the logarithmic loss and use it as an outlier score

$$\mathrm{Score}(x_t) = -\log P_{t-1}(x_t \mid x_1, x_2, \dots, x_{t-1})$$
 
Smoothing Step
Smooth the outlier score within the smoothing window( 𝑊 ).
By smoothing, the score due to outliers is attenuated, and it is possible to determine whether the abnormal condition has continued for a long time.

$$\mathrm{Score\_smoothed}(x_t) = \frac{1}{W} \sum_{i=t-W+1}^{t} \mathrm{Score}(x_i)$$
 
* Training STEP2
Using the score obtained by smoothing, train the model with the SDAR algorithm
Based on the trained time series model, calculate the likelihood that the data points at the next time point will appear
Calculate the logarithmic loss and use it as a change score

In [None]:
# Let's install & import changefinder python library 
!pip install changefinder
import changefinder

In [None]:
from scipy import stats
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')


def chng_detection(col, _r=0.01, _order=1, _smooth=10):
    """Detect change points in the daily mean of ``df[col]`` with ChangeFinder.

    Parameters
    ----------
    col : str
        Column of the notebook-level ``df`` to analyse.
    _r : float
        Forwarded to ``changefinder.ChangeFinder`` as ``r``.
    _order : int
        Forwarded to ``changefinder.ChangeFinder`` as ``order``.
    _smooth : int
        Forwarded to ``changefinder.ChangeFinder`` as ``smooth``.

    Returns
    -------
    A one-column holoviews layout with two panels: the change score with its
    threshold line, and the daily series with the detected points marked in red.
    """
    cf = changefinder.ChangeFinder(r=_r, order=_order, smooth=_smooth)

    # Daily mean of the series. Days without any row resample to NaN; they are
    # dropped because a NaN fed to the sequential model would contaminate the
    # scores that follow it.
    ch_df = df[col].resample('D').mean().dropna().to_frame(name=col)

    # calculate the change score (the model is updated one observation at a time, in order)
    ch_df['change_score'] = [cf.update(value) for value in ch_df[col]]

    # Upper threshold: third quartile plus three times the inter-quartile range.
    ch_score_q1 = stats.scoreatpercentile(ch_df['change_score'], 25)
    ch_score_q3 = stats.scoreatpercentile(ch_df['change_score'], 75)
    thr_upper = ch_score_q3 + (ch_score_q3 - ch_score_q1) * 3

    anom_score = hv.Curve(ch_df['change_score'])
    anom_score_th = hv.HLine(thr_upper).opts(color='red', line_dash="dotdash")

    # Select the flagged days with a boolean mask instead of the positional
    # ch_df[col][i] lookup, which pandas deprecates on a non-integer index.
    flagged = ch_df.loc[ch_df['change_score'] > thr_upper, col]
    anom_points = [[day, value] for day, value in flagged.items()]
    org = hv.Curve(ch_df[col], label=col).opts(yformatter='%.1fkw')
    detected = hv.Points(anom_points, label=f"{col} detected").opts(color='red', legend_position='bottom', size=5)

    score_panel = (anom_score * anom_score_th).opts(title=f"{col} Change Score & Threshold")
    series_panel = (org * detected).opts(title=f"{col} Detected Points")
    return (score_panel + series_panel).opts(
        opts.Curve(width=800, height=300, show_grid=True, tools=['hover'])
    ).cols(1)

* Discounting parameter $r$ ($0 < r < 1$, `_r` in the code): The smaller this value, the greater the influence of the past data points and the greater the variation in the change score
* Order parameter for AR, $order$ (`_order`): How far past data points are included in the model
* Smoothing window, $smooth$ (`_smooth`): The greater this parameter is, the easier it is to capture the essential changes rather than the outliers, but if it is too large, it will be difficult to capture the changes themselves

In [None]:
# Change detection on the 'use' column with r=0.001, AR order 1 and a smoothing window of 3.
chng_detection('use', _r=0.001, _order=1, _smooth=3)

In [None]:
# Display the available column labels (candidates for chng_detection).
df.columns

In [None]:
# Change detection on the aggregated 'Furnace' column with r=0.001, AR order 1 and a smoothing window of 3.
chng_detection('Furnace', _r=0.001, _order=1, _smooth=3)

---