In [None]:
# Import modules

#import wget
#import constants as const 
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import median_abs_deviation
import statistics
import seaborn as sns
import random
import sys
import os
from pathlib import Path
PROJECT_DIR =Path(os.path.abspath('')).parents[1]

sys.path.append(os.fspath(PROJECT_DIR))
from pipeline.definitions import *
from pipeline.preprocessing_fx import data_exploration, check_outliers, statistical_prepro

from os import listdir
from os.path import isfile, join

import matplotlib as mpl
import colorsys
import matplotlib.colors as mc

### Select graphic settings

In [None]:
# Figure preset read by the settings cell below; allowed values: "notebook" or "article" (journal-quality)
graph_setting="notebook" #or "article"

In [None]:
# Resolve the scale factor and the base font size from the selected preset.
if graph_setting=="article":
    # journal-quality parameter settings
    resolution_factor=2
    desired_font=10
elif graph_setting=="notebook":
    resolution_factor=1
    desired_font=12
else:
    # Fail here with an explicit message; otherwise the figure-size lines
    # below would raise an unrelated NameError on resolution_factor.
    raise ValueError(
        'graph_setting must be "article" or "notebook", got '+repr(graph_setting)
    )

# conversion factors and aspect ratios
cm_to_inch=0.393701
classic_proportion=6.4/4.8  # matplotlib's default figure width/height ratio
golden_rate=1.618

# Elsevier column width is 8.4 cm, double-column width is 17.7 cm (in inches: 3.31 and 6.97)
small_figsize=(resolution_factor*3.31, resolution_factor*3.31/classic_proportion)
big_figsize=(resolution_factor*6.97, resolution_factor*6.97/classic_proportion)

# font settings: fonttype 42 embeds TrueType fonts in PDF/PS output
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = "Arial"

# the font size scales with the resolution factor, like the figure sizes above
font_size=resolution_factor*desired_font


# define path for figures
figures_path=FIGURES
# check existence of the figure path (warn only; the directory is not created here)
if not os.path.exists(figures_path):
    print("The selected directory to store figures does not exist")

## Data Import

In [None]:
# Read the raw dataset into a pandas DataFrame and report its size
dataset_path = os.path.join(DATA_RAW, 'all_buildings_dataset')
raw_data = pd.read_csv(dataset_path)
n_rows, n_cols = raw_data.shape

print("Data shape:")
print(raw_data.shape)

# Each distinct value of the ID column identifies one case study
n_keys = len(raw_data["ID"].unique())
print(f"\nThe dataset contains {n_keys} case studies\n")

# The variable count excludes one column (n_cols-1)
print(f"Case studies contain an average of {int(n_rows/n_keys)} time steps and {n_cols-1} variables.\n")

In [None]:
raw_data.columns # display the names of the dataset variables (columns)

Note that the dataset has an hourly resolution, so the average number of time steps per case study shows that it covers about two years of measurements. 

## Data description and check

In [None]:
# Work on a copy so that raw_data itself is left untouched
data = raw_data.copy()

print('Complete list of column names:')
print(data.columns.values)

# Load the variable descriptions and show them without the "Comment" column
print("Data description: ")
description_path = os.path.join(DATASETS, 'buildings_data_description.csv')
data_description = pd.read_csv(description_path)
display(data_description.drop(columns="Comment"))

#### Check statistics of variables

In [None]:
# Summary statistics, transposed so that each variable is shown as a row
data.describe().T

#### Counting NaN values in all columns

In [None]:
# Number of missing (NaN) entries in each column
nan_count = data.isnull().sum()
print(nan_count)

#### Remove columns with many NaN values, then drop the rows that still contain NaN values

In [None]:
# Remove the 'UV' column. errors="ignore" keeps this cell re-runnable:
# `data` is rebound here, so a second run would otherwise raise KeyError
# because the column is already gone.
data = data.drop(columns=['UV'], errors="ignore")

# Drop every row that still contains at least one NaN value
data = data.dropna(axis=0)
print(data.shape)

## Data Filtering

In [None]:
# Discard anomalous data from the power curve, one case study at a time.
# good_data starts as a copy of data; at each iteration the rows of the current
# case study are removed from it and the filtered rows are appended back.
good_data=data.copy()
IDs=good_data.ID.unique()
for caseStudy in IDs:
    # split off the rows of the current case study and keep the rest in good_data
    df_caseStudy = good_data.loc[good_data.ID==caseStudy, :]
    good_data=good_data.loc[good_data.ID!=caseStudy, :]

    # statistical preprocessing (helper from pipeline.preprocessing_fx): returns the
    # filtered frame plus P_estimated and T_estimated, which are only forwarded to
    # check_outliers below
    # NOTE(review): the filtering criterion and the meaning of P_estimated/T_estimated
    # are defined in the helper and are not visible here -- confirm there
    good_df_caseStudy, P_estimated, T_estimated =statistical_prepro(df_caseStudy)

    # first figure: column P before and after filtering, plotted against the frame index
    legend=["Original", "Filtered"]
    fig, axs=plt.subplots(1, figsize=big_figsize)
    axs.plot(df_caseStudy.P)
    axs.plot(good_df_caseStudy.P)
    axs.legend(legend, fontsize=font_size)
    axs.tick_params(labelsize=font_size)
    axs.set_xlabel("Time $[h]$", fontsize=font_size)
    axs.set_ylabel("Electrical Load $[kWh]$", fontsize=font_size)
    # hide the x ticks (the axis label is kept)
    axs.set_xticks([])
    
    # append the filtered rows of this case study at the end of good_data
    good_data=pd.concat([good_data, good_df_caseStudy])

    # second figure: axs is rebound to the new axes, which check_outliers draws on
    # via subplot=; it receives the unfiltered rows (from data) and the filtered rows
    # (from good_data) of this case study
    fig2, axs=plt.subplots(1, figsize=big_figsize)
    check_outliers(data.loc[data.ID==caseStudy,:], good_data.loc[good_data.ID==caseStudy, :], data_description, P_estimated, T_estimated,  subplot=axs, font_size=font_size)
    # NOTE(review): the string concatenation below assumes caseStudy is a str -- confirm the dtype of the ID column
    axs.set_title("Power curve for statistical filtering of data for "+caseStudy)