In [None]:
# Import modules

#import wget
#import constants as const 
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import median_abs_deviation
import statistics
import seaborn as sns
import random
import sys
import os
from pathlib import Path
# Project root: the directory two levels above the current working directory
# (os.path.abspath('') resolves to the working directory).
PROJECT_DIR =Path(os.path.abspath('')).parents[1]

# Put the project root on the import path; the `pipeline` imports below presumably rely on it.
sys.path.append(os.fspath(PROJECT_DIR))
from pipeline.definitions import *
from pipeline.preprocessing.data_preprocessing import statistical_prepro
from pipeline.preprocessing_fx import data_exploration, check_outliers

from os import listdir
from os.path import isfile, join

import matplotlib as mpl
import colorsys
import matplotlib.colors as mc

### Select graphic settings

In [None]:
# Figure preset used by the next cell: "notebook" or "article" (journal-quality settings).
graph_setting="notebook" #or "article"

In [None]:
# Translate the selected preset into a resolution multiplier and a base font size.
if graph_setting=="article":
    # journal-quality parameter settings
    resolution_factor=2
    desired_font=10
elif graph_setting=="notebook":
    resolution_factor=1
    desired_font=12
else:
    # Fail here with a clear message instead of a NameError on resolution_factor further down.
    raise ValueError(
        f'Unknown graph_setting {graph_setting!r}: expected "article" or "notebook"'
    )

# conversion factors
cm_to_inch=0.393701
classic_proportion=6.4/4.8  # width/height ratio used for both figure sizes below
golden_rate=1.618

# Elsevier column width is 8.4 cm, double-column width is 17.7 cm (in inches: 3.31 and 6.97)
small_figsize=(resolution_factor*3.31, resolution_factor*3.31/classic_proportion)
big_figsize=(resolution_factor*6.97, resolution_factor*6.97/classic_proportion)

# font type settings for PDF/PS output (42 = TrueType) and the font family
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = "Arial"

# font size scales with the resolution factor, like the figure sizes above
font_size=resolution_factor*desired_font


# define path for figures
figures_path=FIGURES
# check existence of the figure path (only warns, the directory is not created)
if not os.path.exists(figures_path):
    print("The selected directory to store figures does not exist")

## Data Import

In [None]:
# Load the raw dataset into a pandas DataFrame and report its overall size.
raw_data = pd.read_csv(os.path.join(DATA_RAW, 'all_buildings_dataset'))
print("Data shape:")
print(raw_data.shape)

# Each distinct value of the ID column identifies one case study.
n_keys = len(raw_data.ID.unique())
print(f"\nThe dataset contains {n_keys} case studies\n")
print(
    f"Case studies contain an average of {int(raw_data.shape[0]/n_keys)} time steps "
    f"and {raw_data.shape[1]-1} variables.\n"
)

In [None]:
raw_data.columns # display the column index to check the dataset's variable names

Note that the dataset is provided at hourly resolution; hence, from the average number of time steps per case study, it can be seen that it corresponds to two years of measurements.

## Data description and check

In [None]:
# Work on a copy so that raw_data itself is left untouched.
data = raw_data.copy()
print('Complete list of column names:')
print(data.columns.values)

# Load the table describing the variables and show it without its "Comment" column.
print("Data description: ")
description_path = os.path.join(DATASETS, 'buildings_data_description.csv')
data_description = pd.read_csv(description_path)
display(data_description.drop(columns="Comment"))

#### Check statistics of variables

In [None]:
# Summary statistics, transposed so that each variable is a row.
data.describe().T

#### Counting NaN values in all columns

In [None]:
# Number of missing (NaN) entries in each column.
nan_count = data.isnull().sum()
print(nan_count)

## Data Exploration and I/O variables definition

#### Remove columns with many NaN values and then remove elements with NaN values

In [None]:
# Rebuild `data` from `raw_data` (of which it is an unmodified copy at this point) so the
# cell can be re-run safely: dropping 'UV' from `data` itself raises a KeyError on a second
# run because the column is already gone.
# Step 1: drop the 'UV' column; step 2: drop every remaining row that contains a NaN.
data = raw_data.drop(columns=['UV']).dropna(axis=0)
print(data.shape)

#### Check the dataset shape for each case study

In [None]:
# Summary statistics of every variable, computed separately for each case study (ID).
subdatasets_info = data.groupby("ID").describe()
# Number of non-null T_a samples per case study, shown as integers.
subdatasets_info.loc[:, ('T_a', 'count')].astype(int)

#### Define input(s) and output(s)

In [None]:
# Output (dependent) variable(s).
dep_var = ['P']
# Every remaining column except the case-study key (ID) is used as an input.
ind_var = [col for col in data.columns if col not in dep_var and col != "ID"]

# Case-study key of each row, kept apart from the numeric arrays.
name_data = data['ID']

# Input and output matrices, taken from the dataframe as float64 numpy arrays.
X_data = data[ind_var].to_numpy(dtype='float64')
Y_data = data[dep_var].to_numpy(dtype='float64')

#### Overview of the distribution of some variables

In [None]:
# Variables whose distribution is shown, one subplot each.
variables=['T_a', 'P', 'T_b', 'Wv', 'G', 's_H', 'c_H']

# Three panels per row. The row count is rounded up: truncating it (int(7/3) == 2) yields
# only six axes, and zip() would then silently skip the last variable.
dist_cols = 3
dist_rows = int(np.ceil(len(variables)/dist_cols))
# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid size.
fig, axs=plt.subplots(dist_rows, dist_cols, figsize=big_figsize, squeeze=False)
for var, ax in zip(variables, axs.flatten()):
    data_exploration(data, data_description, var, subplot=ax, caseStudy="random", font_size=font_size)

# Hide the grid cells that did not receive a variable.
for unused_ax in axs.flatten()[len(variables):]:
    unused_ax.set_visible(False)

plt.suptitle("Data distribution from a single case study", fontsize=font_size)
plt.tight_layout()

### Cross correlation analysis

In [None]:
# select variables for correlation analysis
# (reduced alternative: ['T_a', 'P', 'T_b', 'G', 'dayType', 'T_eq_3', 'T_eq_6', 'T_eq_12', 'T_eq_24'])
columns_for_corr=['T_a', 'P', 'T_b', 'DP', 'RH', 'Wv', 'Wgv', 'atmP', 'G', 's_Wa', 'c_Wa',
       's_H', 'c_H', 'dayType', 's_D', 'c_D', 'T_eq_3',
       'T_eq_6', 'T_eq_12', 'T_eq_24']

# display one random example
IDs=data.ID.unique()
fig, axs=plt.subplots(1, figsize=(4.8, 4))
# The case study is drawn without a fixed seed, so the sample changes between runs.
caseStudy=random.choice(IDs)
df_corr=data.loc[data.ID==caseStudy,  columns_for_corr].corr()
# NOTE(review): vmin=0.0 renders every negative correlation with the same colour as 0;
# confirm this is intended, otherwise use vmin=-1.0 (or plot df_corr.abs()).
sns.heatmap(df_corr, vmin=0.0, vmax=1.0, ax=axs)
plt.title("Sample correlation Matrix")

# display all the subcases, six heatmaps per row
# The row count is rounded up: truncating it drops the trailing case studies whenever
# len(IDs) is not a multiple of 6 and requests an invalid zero-row grid for fewer than 6.
corr_cols = 6
corr_rows = int(np.ceil(len(IDs)/corr_cols))
# 1.5 in per row equals the previous 0.25*len(IDs) height when len(IDs) is a multiple of 6.
fig, axs=plt.subplots(corr_rows, corr_cols, figsize=(9, 1.5*corr_rows), squeeze=False)
for caseStudy, ax in zip(IDs, axs.flatten()):
    df_corr=data.loc[data.ID==caseStudy, columns_for_corr].corr()
    sns.heatmap(df_corr, vmin=0.0, vmax=1.0, ax=ax, cbar=False)
    # tick labels are removed to keep the small panels readable
    ax.set_xticks([])
    ax.set_yticks([])

# Hide the grid cells that did not receive a case study.
for unused_ax in axs.flatten()[len(IDs):]:
    unused_ax.set_visible(False)

plt.suptitle("Correlation matrix from all the case studies")
plt.tight_layout()

In [None]:
# Variable pairs (var1, var2) passed to data_exploration, one pair per subplot.
tuples=pd.DataFrame([ [ "T_a","P"], [ "G", "P"], [ "Wv", "P"], [ "T_a", "T_b"],[ "Wgv", "Wv"],[ "DP", "T_a"]])

var1_list=list(tuples.iloc[:,0].values)
var2_list=list(tuples.iloc[:,1].values)

# Three panels per row. The row count is rounded up so that a pair count that is not a
# multiple of 3 neither loses plots nor requests a zero-row grid.
pair_cols = 3
pair_rows = int(np.ceil(np.shape(tuples)[0]/pair_cols))
# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid size.
fig, axs=plt.subplots(pair_rows, pair_cols, figsize=big_figsize, squeeze=False)

for var1, var2, ax in zip(var1_list, var2_list, axs.flatten()):
    data_exploration(data, data_description, var1, var2=var2, subplot=ax, font_size=font_size-2)

# Hide the grid cells that did not receive a pair.
for unused_ax in axs.flatten()[len(var1_list):]:
    unused_ax.set_visible(False)

plt.suptitle("Data distribution from a single case study", fontsize=font_size)
plt.tight_layout()