# Let's parse the Irish dataset

# Data files

In [1]:
%%time
from pathlib import Path
import pandas as pd
import numpy as np
from irish_preprocess import preprocess_irish_data

CPU times: user 1.47 s, sys: 324 ms, total: 1.79 s
Wall time: 2.64 s


In [2]:
# Location of the raw CER files and the three folders this notebook writes to.
irish_path = Path('Data/Irish_dataset/CER Electricity Revised March 2012')
output_path = Path('Data/Irish_dataset/before_raw_data')
result_path = Path('Data/Irish_dataset/raw_data')
preprocessed_path = Path('Data/Irish_dataset/preprocessed')
for folder in (output_path, result_path, preprocessed_path):
    folder.mkdir(exist_ok = True)

# All three pickles are read below, so the cache is only valid when every one
# of them exists; a partially written cache triggers a fresh preprocessing run
# instead of a FileNotFoundError in read_pickle.
# NOTE(review): assumes preprocess_irish_data writes all three files into
# output_path, as the reads below imply -- confirm against irish_preprocess.
cached_pickle_names = ('raw_data_df.pkl', 'raw_allocation_df.pkl', 'raw_yearly_info_df.pkl')
if not all((output_path/name).exists() for name in cached_pickle_names):
    preprocess_irish_data(irish_path, output_path)

# NOTE(review): pickle files execute code on load; only read pickles produced locally.
raw_data_df = pd.read_pickle(output_path/'raw_data_df.pkl')
allocation_df = pd.read_pickle(output_path/'raw_allocation_df.pkl')
yearly_info_df = pd.read_pickle(output_path/'raw_yearly_info_df.pkl')

  date = pd.to_datetime(


## Data df

In [3]:
# Keep only the Residential profiles that also answered the survey.
# ('Other' profiles answered the survey too, but they have LOTS of missing data.)
is_residential = allocation_df['type'] == 'Residential'
residential_profiles = allocation_df.loc[is_residential].index
surveyed_residential_profiles = yearly_info_df.index.intersection(residential_profiles)
data_df = raw_data_df.loc[surveyed_residential_profiles]


In [4]:
# Preview the selected profiles (pandas truncates the display of large frames).
data_df

date_time,2009-07-14 00:00:00,2009-07-14 00:30:00,2009-07-14 01:00:00,2009-07-14 01:30:00,2009-07-14 02:00:00,2009-07-14 02:30:00,2009-07-14 03:00:00,2009-07-14 03:30:00,2009-07-14 04:00:00,2009-07-14 04:30:00,...,2010-12-31 19:00:00,2010-12-31 19:30:00,2010-12-31 20:00:00,2010-12-31 20:30:00,2010-12-31 21:00:00,2010-12-31 21:30:00,2010-12-31 22:00:00,2010-12-31 22:30:00,2010-12-31 23:00:00,2010-12-31 23:30:00
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002,0.362,0.064,0.119,0.023,0.140,0.036,0.108,0.083,0.056,0.129,...,0.117,0.076,0.136,0.079,0.132,0.084,0.116,0.147,0.258,0.280
1003,0.692,0.381,0.380,0.379,0.346,0.266,0.280,0.346,0.383,0.378,...,1.172,0.782,0.835,0.850,0.786,0.898,0.802,0.835,0.839,0.775
1004,1.310,1.142,1.218,0.903,0.790,0.814,0.924,0.537,0.296,0.402,...,1.754,1.445,1.502,1.494,1.612,1.819,1.422,1.393,1.453,1.371
1005,0.177,0.172,0.152,0.158,0.159,0.146,0.165,0.141,0.164,0.141,...,1.633,0.828,0.677,0.950,0.507,0.634,0.455,0.604,0.520,0.611
1008,0.860,0.371,0.413,0.396,0.384,0.309,0.154,0.251,0.155,0.194,...,0.694,1.367,1.776,2.093,2.073,0.793,0.722,0.758,0.743,0.728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7436,0.168,0.166,0.165,0.164,0.164,0.206,0.243,0.241,0.238,0.237,...,1.398,2.898,0.698,0.611,0.738,0.605,0.516,0.383,0.404,0.359
7437,0.256,0.219,0.304,0.238,0.284,0.255,0.265,0.275,0.245,0.281,...,0.432,0.505,0.424,0.500,0.356,0.533,0.351,0.461,0.933,0.775
7440,0.304,0.188,0.157,0.227,0.158,0.174,0.197,0.123,2.794,0.752,...,3.698,2.718,1.155,1.107,1.019,0.623,0.526,0.533,0.514,0.750
7442,0.584,0.207,0.196,0.196,0.199,0.121,0.092,0.090,0.094,0.096,...,0.098,0.154,0.110,0.134,0.128,0.096,0.165,0.097,0.171,0.097


In [5]:
# Persist the filtered consumption data in the result folder.
data_df.to_pickle(result_path.joinpath('raw_data_df.pkl'))

# Info df 

In [6]:
# Keep only the info rows of the profiles that are present in data_df.
yearly_info_df = yearly_info_df.loc[data_df.index, :]

In [8]:
# Persist the survey info (before any numerical encoding) in the result folder.
yearly_info_df.to_pickle(result_path.joinpath('raw_info_df_features.pkl'))

In [9]:
# Replacement values for missing survey answers: -1 for the numeric columns,
# 'Unknown' for the home type.
missing_value_fill = {
    'age': -1,
    'home_type': 'Unknown',
    'build_year': -1,
    'home_age': -1,
    'floor_area': -1,
    'number_of_bedrooms': -1,
}
# Integer dtypes the numeric columns are cast to once the gaps are filled.
integer_dtypes = {
    'age': 'int8',
    'build_year': 'int16',
    'home_age': 'int8',
    'floor_area': 'int32',
    'number_of_bedrooms': 'int8',
}
# Columns that are replaced by one-hot indicator columns.
categorical_columns = ['home_type', 'cooking']

info_filled_df = yearly_info_df.fillna(missing_value_fill).astype(integer_dtypes)
one_hot_df = pd.get_dummies(info_filled_df[categorical_columns], prefix = categorical_columns)
# Append the indicator columns, then remove the columns they were derived from.
yearly_info_df = pd.concat([info_filled_df, one_hot_df], axis = 1).drop(columns = categorical_columns)
yearly_info_df.to_pickle(result_path/'raw_info_df_numerical.pkl')

## Daily info df

In [10]:
from util import transform_raw_data_and_save, check_dataset

# Build the yearly/daily data and info tables from the two pickles saved above;
# the call receives preprocessed_path as its result_path.
transformed_frames = transform_raw_data_and_save(
    raw_data_df = result_path/'raw_data_df.pkl',
    yearly_info_df = result_path/'raw_info_df_numerical.pkl',
    result_path = preprocessed_path,
    weather_city = 'Dublin',
    holiday_country = 'ireland',
    year_to_use_as_index = 2010,
)
yearly_data_df, daily_data_df, yearly_info_df, daily_info_df = transformed_frames

# Hand the four tables to the project's dataset check.
check_dataset((yearly_data_df, daily_data_df, yearly_info_df, daily_info_df))