In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score
import featuretools as ft

In [5]:
# Building metadata, indexed by 'uid'.
# dayfirst=True is deliberate: per the original author's note, the start/end
# dates are parsed incorrectly without it.
meta = pd.read_csv(
    '/Users/t.wang/Desktop/Dissertation/Python/input/meta_open.csv',
    index_col='uid',
    parse_dates=['dataend', 'datastart'],
    dayfirst=True,
)

# Energy readings for all buildings: one column per building, indexed by timestamp.
# NOTE(review): tz_convert below requires this index to be timezone-aware already - confirm.
temporal = pd.read_csv(
    '/Users/t.wang/Desktop/Dissertation/Python/input/temp_open_utc_complete.csv',
    index_col='timestamp',
    parse_dates=True,
)

# Select one building and pull its metadata row once.
single_building = 'Office_Benthe'
building_meta = meta.T[single_building]
single_timezone = building_meta.timezone
startdate = building_meta.datastart
enddate = building_meta.dataend

# Convert the building's series to its local timezone, then keep only the
# period between the recorded start and end dates. Missing values are kept.
energy_local = temporal[single_building].tz_convert(single_timezone)
single_building_energy = energy_local.truncate(before=startdate, after=enddate)

# single_building_energy = single_building_energy.dropna()
# print(startdate)
# print(single_building_energy)
# single_building_energy, some nan data


# Get weather data for given building
weatherfile_name = meta.T[single_building].newweatherfilename
weather_data = pd.read_csv(os.path.join('/Users/t.wang/Desktop/Dissertation/Python/input/',
                                        weatherfile_name),
                           index_col='timestamp', parse_dates=True, na_values='-9999')
weather_data = weather_data.tz_localize(single_timezone, ambiguous='infer')

# Whatever weather features you want, put in the list
weather_point = ['TemperatureC', 'Humidity']

# Select the requested columns in one step (column-wise), then drop rows whose
# timestamp repeats so that the reindex below does not fail on a duplicated index.
all_weather_point = weather_data[weather_point]
all_weather_point = all_weather_point[~all_weather_point.index.duplicated()]

# Resample onto a regular hourly grid with as many rows as the energy series.
# pd.date_range replaces the pd.DatetimeIndex(start=..., periods=..., freq=...)
# constructor, which was removed in pandas 1.0. The grid starts at the first
# weather timestamp, so rows are later matched to energy readings by position.
hourly_index = pd.date_range(start=all_weather_point.index[0],
                             periods=len(single_building_energy),
                             freq=pd.offsets.Hour())
# Fill gaps forward first, then backward for any leading gap.
all_weather_point = all_weather_point.reindex(hourly_index).ffill().bfill()
# print(single_building_energy.shape, all_weather_point.shape)
    
# Get schedule data for given building
schedule_name = meta.T[single_building].annualschedule
# The schedule file has no header row; its first column is the timestamp index.
schedule_data = pd.read_csv(os.path.join('/Users/t.wang/Desktop/Dissertation/Python/input/',
                                         schedule_name),
                            index_col=0, header=None, parse_dates=True)
schedule_data = schedule_data.tz_localize(single_timezone, ambiguous='infer')
schedule_data.columns = ['seasonal']

# Resample onto a regular hourly grid with as many rows as the energy series.
# pd.date_range replaces the pd.DatetimeIndex(start=..., periods=..., freq=...)
# constructor, which was removed in pandas 1.0.
schedule_index = pd.date_range(start=schedule_data.index[0],
                               periods=len(single_building_energy),
                               freq=pd.offsets.Hour())
# Fill gaps forward first, then backward for any leading gap.
schedule_data = schedule_data.reindex(schedule_index).ffill().bfill()
# print(schedule_data.shape)

# Create TimeSeriesSplit
# Distinct month numbers, in order of first appearance in the energy series.
months = np.array(single_building_energy.index.month.unique())
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

# Each entry is [train_months, test_months].
train_test_list = [[months[train_index], months[test_index]]
                   for train_index, test_index in tscv.split(months)]

# Add the 'every-fourth-month' version: hold out the 4th, 8th, 12th, ... month
# and train on the rest (the 5th month is duplicated in Clayton's notebook).
# A boolean mask gives the same 9/3 split as fixed indices for a full year,
# but does not raise IndexError when fewer than 12 months are present.
held_out = np.zeros(len(months), dtype=bool)
held_out[3::4] = True
train_test_list.append([months[~held_out], months[held_out]])

# Only the last split (the every-fourth-month one) is used downstream.
months_for_train, months_for_test = train_test_list[-1]
# print(months_for_train,months_for_test)
    
# Create features and labels. This example does not loop over every split; it uses only the last one (9 months for training, 3 months for testing).
def get_features_and_labels(train_or_test, energy=None, weather=None, schedule=None):
    """Build the feature matrix and label vector for a set of calendar months.

    Parameters
    ----------
    train_or_test : array-like of int
        Month numbers to keep; rows whose timestamp month is not in this
        collection are dropped from every input.
    energy : pandas.Series, optional
        Energy readings with a DatetimeIndex. Defaults to the notebook-level
        ``single_building_energy``.
    weather : pandas.DataFrame, optional
        Weather columns with a DatetimeIndex. Defaults to the notebook-level
        ``all_weather_point``.
    schedule : pandas.DataFrame, optional
        Frame with a ``seasonal`` column and a DatetimeIndex. Defaults to the
        notebook-level ``schedule_data``.

    Returns
    -------
    features : numpy.ndarray
        One row per sample: one-hot hour of day, one-hot day of week, the
        schedule label encoded as a number, then the weather columns.
    labels : numpy.ndarray
        The energy values for the selected months.

    Notes
    -----
    The three inputs are joined by row position, not by timestamp (their
    indexes are reset before combining), so they are expected to cover the
    same hours in the same order.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    energy = single_building_energy if energy is None else energy
    weather = all_weather_point if weather is None else weather
    schedule = schedule_data if schedule is None else schedule

    # Filter into new local names so the source data is never overwritten;
    # otherwise a second call (e.g. for the test months) would see an
    # already-reduced dataset.
    energy_n = energy[energy.index.month.isin(train_or_test)]
    weather_n = weather[weather.index.month.isin(train_or_test)]
    schedule_n = schedule[schedule.index.month.isin(train_or_test)]

    # One-hot calendar features. dtype=float keeps the final array numeric on
    # pandas versions where get_dummies defaults to bool columns.
    hour_dummies = pd.get_dummies(energy_n.index.hour, prefix='hour', dtype=float)
    dow_dummies = pd.get_dummies(energy_n.index.dayofweek, prefix='dow', dtype=float)
    features = pd.concat([hour_dummies, dow_dummies], axis=1)

    # Positional inner join with the schedule labels.
    features = pd.merge(features, schedule_n.reset_index(drop=True),
                        right_index=True, left_index=True)
    # Map schedule strings to numbers instead of one-hot encoding them: this
    # copes with buildings whose schedules use different label sets, at the
    # cost of some information. Labels missing from the map become NaN and are
    # filled from neighbouring rows below.
    features['seasonal_num'] = features['seasonal'].map(
        {'Break': 0, 'Regular': 1, 'Holiday': 2, 'Summer': 3})
    features = features.drop('seasonal', axis=1)

    # Drop the time index before concatenating so rows line up by position
    # rather than being spread across mismatching timestamps.
    features = pd.concat([features, weather_n.reset_index(drop=True)], axis=1)
    features = features.ffill().bfill()

    return np.array(features), energy_n.values


# Fit a random forest on the training months and score it on the held-out months.
X_train, y_train = get_features_and_labels(months_for_train)

# Features and label side by side, printed for a visual sanity check.
compare = pd.concat((pd.DataFrame(X_train), pd.DataFrame(y_train)), axis='columns')
print(compare)

X_test, y_test = get_features_and_labels(months_for_test)
# print(y_test)

random_forest = RandomForestRegressor(random_state=42, n_estimators=100, verbose=True)
random_forest.fit(X_train, y_train)
# Last expression of the cell: the score on the test months (R^2 for a regressor).
random_forest.score(X_test, y_test)
# predictions = random_forest.predict(X_test)
# # print(predictions)
# errors = abs(predictions - y_test)
# MAPE = 100 * np.mean((errors / y_test))
# NMBE = 100 * (sum(y_test - predictions) / (pd.Series(y_test).count() * np.mean(y_test)))
# CVRSME = 100 * ((sum((y_test - predictions)**2) / (pd.Series(y_test).count()-1))**(0.5)) / np.mean(y_test)
# RSQUARED = r2_score(y_test, predictions)
# temporary = pd.DataFrame(columns=['building_name','MAPE','NMBE','CVRSME','RSQUARED'])
# temporary.to_csv('/Users/t.wang/Desktop/Dissertation/Python/WT-result/' + 'test' +'.csv', index=False)
# metrics_prev = pd.read_csv('/Users/t.wang/Desktop/Dissertation/Python/WT-result/' + 'test' +'.csv')
# df = pd.DataFrame([[single_building, MAPE, NMBE, CVRSME, RSQUARED]],
#                   columns=['building_name','MAPE','NMBE','CVRSME','RSQUARED'])
# metrics = pd.concat([df, metrics_prev])
# metrics.to_csv('/Users/t.wang/Desktop/Dissertation/Python/WT-result/' + 'test' +'.csv', index=False)


# all_weather_point
# schedule_data
# single_building_energy
# train_test_list
# X_train,y_train
# X_train.shape,y_train.shape
# X_test,y_test
# X_test.shape,y_test.shape

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


       0    1    2    3    4    5    6    7    8    9   ...   25   26   27  \
0     1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1     0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2     0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3     0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4     0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
5     0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
6     0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
7     0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0   
8     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  ...  0.0  0.0  0.0   
9     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0   
10    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
11    0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


0.4983030361099856