In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
import time


In [26]:
start = time.time()

# Building metadata, indexed by `uid`. The date columns are written day-first,
# so `dayfirst=True` is needed or the parsed dates come out scrambled.
meta_csv_path = '/Users/t.wang/Desktop/Dissertation/Python/input/meta_open.csv'
meta = pd.read_csv(
    meta_csv_path,
    index_col='uid',
    dayfirst=True,
    parse_dates=['dataend', 'datastart'],
)

# Energy data, indexed by the parsed `timestamp` column
# (an explicit `.tz_localize('utc')` is intentionally left disabled).
temporal_csv_path = '/Users/t.wang/Desktop/Dissertation/Python/input/temp_open_utc_complete.csv'
temporal = pd.read_csv(
    temporal_csv_path,
    parse_dates=True,
    index_col='timestamp',
)

def _week_mask(index, weeks):
    """Return a boolean array that is True where `index` falls in one of the ISO `weeks`."""
    if hasattr(index, 'isocalendar'):
        # pandas >= 1.1: `DatetimeIndex.week` is deprecated (removed in 2.0).
        return index.isocalendar().week.isin(weeks).to_numpy()
    # Older pandas: `.week` is the same ISO week number.
    return index.week.isin(weeks)


def _to_hourly_grid(frame, n_periods):
    """Reindex `frame` onto `n_periods` hourly stamps starting at its first index value.

    Rows that do not land on the grid are dropped; gaps are forward- then
    back-filled so the result has exactly `n_periods` rows.
    """
    # `pd.date_range` replaces `pd.DatetimeIndex(start=..., periods=..., freq=...)`,
    # which newer pandas no longer accepts. The 'H' alias is kept unchanged from
    # the original code.
    hourly_index = pd.date_range(start=frame.index[0], periods=n_periods, freq='H')
    return frame.reindex(hourly_index).ffill().bfill()


def _features_and_labels(energy, weather, schedule, weeks):
    """Build the feature matrix and label vector for the rows falling in `weeks`.

    Feature columns, in order: one-hot hour of day, one-hot day of week, the
    schedule label mapped to an integer code, then the weather columns.
    Labels are the energy readings of the selected weeks.
    """
    energy_n = energy[_week_mask(energy.index, weeks)]
    weather_n = weather[_week_mask(weather.index, weeks)]
    schedule_n = schedule[_week_mask(schedule.index, weeks)]

    # Positional (RangeIndex) joins: the time index is dropped on purpose so the
    # three sources line up row by row instead of being aligned on timestamps.
    features = pd.merge(pd.get_dummies(energy_n.index.hour),
                        pd.get_dummies(energy_n.index.dayofweek),
                        right_index=True, left_index=True)
    features = pd.merge(features, schedule_n.reset_index(drop=True),
                        right_index=True, left_index=True)
    # A single integer column (rather than dummies) keeps the feature width the
    # same even when a split does not contain every schedule label.
    features['seasonal_num'] = features.seasonal.map({'Break': 0, 'Regular': 1, 'Holiday': 2, 'Summer': 3})
    features = features.drop('seasonal', axis=1)
    features = pd.concat([features, weather_n.reset_index(drop=True)], axis=1)
    features = features.ffill().bfill()
    # Explicit float cast: newer pandas returns boolean dummies, which would
    # otherwise turn the mixed frame into an object array.
    return np.asarray(features, dtype=float), energy_n.values


def loopModels_and_Metrics(ml_Models_names, ml_Models, weatherPoints, n_timeSeriesSplits, buildingNames,
                           input_dir='/Users/t.wang/Desktop/Dissertation/Python/input/',
                           output_dir='/Users/t.wang/Desktop/Dissertation/Python/WT-result/weeks_result/'):
    """Fit `ml_Models` per building on fixed week-based splits and save the error metrics.

    For every building the energy series (from the module-level `temporal`),
    its weather file and its schedule file (both named in the module-level
    `meta`) are combined into features. The model is trained on four
    consecutive weeks and tested on the following week, for five fixed splits.
    Each split writes/updates its own CSV in `output_dir`, named
    ``<ml_Models_names>_metrics_cross_validation_<split>.csv``.

    Parameters
    ----------
    ml_Models_names : str
        Display name of the model; printed and used in the output file names.
    ml_Models : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    weatherPoints : list of str
        Weather-file columns to use as features.
    n_timeSeriesSplits : int
        Currently unused; kept so existing callers keep working.
    buildingNames : iterable of str
        Building ids; each must be a row label of `meta` and a column of `temporal`.
    input_dir : str, optional
        Directory containing the weather and schedule files.
    output_dir : str, optional
        Directory the metrics CSVs are written to (created if missing).

    Notes
    -----
    The CSV columns ``trainedMonths_`` / ``testMonths_`` hold *week* numbers;
    the names are kept for compatibility with existing result files. The file
    of each split is started afresh at the first building of a call, and later
    buildings are prepended to it.
    """
    print('\n\n' + ml_Models_names + '\n_____________')
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Manually chosen splits instead of TimeSeriesSplit:
    # train on weeks k+1..k+4, test on week k+5, for k = 0, 10, 20, 30, 40.
    weeks = range(1, 53)
    train_test_list = [
        (np.array(weeks[first:first + 4]), np.array(weeks[first + 4:first + 5]))
        for first in (0, 10, 20, 30, 40)
    ]
    metric_columns = ['buildingName', 'MAPE', 'NMBE', 'CVRSME', 'RSQUARED',
                      'trainedMonths_', 'testMonths_']

    for buildingindex, single_building in enumerate(buildingNames, start=1):
        print('Modelling:' + single_building)

        # One row lookup instead of transposing the whole metadata frame repeatedly.
        building_meta = meta.loc[single_building]
        single_timezone = building_meta.timezone

        # Energy readings in the building's local time, limited to its data period.
        # Missing readings are NOT filled here.
        # NOTE(review): `datastart`/`dataend` look timezone-naive while the index is
        # tz-aware after `tz_convert` - confirm `truncate` accepts that on your pandas.
        single_building_energy = (temporal[single_building]
                                  .tz_convert(single_timezone)
                                  .truncate(before=building_meta.datastart,
                                            after=building_meta.dataend))
        n_hours = len(single_building_energy)

        # Weather: select the requested columns once, drop repeated timestamps,
        # then snap onto an hourly grid with as many points as the energy series.
        # The grid keeps the minute offset of the first weather record, so extra
        # observations within the same hour are discarded.
        weather_data = pd.read_csv(os.path.join(input_dir, building_meta.newweatherfilename),
                                   index_col='timestamp', parse_dates=True, na_values='-9999')
        weather_data = weather_data.tz_localize(single_timezone, ambiguous='infer')
        all_weather_point = weather_data[list(weatherPoints)]
        all_weather_point = all_weather_point[~all_weather_point.index.duplicated()]
        all_weather_point = _to_hourly_grid(all_weather_point, n_hours)

        # Schedule: same hourly-grid trick as for the weather data.
        schedule_data = pd.read_csv(os.path.join(input_dir, building_meta.annualschedule),
                                    index_col=0, header=None, parse_dates=True)
        schedule_data = schedule_data.tz_localize(single_timezone, ambiguous='infer')
        schedule_data.columns = ['seasonal']
        schedule_data = _to_hourly_grid(schedule_data, n_hours)

        for index, (weeks_for_train, weeks_for_test) in enumerate(train_test_list, start=1):
            print(weeks_for_train, weeks_for_test)

            X_train, y_train = _features_and_labels(single_building_energy, all_weather_point,
                                                    schedule_data, weeks_for_train)
            X_test, y_test = _features_and_labels(single_building_energy, all_weather_point,
                                                  schedule_data, weeks_for_test)

            ml_Models.fit(X_train, y_train)
            predictions = ml_Models.predict(X_test)

            residuals = y_test - predictions
            n_obs = len(y_test)
            mean_load = np.mean(y_test)
            # MAPE uses |actual| in the denominator so negative readings cannot
            # cancel positive ones; readings of exactly zero still give inf.
            MAPE = 100 * np.mean(np.abs(residuals) / np.abs(y_test))
            NMBE = 100 * (np.sum(residuals) / (n_obs * mean_load))
            CVRSME = 100 * np.sqrt(np.sum(residuals ** 2) / (n_obs - 1)) / mean_load
            RSQUARED = r2_score(y_test, predictions)

            result_path = os.path.join(
                output_dir,
                ml_Models_names + '_metrics_cross_validation_' + str(index) + '.csv')
            df = pd.DataFrame([[single_building, MAPE, NMBE, CVRSME, RSQUARED,
                                str(weeks_for_train), str(weeks_for_test)]],
                              columns=metric_columns)
            if buildingindex == 1:
                # First building of this call: start the split's file from scratch.
                metrics = df
            else:
                # Later buildings: put the new row on top of what is already stored.
                metrics = pd.concat([df, pd.read_csv(result_path)], sort=False)
            metrics.to_csv(result_path, index=False)

    
# Models to evaluate, stored as [display name, estimator] pairs.
ml_Models_lists = [
    ['RandomForestRegressor',
     RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)],
]
# Weather-file columns used as model features.
weatherPoints = ['TemperatureC', 'Humidity']
n_timeSeriesSplits = 3
# Buildings without a schedule file are dropped; the slice then keeps a single building.
buildings_with_schedule = meta.dropna(subset=['annualschedule'])
buildingNames = buildings_with_schedule.index[100:101]

for model_name, model in ml_Models_lists:
    loopModels_and_Metrics(
        ml_Models_names=model_name,
        ml_Models=model,
        weatherPoints=weatherPoints,
        buildingNames=buildingNames,
        n_timeSeriesSplits=n_timeSeriesSplits,
    )

# Report the wall-clock time since `start` as HH:MM:SS.
end = time.time()
elapsed = end - start
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed))
print('Time for basic looping:' + elapsed_hms)
# all_weather_point
# schedule_data
# single_building_energy
# train_test_list
# X_train,y_train
# X_train.shape,y_train.shape
# X_test,y_test
# X_test.shape,y_test.shape
# buildingNames




RandomForestRegressor
_____________
Modelling:Office_Mark
[1 2 3 4] [5]
[11 12 13 14] [15]
[21 22 23 24] [25]
[31 32 33 34] [35]
[41 42 43 44] [45]
Time for basic looping:00:00:03


# Check the trained months

In [54]:
# Load one stored cross-validation metrics file and display its first ten rows.
preview_csv_path = '/Users/t.wang/Desktop/Dissertation/Python/WT-result/RandomForestRegressor_metrics_cross_validation_1.csv'
preview = pd.read_csv(preview_csv_path)
preview.head(10)

Unnamed: 0,buildingName,MAPE,NMBE,CVRSME,RSQUARED,trainedMonths_,testMonths_
0,UnivLab_Tracy,11.571059,1.746917,16.036343,0.432561,[1 2 3],[4 5 6]
1,UnivLab_Tracie,7.379363,5.686699,9.761787,0.592209,[1 2 3],[4 5 6]
2,UnivLab_Terrie,6.512372,4.410711,9.115582,0.60155,[1 2 3],[4 5 6]
3,UnivLab_Taylor,8.740142,-1.536381,10.919086,0.515439,[1 2 3],[4 5 6]
4,UnivLab_Tami,12.210618,-5.63933,14.834996,0.433792,[1 2 3],[4 5 6]
5,UnivLab_Suzette,27.012023,-17.65116,33.387075,-0.098753,[1 2 3],[4 5 6]
6,UnivLab_Susan,40.334372,-15.030792,40.729524,0.626795,[1 2 3],[4 5 6]
7,UnivLab_Santiago,7.166799,0.540198,9.153027,0.672787,[1 2 3],[4 5 6]
8,UnivLab_Priscilla,4.176357,-3.222325,5.039967,0.502588,[1 2 3],[4 5 6]
9,UnivLab_Preston,9.162368,-3.279006,11.130809,0.788552,[1 2 3],[4 5 6]
