In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
import time
import random

In [32]:
# Wall-clock reference used for the elapsed-time report printed after the modelling loop.
start = time.time()

# Building metadata, indexed by 'uid'.
# dayfirst=True is required: without it the 'datastart'/'dataend' dates are parsed incorrectly.
meta_csv_path = '/Users/t.wang/Desktop/Dissertation/Python/input/meta_open.csv'
meta = pd.read_csv(
    meta_csv_path,
    index_col='uid',
    parse_dates=['dataend','datastart'],
    dayfirst=True,
)

# Energy data, indexed by 'timestamp'; columns are looked up by building name later on.
temporal_csv_path = '/Users/t.wang/Desktop/Dissertation/Python/input/temp_open_utc_complete.csv'
temporal = pd.read_csv(
    temporal_csv_path,
    index_col='timestamp',
    parse_dates=True,
)  # an explicit .tz_localize('utc') was left disabled by the author

def _iso_week_numbers(index):
    """Return the ISO week number of every timestamp in ``index`` as an int64 array.

    ``DatetimeIndex.week`` was removed in pandas 2.0, so ``isocalendar()`` is used
    when the installed pandas provides it, with a fallback for older versions.
    """
    if hasattr(index, 'isocalendar'):
        return index.isocalendar().week.to_numpy(dtype='int64')
    return np.asarray(index.week, dtype='int64')


def _hourly_reindex(frame, n_periods):
    """Snap ``frame`` onto an hourly grid of ``n_periods`` steps starting at its first timestamp.

    Rows sharing a timestamp are reduced to the first one (reindexing fails on a
    duplicated index). Only rows whose timestamp falls exactly on the grid survive the
    reindex, which removes extra readings recorded within the same hour; the holes that
    remain are forward-filled and then back-filled.
    """
    frame = frame[~frame.index.duplicated()]
    hourly_index = pd.date_range(start=frame.index[0], periods=n_periods, freq=pd.offsets.Hour())
    return frame.reindex(hourly_index).ffill().bfill()


def _build_features_and_labels(energy, weather, schedule, selected_weeks):
    """Assemble the model matrix and target vector for the given ISO weeks.

    Features are: one-hot hour of day, one-hot day of week, the schedule label mapped to
    a number, and the weather columns. Returns ``(features, labels)`` where ``features``
    is a 2-D float array and ``labels`` are the energy values of the selected weeks.
    """
    # Filter into new names so the full-year inputs stay intact for the next call.
    energy_n = energy[np.isin(_iso_week_numbers(energy.index), selected_weeks)]
    weather_n = weather[np.isin(_iso_week_numbers(weather.index), selected_weeks)]
    schedule_n = schedule[np.isin(_iso_week_numbers(schedule.index), selected_weeks)]

    # NOTE(review): get_dummies only emits columns for values present in the selection,
    # so a selection missing some hour/day would change the feature width - confirm the
    # selected weeks always cover all 24 hours and 7 weekdays.
    features = pd.merge(pd.get_dummies(energy_n.index.hour),
                        pd.get_dummies(energy_n.index.dayofweek), right_index=True, left_index=True)
    features = pd.merge(features, schedule_n.reset_index(drop=True), right_index=True, left_index=True)
    # A single numeric column (instead of dummies) keeps the feature width identical for
    # buildings whose schedules use different label sets; this may lose some information.
    features['seasonal_num'] = features.seasonal.map({'Break':0, 'Regular':1, 'Holiday':2, 'Summer':3})
    features = features.drop('seasonal', axis=1)
    # reset_index(drop=True) discards the time index so the frames line up positionally.
    features = pd.concat([features, weather_n.reset_index(drop=True)], axis=1)
    features = features.ffill().bfill()
    return np.array(features, dtype=float), energy_n.values


def loopModels_and_Metrics(ml_Models_names, ml_Models, weatherPoints, n_timeSeriesSplits, buildingNames,
                           input_dir='/Users/t.wang/Desktop/Dissertation/Python/input/',
                           output_dir='/Users/t.wang/Desktop/Dissertation/Python/WT-result/New_scope/Modulated_features_selection_weeks/',
                           seed=42):
    """Fit ``ml_Models`` per building on random week-based train/test splits and log CV(RMSE).

    For every building, six train/test week counts are evaluated (39/13, 39/1, 26/13,
    26/1, 13/13, 13/1). Each is repeated five times on freshly shuffled, non-overlapping
    sets of ISO weeks 1-52. The five CV(RMSE) percentages and their mean are written to
    ``<output_dir>/<ml_Models_names>_metrics_cross_validation_mean_basic.csv``, newest row
    first. The file is rewritten after every split so partial results survive a crash.

    Parameters
    ----------
    ml_Models_names : str
        Label printed to stdout and used as the prefix of the results file name.
    ml_Models : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted for every split.
    weatherPoints : list of str
        Weather file columns to use as features.
    n_timeSeriesSplits : int
        Unused; kept so existing callers keep working.
    buildingNames : iterable of str
        Building ids; each must be a row label of the module-level ``meta`` frame and a
        column of the module-level ``temporal`` frame.
    input_dir : str, optional
        Directory holding the weather and schedule CSV files named in ``meta``.
    output_dir : str, optional
        Directory the results CSV is written to.
    seed : int, optional
        Seed of the private random generator that shuffles the weeks. Every building
        gets the same sequence of splits.

    Returns
    -------
    None
    """
    print('\n\n' + ml_Models_names + '\n_____________')

    result_columns = ['buildingName','CV_1','CV_2','CV_3','CV_4','CV_5','CV_mean',
                      'trainedtestWeeks_']
    results_path = os.path.join(output_dir, ml_Models_names + '_metrics_cross_validation_mean_basic' + '.csv')
    # [train weeks, test weeks]: 9 months/3 months, 9 months/1 week, 6 months/3 months, ...
    weeks_num_split = ([39,13],[39,1],[26,13],[26,1],[13,13],[13,1])
    n_repeats = 5  # matches the CV_1..CV_5 result columns
    result_rows = []  # newest first, mirroring the order of the results file

    for single_building in buildingNames:
        print('Modelling:' + single_building)
        building_meta = meta.loc[single_building]
        single_timezone = building_meta['timezone']

        # Energy series of this building, limited to its recorded period (may contain gaps).
        single_building_energy = temporal[single_building].tz_convert(single_timezone).truncate(
            before=building_meta['datastart'], after=building_meta['dataend'])
        n_hours = len(single_building_energy)

        # Weather features, aligned to an hourly grid as long as the energy series.
        weather_data = pd.read_csv(os.path.join(input_dir, building_meta['newweatherfilename']),
                                   index_col='timestamp', parse_dates=True, na_values='-9999')
        weather_data = weather_data.tz_localize(single_timezone, ambiguous='infer')
        all_weather_point = _hourly_reindex(weather_data[list(weatherPoints)], n_hours)

        # Schedule labels, aligned the same way.
        schedule_data = pd.read_csv(os.path.join(input_dir, building_meta['annualschedule']),
                                    index_col=0, header=None, parse_dates=True)
        schedule_data = schedule_data.tz_localize(single_timezone, ambiguous='infer')
        schedule_data.columns = ['seasonal']
        schedule_data = _hourly_reindex(schedule_data, n_hours)

        weeks = list(range(1,53))
        # Private generator seeded BEFORE the first shuffle: every draw is reproducible
        # and the global `random` state is left alone.
        rng = random.Random(seed)

        for s in weeks_num_split:
            n_train, n_test = s
            CV_list = []  # CV(RMSE) in percent, one entry per repeat
            for _ in range(n_repeats):
                # Shuffle, then slice adjacent chunks so train and test weeks never overlap.
                rng.shuffle(weeks)
                weeks_for_train = weeks[:n_train]
                weeks_for_test = weeks[n_train:n_train + n_test]

                X_train, y_train = _build_features_and_labels(
                    single_building_energy, all_weather_point, schedule_data, weeks_for_train)
                X_test, y_test = _build_features_and_labels(
                    single_building_energy, all_weather_point, schedule_data, weeks_for_test)

                ml_Models.fit(X_train, y_train)
                predictions = ml_Models.predict(X_test)

                rmse = (np.sum((y_test - predictions)**2) / pd.Series(y_test).count())**0.5
                CV_list.append(100 * rmse / np.mean(y_test))
            CV_mean = np.mean(CV_list)

            result_rows.insert(0, [single_building] + CV_list + [CV_mean, s])
            pd.DataFrame(result_rows, columns=result_columns).to_csv(results_path, index=False)

    
# Models to evaluate, as [display name, estimator] pairs.
ml_Models_lists = [
    ['RandomForestRegressor', RandomForestRegressor(n_estimators=100, random_state=42,n_jobs=-1)],
]
# Weather file columns fed to the model as features.
weatherPoints = ['TemperatureC', 'Humidity','Dew PointC', 'WindDirDegrees']
n_timeSeriesSplits = 3
# Buildings without an annual schedule file cannot be modelled, so they are dropped.
buildingNames = meta.dropna(subset=['annualschedule']).index

# Author's note: unpacking the pair into separate variables before the call reportedly
# triggered a "no n_estimator" warning, so the items are passed by index instead.
for elem in ml_Models_lists:
    loopModels_and_Metrics(
        ml_Models_names=elem[0],
        ml_Models=elem[1],
        weatherPoints=weatherPoints,
        n_timeSeriesSplits=n_timeSeriesSplits,
        buildingNames=buildingNames,
    )

# Report total wall-clock time as HH:MM:SS.
end = time.time()
elapsed = end - start
elapsed_text = time.strftime("%H:%M:%S", time.gmtime(elapsed))
print('Time for basic looping:'+ elapsed_text)
# all_weather_point
# schedule_data
# single_building_energy
# train_test_list
# X_train,y_train
# X_train.shape,y_train.shape
# X_test,y_test
# X_test.shape,y_test.shape
# buildingNames




RandomForestRegressor
_____________
Modelling:Office_Abbey
Modelling:Office_Abigail
Modelling:Office_Al
Modelling:Office_Alannah
Modelling:Office_Aliyah
Modelling:Office_Allyson
Modelling:Office_Alyson
Modelling:Office_Amelia
Modelling:Office_Amelie
Modelling:Office_Anastasia
Modelling:Office_Andrea
Modelling:Office_Angelica
Modelling:Office_Angelina
Modelling:Office_Angelo
Modelling:Office_Annika
Modelling:Office_Ashanti
Modelling:Office_Asher
Modelling:Office_Aubrey
Modelling:Office_Autumn
Modelling:Office_Ava
Modelling:Office_Ayden
Modelling:Office_Ayesha
Modelling:Office_Benjamin
Modelling:Office_Benthe
Modelling:Office_Bianca
Modelling:Office_Bobbi
Modelling:Office_Brian
Modelling:Office_Bryon
Modelling:Office_Caleb
Modelling:Office_Cameron
Modelling:Office_Carissa
Modelling:Office_Carolina
Modelling:Office_Catherine
Modelling:Office_Cecelia
Modelling:Office_Charles
Modelling:Office_Clarissa
Modelling:Office_Clifton
Modelling:Office_Clinton
Modelling:Office_Cody
Modelling:Office

Modelling:UnivDorm_Alka
Modelling:UnivDorm_Alonzo
Modelling:UnivDorm_Alphonso
Modelling:UnivDorm_Alyshialynn
Modelling:UnivDorm_Alyssa
Modelling:UnivDorm_Antonio
Modelling:UnivDorm_April
Modelling:UnivDorm_Ashleigh
Modelling:UnivDorm_Avery
Modelling:UnivDorm_Camila
Modelling:UnivDorm_Candace
Modelling:UnivDorm_Cara
Modelling:UnivDorm_Carey
Modelling:UnivDorm_Carla
Modelling:UnivDorm_Carter
Modelling:UnivDorm_Casey
Modelling:UnivDorm_Cathal
Modelling:UnivDorm_Cathalina
Modelling:UnivDorm_Cecilia
Modelling:UnivDorm_Celeste
Modelling:UnivDorm_Chelsey
Modelling:UnivDorm_Cheri
Modelling:UnivDorm_Chester
Modelling:UnivDorm_Cheyenne
Modelling:UnivDorm_Christi
Modelling:UnivDorm_Christopher
Modelling:UnivDorm_Cian
Modelling:UnivDorm_Ciaran
Modelling:UnivDorm_Claudia
Modelling:UnivDorm_Clayton
Modelling:UnivDorm_Colton
Modelling:UnivDorm_Constance
Modelling:UnivDorm_Cooper
Modelling:UnivDorm_Corey
Modelling:UnivDorm_Cornelius
Modelling:UnivDorm_Curtis
Modelling:UnivDorm_Jeannette
Modelling:Univ