# Introduction: Testing Cyclical Encoding of Features for Machine Learning

In [1]:
import glob
import os

import numpy as np
import pandas as pd

In [2]:
# glob returns matches in filesystem-dependent order; sort them so positional
# lookups such as building_data_files[10] pick the same file on every machine.
building_data_files = sorted(glob.glob('data/building*'))
len(building_data_files)

40

In [3]:
# Load a single building's readings. The first CSV column is consumed as a
# temporary index and then discarded when `timestamp` becomes the index.
data = pd.read_csv(
    building_data_files[10],
    index_col=0,
    parse_dates=['timestamp'],
)
data = data.set_index('timestamp')
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100786 entries, 2014-01-01 06:15:00 to 2016-11-16 02:30:00
Data columns (total 2 columns):
temperature    100786 non-null float64
energy         100786 non-null float64
dtypes: float64(2)
memory usage: 2.3 MB


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin


class DateTimeFeatures(BaseEstimator, TransformerMixin):
    """Derive plain numeric date/time columns from the frame's index.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only reads the index of the frame it is given.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with ``time_of_day`` and ``day_of_year`` added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose index exposes ``hour``, ``minute`` and ``dayofyear``
            (for example a ``DatetimeIndex``).
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with two extra columns: ``time_of_day`` as
            fractional hours (``hour + minute / 60``; seconds are not used)
            and ``day_of_year`` taken from the index.
        """
        field = X.index
        # assign() builds a new frame, so the caller's DataFrame is left
        # untouched and re-running the transform is side-effect free.
        return X.assign(
            time_of_day=field.hour + field.minute / 60,
            day_of_year=field.dayofyear,
        )


class CyclicalDateTimeFeatures(BaseEstimator, TransformerMixin):
    """Add sine/cosine encodings of ``time_of_day`` and ``day_of_year``.

    The transformer is stateless: ``fit`` learns nothing. ``transform``
    expects the ``time_of_day`` and ``day_of_year`` columns to be present
    already.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with the four cyclical columns added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``time_of_day`` and ``day_of_year`` columns.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with ``sin_time_of_day``, ``cos_time_of_day``
            (period 24) and ``sin_day_of_year``, ``cos_day_of_year``
            (period 366) added, in that order.

        Raises
        ------
        KeyError
            If either input column is missing from ``X``.
        """
        sin_time, cos_time = _cyclical_encoding(X["time_of_day"], period=24)
        sin_day, cos_day = _cyclical_encoding(X["day_of_year"], period=366)
        # assign() builds a new frame, so the caller's DataFrame is left
        # untouched; keyword order fixes the order of the new columns.
        return X.assign(
            sin_time_of_day=sin_time,
            cos_time_of_day=cos_time,
            sin_day_of_year=sin_day,
            cos_day_of_year=cos_day,
        )


def _cyclical_encoding(series, period):
    """Map values onto a circle of the given period.

    Returns a ``(sine, cosine)`` pair computed from the angle
    ``2 * pi * series / period``.
    """
    full_turn = 2 * np.pi
    angle = full_turn * series / period
    sine, cosine = np.sin(angle), np.cos(angle)
    return sine, cosine

In [7]:
from sklearn.pipeline import Pipeline

# Two-step feature pipeline: the first step derives time_of_day/day_of_year
# from the index, the second adds their sine/cosine encodings.
feature_steps = [
    ("date_time_features", DateTimeFeatures()),
    ("cylical_date_time_features", CyclicalDateTimeFeatures()),
]
transforms = Pipeline(steps=feature_steps)
del feature_steps  # keep the notebook namespace exactly as before

# Both steps are stateless, so the pipeline is applied without a prior fit.
transformed_data = transforms.transform(data)
transformed_data.head()

Unnamed: 0_level_0,temperature,energy,time_of_day,day_of_year,sin_time_of_day,cos_time_of_day,sin_day_of_year,cos_day_of_year
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-01-01 06:15:00,40.987233,43.012862,6.25,1,0.997859,-0.065403,0.017166,0.999853
2014-01-01 06:30:00,41.007768,43.780204,6.5,1,0.991445,-0.130526,0.017166,0.999853
2014-01-01 06:45:00,41.002971,43.012709,6.75,1,0.980785,-0.19509,0.017166,0.999853
2014-01-01 07:00:00,41.0081,42.631804,7.0,1,0.965926,-0.258819,0.017166,0.999853
2014-01-01 07:15:00,41.005961,42.627307,7.25,1,0.94693,-0.321439,0.017166,0.999853


In [8]:
class WeeklyValidator(BaseEstimator):
    """Placeholder estimator: every method is still an empty stub."""

    def __init__(self):
        """Take no parameters and store no state."""

    def fit(self, X, y):
        """Not implemented yet: ignores its arguments and returns None."""
        return None

    def predict(self, X):
        """Not implemented yet: ignores its argument and returns None."""
        return None
    

In [59]:
def run_weekly_validation(models, data):
    """Validate each model week by week, once per feature set.

    For every feature set ("standard" and "cyclical") and every model, the
    data is split into (week-of-year, year) groups. Each group is predicted
    by a model fitted on all rows strictly earlier than the group's first
    timestamp. Groups with no earlier rows are skipped.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. Each instance
        is refitted for every week, so it ends up fitted on the last split.
    data : pandas.DataFrame
        Frame with a datetime index and the columns ``energy``,
        ``temperature``, ``time_of_day``, ``day_of_year``,
        ``sin_time_of_day``, ``cos_time_of_day``, ``sin_day_of_year`` and
        ``cos_day_of_year``. It is not modified.

    Returns
    -------
    list of pandas.DataFrame
        One frame per evaluated week with columns ``predicted``, ``actual``,
        ``model`` and ``features``, indexed like the week's rows.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``.
    """
    all_predictions = []

    feature_sets = [['time_of_day', 'day_of_year', 'temperature'],
                    ['sin_time_of_day', 'cos_time_of_day', 'sin_day_of_year', 'cos_day_of_year', 'temperature']]

    for feature_set in feature_sets:
        features = 'standard' if 'sin_time_of_day' not in feature_set else 'cyclical'
        print(f'Using features: {features}')

        # Work on a private copy restricted to the columns this pass needs.
        X = data[feature_set + ['energy']].copy()

        # Week and year must be grouped as formatted strings; the keys depend
        # only on the index, so build them once instead of once per model.
        week_keys = [X.index.strftime('%U'), X.index.strftime('%Y')]

        for model in models:
            model_name = model.__class__.__name__
            print(f'Using model: {model_name}')

            for _, week_data in tqdm.tqdm(X.groupby(week_keys), desc='Weeks'):
                # Train only on rows strictly before the week being predicted.
                train_data = X[X.index < week_data.index.min()]

                # Can not train or test on zero observations
                if len(train_data) == 0 or len(week_data) == 0:
                    continue

                # Select features and target by name rather than pop()-ing
                # columns off frames handed out by groupby/boolean indexing.
                X_train, y_train = train_data[feature_set], train_data['energy']
                X_test, y_test = week_data[feature_set], week_data['energy']

                model.fit(X_train, y_train)
                predicted = model.predict(X_test)

                # Record predictions along with actual values, model, and feature set
                all_predictions.append(
                    pd.DataFrame(dict(predicted=predicted,
                                      actual=y_test,
                                      model=model_name,
                                      features=features),
                                 index=week_data.index)
                )

    # Return list of dataframes
    return all_predictions

In [48]:
import tqdm
import black

In [60]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Regressors to compare: an ordinary least-squares baseline and a
# 100-tree random forest with a fixed seed, both using all available cores.
models = [
    LinearRegression(n_jobs=-1),
    RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        n_jobs=-1,
        random_state=100,
    ),
]
# Single-building run, currently disabled. The `validation` object used in
# later cells appears to come from this call.
# validation = run_weekly_validation(models, transformed_data)

In [None]:
def run_all_buildings(building_data_files):
    """Run the weekly validation for every building file and save each result.

    Uses the notebook-level ``transforms`` pipeline and ``models`` list.

    Parameters
    ----------
    building_data_files : iterable of str
        Paths of CSV files that contain a ``timestamp`` column plus the
        columns needed by ``transforms`` and ``run_weekly_validation``.

    Returns
    -------
    None
        One CSV is written per building. Its path is the input path with
        ``energy_data`` and then every remaining ``data`` replaced by
        ``validation_results``. A building that yields no validation frames
        is reported and skipped.
    """
    # Run validation for all buildings
    for building_file_name in tqdm.tqdm(building_data_files, desc='Buildings'):
        building_data = pd.read_csv(building_file_name, parse_dates=['timestamp']).set_index('timestamp')
        # Create sets of features
        building_data = transforms.transform(building_data)

        # Run the validation
        building_validation = run_weekly_validation(models, building_data)
        if not building_validation:
            # pd.concat raises on an empty list (e.g. less than two weeks of data).
            print(f'No validation results for {building_file_name}; skipping')
            continue

        # Convert from list of dataframes to single dataframe
        building_validation = (
            pd.concat(building_validation)
            .reset_index()
            .sort_values(['model', 'features', 'timestamp'])
            .set_index('timestamp')
        )

        # Save off results for analysis. The replace() on 'data' also rewrites
        # the directory part of the path, so make sure that directory exists.
        output_path = building_file_name.replace('energy_data', 'validation_results').replace('data', 'validation_results')
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        building_validation.to_csv(output_path)
                                   
# Run the validation over every building file matched by the glob near the top of the notebook.
run_all_buildings(building_data_files)








Buildings:   0%|                                                                                                                                   | 0/40 [00:00<?, ?it/s]

Using features: standard
Using model: LinearRegression


In [41]:
# Collapse the list of per-week frames into one frame. The guard keeps the
# cell re-runnable: once `validation` is already a DataFrame, passing it to
# pd.concat again would raise a TypeError.
if not isinstance(validation, pd.DataFrame):
    validation = pd.concat(validation)
validation.head()
validation.tail()

Unnamed: 0_level_0,predicted,actual,model,features
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-02 22:45:00,135.545103,71.810619,RandomForestRegressor,cyclical
2016-01-02 23:00:00,144.112047,96.005072,RandomForestRegressor,cyclical
2016-01-02 23:15:00,144.322802,93.70443,RandomForestRegressor,cyclical
2016-01-02 23:30:00,143.739093,94.85349,RandomForestRegressor,cyclical
2016-01-02 23:45:00,140.805196,92.93116,RandomForestRegressor,cyclical


In [43]:
# Sanity check: which model names and feature-set labels appear in the results.
validation['model'].unique()
validation['features'].unique()

array(['LinearRegression', 'RandomForestRegressor'], dtype=object)

array(['standard', 'cyclical'], dtype=object)

In [42]:
# Display the value of every top-level expression in a cell, not only the last one.
# NOTE(review): cells with several bare expressions rely on this setting to show
# all of their outputs — consider moving it to the top of the notebook.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [38]:
# Model names present in the validation results.
validation['model'].unique()

array(['LinearRegression'], dtype=object)

In [11]:
def calculate_results(results):
    """Placeholder: not implemented yet; ignores ``results`` and returns None."""
    return None

In [12]:
def graph_results(results):
    """Placeholder: not implemented yet; ignores ``results`` and returns None."""
    return None