# ASHRAE ENERGY PREDICTION.

This competition is hosted on Kaggle and I decided to participate. We are given datasets containing weather and site information.

Competition link: https://www.kaggle.com/c/ashrae-energy-prediction

Data set link: https://www.kaggle.com/c/ashrae-energy-prediction/data

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
import pickle
import gc
import matplotlib.pyplot as plt
import xgboost as xgb
# import torch
import warnings
warnings.filterwarnings("ignore")

Import the data. The data I used was split in a different kernel using pandas df['timestamp'].dt.***

In [None]:
# Load the preprocessed train and test frames from their pickles.
df_train= pd.read_pickle("../data/input/df_train.pkl")
df_test = pd.read_pickle("../data/input/df_test.pkl")

In [None]:
# Load the test pickle a second time into a separate frame (df_test itself is modified later).
df_test_2 = pd.read_pickle("../data/input/df_test.pkl")

In [None]:
# Preview the first six rows of the freshly loaded test frame.
df_test_2.head(6)

The function defined below reduces the memory footprint of a dataframe.
It works by converting each column to the smallest data type that can hold its values, which frees up memory.

In [None]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to smaller dtypes.

    Per column, by NumPy dtype kind:

    * object -> ``category``
    * signed / unsigned integers -> the narrowest signed / unsigned integer
      type whose range contains the column's min and max (bounds inclusive)
    * floats -> ``float16`` (only when ``use_float16`` is set and the range
      fits), else ``float32`` when the range fits, else ``float64``
    * everything else (bool, datetime, timedelta, categorical and other
      extension dtypes) is left untouched

    Args:
        df: DataFrame to shrink; it is modified in place.
        use_float16: Allow float columns to be stored as ``float16``.  This
            trades precision for memory.

    Returns:
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    f16 = np.finfo(np.float16)
    f32 = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        if not isinstance(col_type, np.dtype):
            # Extension dtypes (categorical, tz-aware datetimes, nullable
            # ints, ...) have no plain NumPy downcast; leave them alone.
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            # Stay within the same signedness so unsigned columns are never
            # pushed through the float branch.
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, every comparison is
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # All-NaN or infinite columns end up here.
                df[col] = df[col].astype(np.float64)
        # Any other kind (bool, datetime64, timedelta64, complex, ...) is
        # intentionally skipped.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

We had a lot of missing data in the dataset. I filled the missing data with the global average value per day which is 22.4 for air temperature, 16 for dew temperature and converted the year feature into the difference between the year and 1900. Note: 1900 is the year the oldest building in the data was built

In [None]:
def fill_col(df, col, val):
    """Replace the missing entries of ``df[col]`` with ``val``.

    The column is overwritten on ``df`` itself and the same frame is returned.
    """
    patched = df[col].fillna(val)
    df[col] = patched
    return df

In [None]:
# Impute missing air temperatures with the constant 22.4 in both frames.
df_train = fill_col(df_train, "air_temperature", 22.4)
df_test = fill_col(df_test, "air_temperature", 22.4)

# Re-base the construction year to "years since 1900"; rows with a missing
# year_built stay NaN because no fill is applied to that column here.
df_train['year_built'] = df_train['year_built'] - 1900
df_test['year_built'] = df_test['year_built'] - 1900

# Impute missing dew temperatures with the constant 16 in both frames.
df_train = fill_col(df_train, 'dew_temperature', 16)
df_test = fill_col(df_test, 'dew_temperature', 16)

For feature selection, I went through the competition's discussion forum and noticed that high-ranking competitors largely agreed on which features were important to the model and which were not. I selected those features and used them for my model.

In [None]:
# Columns kept from the training frame; this list includes the target
# column, meter_reading.
good_feats_train = [
    'building_id', 'square_feet', 'meter', 'meter_reading',
    'air_temperature', 'dew_temperature', 'floor_count', 'primary_use',
    'year_built', 'DT_hour', 'site_id',
    'DT_day_week',  # bottom3 FI
    # 'cloud_coverage',  # bottom3 FI
    # 'precip_depth_1_hr'  # bottom3 FI
]

# The test frame keeps the same columns in the same order, without the
# target and with row_id placed first.
good_feats_test = ['row_id'] + [
    feat for feat in good_feats_train if feat != 'meter_reading'
]

In [None]:
def select_feats(df, feats):
    """Return an independent copy of ``df`` limited to the columns in ``feats``."""
    subset = df.loc[:, feats]
    return subset.copy()

# df_train_target = df_train['meter_reading'].copy()
# Narrow both frames to the chosen columns, then let the garbage collector
# reclaim the columns that were dropped.
df_train = select_feats(df_train, good_feats_train)
df_test = select_feats(df_test, good_feats_test)
# df_train['meter_reading'] = df_train_target
gc.collect()

Generally in machine learning competitions, it is advisable to convert your predictions target to reflect the evaluation being used in prediction loss. The competition stated that it uses the Root Mean Squared Logarithmic Error for evaluation so that is what was used to transform the meter readings.

In [None]:
# Model the target on the log1p scale; predictions are mapped back with expm1
# at inference time.  The raw target column is dropped once transformed.
df_train['meter_reading_log1p'] = np.log1p(df_train['meter_reading'])
df_train = df_train.drop("meter_reading", axis=1)

In [None]:
# Display the training columns that remain after selection and the target transform.
df_train.columns

### Feature Engineering

This is where the action starts.
As Andrew Ng said, "Applied machine learning is basically feature engineering"
After training the whole dataset using lgbm and checking the feature importance, the most important feature was the building id. This seems quite logical because energy consumption is based on building materials, number of people that contribute to the usage and so on.
I started by generating features based on building id

In [None]:
"""
Merge Columns to generate new features
"""
def generate_merge_feats(df):
    """Add categorical interaction keys built from ``building_id``.

    Creates ``building_meter`` ("<building_id>_<meter>") and
    ``building_day_week`` ("<building_id>_<DT_day_week>") on ``df`` itself and
    returns the same frame.
    """
    pairings = (
        ('building_meter', 'meter'),
        ('building_day_week', 'DT_day_week'),
    )
    for new_col, partner in pairings:
        joined = df['building_id'].map(str) + '_' + df[partner].map(str)
        df[new_col] = joined.astype('category')
    return df

# Add the building/meter and building/day-of-week keys to both frames.
df_train = generate_merge_feats(df_train)
df_test = generate_merge_feats(df_test)

In [None]:
"""
Generate statistical based features
"""
gc.collect()

# Statistics of the log target within each building/meter group.
# transform() broadcasts a group's statistic back to every row of that group,
# so each Series below has one entry per row of df_train and carries
# df_train's row index (it is not keyed by building or by building_meter).
building_meter_group = df_train.groupby('building_meter')['meter_reading_log1p']
building_mean_per_meter = building_meter_group.transform('mean').astype(np.float16)
building_median_per_meter = building_meter_group.transform('median').astype(np.float16)
building_min_per_meter = building_meter_group.transform('min').astype(np.float16)
building_max_per_meter = building_meter_group.transform('max').astype(np.float16)
building_count_per_meter = building_meter_group.transform('count').astype(np.float16)

gc.collect()

In [None]:
"""
Map these features to their counterparts
"""
def map_stats_feats(df, train=None):
    """Attach per-building/meter target statistics to ``df``.

    The mean, median, min, max and count of ``meter_reading_log1p`` are
    computed for every ``building_meter`` key of the training frame and looked
    up for each row of ``df`` through that row's own ``building_meter`` key.
    Mapping ``building_id`` through row-aligned ``groupby.transform`` output
    would instead use the mapper's row labels as lookup keys and return the
    statistic of whichever training row happens to carry that label.

    Args:
        df: Frame to extend; needs a ``building_meter`` column.  Modified in
            place.
        train: Frame providing ``building_meter`` and ``meter_reading_log1p``.
            Defaults to the notebook-level ``df_train``.

    Returns:
        ``df`` with ``building_{mean,median,min,max,count}_per_meter`` added
        as float16 columns.  Keys that do not occur in ``train`` yield NaN.

    NOTE(review): the statistics use the whole training target, so rows later
    used for validation contribute to their own features - confirm this is
    intended.
    """
    source = df_train if train is None else train
    table = (
        source.groupby('building_meter', observed=True)['meter_reading_log1p']
        .agg(['mean', 'median', 'min', 'max', 'count'])
    )
    # Use plain labels on both sides: mapping a categorical Series can return
    # a categorical result instead of floats.
    table.index = table.index.astype(object)
    keys = df['building_meter'].astype(object)
    for stat in table.columns:
        df[f'building_{stat}_per_meter'] = keys.map(table[stat]).astype(np.float16)
    return df
    
# Add the building statistic columns to both frames; the test frame receives
# values derived from the training target.
df_train = map_stats_feats(df_train)
df_test = map_stats_feats(df_test)

In [None]:
# Preview the first ten training rows with the new statistic columns.
df_train.head(10)

In [None]:
"""
Also create weather based features based on site id
"""
def create_weather_feats(data):
    """Add per-site mean/min/max of air and dew temperature as float16 columns.

    Each statistic is computed over all rows of ``data`` that share a
    ``site_id`` and broadcast back to those rows.  ``data`` is modified in
    place and returned.
    """
    for source in ('air_temperature', 'dew_temperature'):
        for stat in ('mean', 'min', 'max'):
            per_row = data.groupby('site_id')[source].transform(stat)
            data[f'{source}_{stat}'] = per_row.astype(np.float16)
    return data
    
# Site-level temperature statistics are computed separately within each frame.
df_train = create_weather_feats(df_train)
df_test = create_weather_feats(df_test)

In [None]:
"""
Add rolling features of air temperature and dew temperature at varying windows
"""
def add_lag_features(df, window=3):
    """Add rolling mean/max/min/std of the temperature columns to ``df``.

    The rolling window runs over consecutive rows inside each
    ``building_day_week`` group and includes the current row.  The window is
    counted in rows, not in time units.
    NOTE(review): this presumably expects rows to be in chronological order
    within each group - confirm against the upstream preprocessing.

    Args:
        df: Frame with ``building_day_week``, ``air_temperature`` and
            ``dew_temperature`` columns.  Modified in place.
        window: Number of rows in the rolling window.

    Returns:
        ``df`` with ``{column}_{mean,max,min,std}_lag{window}`` float16
        columns added.
    """
    key = 'building_day_week'
    cols = ['air_temperature', 'dew_temperature']
    stats = ('mean', 'max', 'min', 'std')

    # Work on a positionally indexed copy so results can be put back in the
    # caller's row order regardless of what index ``df`` carries.
    frame = df[[key] + cols].reset_index(drop=True)
    rolled = frame.groupby(key, observed=True)[cols].rolling(window=window, min_periods=0)

    results = {}
    for stat in stats:
        values = getattr(rolled, stat)()
        # The result is indexed by (group key, row position) and ordered by
        # group.  Drop the key level and restore the original row order; a
        # plain reset_index() would leave rows in group order and misalign
        # them on assignment.
        results[stat] = values.reset_index(level=0, drop=True).reindex(frame.index)

    for col in cols:
        for stat in stats:
            df[f'{col}_{stat}_lag{window}'] = results[stat][col].to_numpy().astype(np.float16)
    return df
        

# Rolling temperature statistics over a 72-row window in both frames.
df_train = add_lag_features(df_train, window=72)
df_test = add_lag_features(df_test, window=72)
# df_train = add_lag_features(df_train, window=168)

In [None]:
# add_lag_features only rolls the two temperature columns, so no
# 'meter_reading_log1p_std_lag72' column exists and a strict drop raises
# KeyError.  Drop the column only if it is present.
df_train = df_train.drop(['meter_reading_log1p_std_lag72'], axis='columns', errors='ignore')
df_test = df_test.drop(['meter_reading_log1p_std_lag72'], axis='columns', errors='ignore')

In [None]:
# Discard the rolling standard deviation of dew temperature from both frames.
df_train = df_train.drop(['dew_temperature_std_lag72'], axis='columns')
df_test = df_test.drop(['dew_temperature_std_lag72'], axis='columns')

Reduce memory usage

In [None]:
print('reducing mem usage for df_train...')
# use_float16=True lets float columns be stored as float16 when their range fits.
df_train = reduce_mem_usage(df_train, use_float16=True)

print('reducing mem usage for df_test...')
df_test = reduce_mem_usage(df_test, use_float16=True)

gc.collect()

### Training

Training of the lgb model will be done based on meter type. This is referred to as a batch. Each training batch will be trained using a time series split of 3 since this is a time series competition and each training split will produce a model which will be saved into a list. Predictions will then be made using these three models and averaged.

In [None]:
# Columns handed to LightGBM as categorical features in the training cells.
category_cols = ["building_id", "site_id", "primary_use", "building_meter", "building_day_week"]

In [None]:
def fetch_train_data_batch(data, target_meter):
    """Split out the training rows of one meter type.

    Returns a ``(features, target)`` pair: the rows of ``data`` whose
    ``meter`` equals ``target_meter`` without the ``meter_reading_log1p``
    column, and that column's values as an array.
    """
    is_target_meter = data['meter'] == target_meter
    batch = data.loc[is_target_meter]
    target = batch['meter_reading_log1p'].values
    features = batch.drop(columns='meter_reading_log1p')
    # Release the intermediate selection before returning.
    del batch
    gc.collect()
    return features, target

def fetch_test_data_batch(test_data, target_meter):
    """Return the rows of ``test_data`` whose ``meter`` equals ``target_meter``."""
    is_target_meter = test_data['meter'] == target_meter
    return test_data.loc[is_target_meter]
    

In [None]:
def fit_lgbm(train, val, devices=(-1), seed=None, cat_features=None, num_rounds=1500, lr=0.001, bf=0.1):
    """Train one LightGBM regressor on a train/validation pair.

    Args:
        train: ``(features, target)`` pair used for fitting.
        val: ``(features, target)`` pair evaluated during training and used
            for early stopping.
        devices: ``-1`` trains on CPU; any other value is passed to LightGBM
            as ``gpu_device_id`` together with ``device='gpu'``.
        seed: Random seed.  ``None`` keeps the default of 42 rather than
            overwriting it with ``None``.
        cat_features: Categorical feature names or indices forwarded to
            ``lgb.Dataset``; ``None`` falls back to LightGBM's ``'auto'``.
        num_rounds: Forwarded as ``num_boost_round``.
            NOTE(review): the parameter dict also sets ``n_estimators``,
            which LightGBM treats as an alias of the iteration count and
            which presumably takes precedence - confirm.
        lr: Accepted for interface compatibility; not used.  The learning
            rate is fixed at 0.07 below.
        bf: Accepted for interface compatibility; not used.  The row
            subsample fraction is fixed at 0.5 below.

    Returns:
        ``(model, log)`` where ``log`` holds the best training and validation
        scores.  The keys are named ``.../mae`` although the recorded metric
        is ``l2``.
    """
    X_tt, y_tt = train
    X_vl, y_vl = val
    metric = 'l2'
    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'metric': metric,
        'learning_rate': 0.07,
        # The dict used to list num_leaves twice (2**8, then 20); the later
        # entry was the one in effect, so only that one is kept.
        'num_leaves': 20,
        'max_depth': 20,  # -1
        'colsample_bytree': 0.9,
        'subsample_freq': 1,
        'subsample': 0.5,
        'n_estimators': 5000,
        'max_bin': 255,
        'verbose': -1,
        'seed': 42,
        'early_stopping_rounds': 100,
    }
    if devices != -1:
        print(f"using gpu {devices}...")  # use gpu
        params.update({'device': 'gpu', 'gpu_device_id': devices})
    if seed is not None:
        params['seed'] = seed

    categorical = 'auto' if cat_features is None else cat_features
    d_train = lgb.Dataset(X_tt, label=y_tt, categorical_feature=categorical)
    d_valid = lgb.Dataset(X_vl, label=y_vl, categorical_feature=categorical)
    watchlist = [d_train, d_valid]

    # The verbose_eval keyword no longer exists in LightGBM 4; the logging
    # callback works across versions (print_evaluation is its older name).
    make_eval_logger = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')

    print("training LGB: ")
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      callbacks=[make_eval_logger(100)])
    print("best score", model.best_score)
    log = {'train/mae': model.best_score['training'][metric],
           'valid/mae': model.best_score['valid_1'][metric]}
    return model, log

In [None]:
"""
3 things to change: target_meter, models0 and models0.append
"""
# Meter type trained in this cell, and the list collecting one model per fold.
target_meter = 0
models0 = []

X_train, y_train = fetch_train_data_batch(df_train, target_meter)
# NOTE(review): y_valid_pred_total is not read again in the visible code.
y_valid_pred_total = np.zeros(X_train.shape[0])
gc.collect()
print("The target meter is : ", target_meter)

# Positional indices of the categorical columns; the fit_lgbm call below is
# given the column names (category_cols) instead.
cat_features = [X_train.columns.get_loc(cat_col) for cat_col in category_cols]
# Splits are taken over row position.
# NOTE(review): presumably the rows are in chronological order - confirm.
tscv = TimeSeriesSplit(n_splits=3)

for train_idx, val_idx in tscv.split(X_train):
    train_data = X_train.iloc[train_idx], y_train[train_idx]
    val_data = X_train.iloc[val_idx], y_train[val_idx]
    print("Training: ", len(train_idx), " and validating: ", len(val_idx))
    model, log = fit_lgbm(train_data,val_data,cat_features=category_cols,num_rounds=1000,lr=0.05,bf=0.7)
    models0.append(model)
    gc.collect()
print("training has ended")
gc.collect()

In [None]:
"""
3 things to change: target_meter, models0 and models0.append
"""
# Meter type trained in this cell, and the list collecting one model per fold.
target_meter = 1
models1 = []

X_train, y_train = fetch_train_data_batch(df_train, target_meter)
# NOTE(review): y_valid_pred_total is not read again in the visible code.
y_valid_pred_total = np.zeros(X_train.shape[0])
gc.collect()
print("The target meter is : ", target_meter)

# Positional indices of the categorical columns; the fit_lgbm call below is
# given the column names (category_cols) instead.
cat_features = [X_train.columns.get_loc(cat_col) for cat_col in category_cols]
# Splits are taken over row position.
# NOTE(review): presumably the rows are in chronological order - confirm.
tscv = TimeSeriesSplit(n_splits=3)

for train_idx, val_idx in tscv.split(X_train):
    train_data = X_train.iloc[train_idx], y_train[train_idx]
    val_data = X_train.iloc[val_idx], y_train[val_idx]
    print("Training: ", len(train_idx), " and validating: ", len(val_idx))
    model, log = fit_lgbm(train_data,val_data,cat_features=category_cols,num_rounds=1000,lr=0.05,bf=0.7)
    models1.append(model)
    gc.collect()
print("training has ended")
gc.collect()

In [None]:
"""
3 things to change: target_meter, models0 and models0.append
"""
# Meter type trained in this cell, and the list collecting one model per fold.
target_meter = 2
models2 = []

X_train, y_train = fetch_train_data_batch(df_train, target_meter)
gc.collect()
print("The target meter is : ", target_meter)

# Positional indices of the categorical columns; the fit_lgbm call below is
# given the column names (category_cols) instead.
cat_features = [X_train.columns.get_loc(cat_col) for cat_col in category_cols]
# Splits are taken over row position.
# NOTE(review): presumably the rows are in chronological order - confirm.
tscv = TimeSeriesSplit(n_splits=3)

for train_idx, val_idx in tscv.split(X_train):
    train_data = X_train.iloc[train_idx], y_train[train_idx]
    val_data = X_train.iloc[val_idx], y_train[val_idx]
    print("Training: ", len(train_idx), " and validating: ", len(val_idx))
    model, log = fit_lgbm(train_data,val_data,cat_features=category_cols,num_rounds=1000,lr=0.05,bf=0.7)
    models2.append(model)
    gc.collect()
print("training for meter type has ended")

In [None]:
"""
3 things to change: target_meter, models0 and models0.append
"""
# Meter type trained in this cell, and the list collecting one model per fold.
target_meter = 3
models3 = []

X_train, y_train = fetch_train_data_batch(df_train, target_meter)
# NOTE(review): y_valid_pred_total is not read again in the visible code.
y_valid_pred_total = np.zeros(X_train.shape[0])
gc.collect()
print("The target meter is : ", target_meter)

# Positional indices of the categorical columns; the fit_lgbm call below is
# given the column names (category_cols) instead.
cat_features = [X_train.columns.get_loc(cat_col) for cat_col in category_cols]
# Splits are taken over row position.
# NOTE(review): presumably the rows are in chronological order - confirm.
tscv = TimeSeriesSplit(n_splits=3)

for train_idx, val_idx in tscv.split(X_train):
    train_data = X_train.iloc[train_idx], y_train[train_idx]
    val_data = X_train.iloc[val_idx], y_train[val_idx]
    print("Training: ", len(train_idx), " and validating: ", len(val_idx))
    model, log = fit_lgbm(train_data,val_data,cat_features=category_cols,num_rounds=1000,lr=0.05,bf=0.7)
    models3.append(model)
    gc.collect()
print("training has ended")
gc.collect()

In [None]:
# Persist the fold models of all four meter types, ordered by meter.
lgb_models_v1 = [models0, models1, models2, models3]
# The context manager closes the file even if pickling fails part-way.
with open('../output/lgb_models_v1.pkl', 'wb') as model_file:
    pickle.dump(lgb_models_v1, model_file)

Predictions were made with each of the three models and then averaged. The average was converted back from the log scale to its original scale (the antilogarithm).

In [None]:
def make_predictions(X_pred, models):
    """Average the predictions of ``models`` and map them back from log1p scale.

    Args:
        X_pred: Feature frame that also contains a ``row_id`` column.  The
            frame itself is not modified.
        models: Non-empty sequence of fitted estimators exposing
            ``predict(X, num_iteration=...)`` and ``best_iteration``.

    Returns:
        ``(row_ids, predictions)``: the ``row_id`` column of ``X_pred`` and
        an array of averaged predictions on the original scale, in the same
        row order.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("make_predictions needs at least one model")

    X_pred_id = X_pred['row_id']  # kept so predictions can be matched back to rows
    X_pred = X_pred.drop('row_id', axis='columns')
    final_predictions_total = np.zeros(X_pred.shape[0])
    for i, estimator in enumerate(models):
        print(f"Making predictions for model {i}")
        final_predictions_total += estimator.predict(X_pred, num_iteration=estimator.best_iteration)
    # Average in log1p space, then invert the transform.
    final_predictions_total /= len(models)
    final_predictions_total = np.expm1(final_predictions_total)
    # The models may output slightly negative log-scale values; meter
    # readings are assumed to be non-negative, so clip the result at zero.
    final_predictions_total = np.clip(final_predictions_total, 0, None)
    return X_pred_id, final_predictions_total

In [None]:
# Accumulators filled by the per-meter prediction cells: one entry of row ids
# and one entry of predictions per meter type.
all_row_ids = []
all_predictions = []

gc.collect()

In [None]:
"""
To start predicting for the test set, 2 things to change
"""
# Meter type to predict and the fold models trained for it.
target_meter = 0
use_model = models0

# Predict only the test rows of this meter type.
X_test = fetch_test_data_batch(df_test, target_meter)
y_test_id, y_test = make_predictions(X_test, use_model)

# Ids and predictions are appended together so the two lists stay aligned.
all_row_ids.append(y_test_id)
all_predictions.append(y_test)

In [None]:
"""
To start predicting for the test set, 2 things to change
"""
# Meter type to predict and the fold models trained for it.
target_meter = 1
use_model = models1

# Predict only the test rows of this meter type.
X_test = fetch_test_data_batch(df_test, target_meter)
y_test_id, y_test = make_predictions(X_test, use_model)

# Ids and predictions are appended together so the two lists stay aligned.
all_row_ids.append(y_test_id)
all_predictions.append(y_test)

In [None]:
"""
To start predicting for the test set, 2 things to change
"""
# Meter type to predict and the fold models trained for it.
target_meter = 2
use_model = models2

# Predict only the test rows of this meter type.
X_test = fetch_test_data_batch(df_test, target_meter)
y_test_id, y_test = make_predictions(X_test, use_model)

# Ids and predictions are appended together so the two lists stay aligned.
all_row_ids.append(y_test_id)
all_predictions.append(y_test)

In [None]:
"""
To start predicting for the test set, 2 things to change
"""
# Meter type to predict and the fold models trained for it.
target_meter = 3
use_model = models3

# Predict only the test rows of this meter type.
X_test = fetch_test_data_batch(df_test, target_meter)
y_test_id, y_test = make_predictions(X_test, use_model)

# Ids and predictions are appended together so the two lists stay aligned.
all_row_ids.append(y_test_id)
all_predictions.append(y_test)

In [None]:
# Flatten the per-meter entries into two parallel flat lists (ids, predictions),
# keeping the order in which the meters were appended.
all_row_ids_flat = [ids for sublist in all_row_ids for ids in sublist]
all_predictions_flat = [preds for predsublist in all_predictions for preds in predsublist]

Create the submission in the appropriate format and submit to kaggle

In [None]:
# Assemble the two-column submission frame from the flattened lists.
submission = pd.DataFrame({})
submission['row_id'] = all_row_ids_flat
submission['meter_reading'] = all_predictions_flat

gc.collect()

In [None]:
# Write the submission without the DataFrame index column.
submission.to_csv("../output/lgb_models_v2.csv", index=False)

In [None]:
!kaggle competitions submit ashrae-energy-prediction -f ../output/lgb_models_v2.csv -m "lgb_models_v2"