## Main Points

* Each section is self-explanatory :) 
* Using KFold + LightGBM with 3 Splits
* Clean Code
* No use of UCL data leak
* Minimum Feature Engineering
* Minimum Memory Usages

## Import Packages

In [1]:
!pip install meteocalc
from meteocalc import feels_like
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import datetime
import gc
import warnings
warnings.simplefilter("ignore")

DATA_PATH = "../input/ashrae-energy-prediction/"

Collecting meteocalc
  Downloading https://files.pythonhosted.org/packages/6c/f7/95473a929f0a02547461fa3698b7f8082ff40445ba5e21601f5d9a5e48ec/meteocalc-1.1.0.tar.gz
Building wheels for collected packages: meteocalc
  Building wheel for meteocalc (setup.py) ... [?25l- \ done
[?25h  Created wheel for meteocalc: filename=meteocalc-1.1.0-cp36-none-any.whl size=8196 sha256=c86739eb411abae4f27c96d40c16c3eca79510e386099d87f25ef725490865ef
  Stored in directory: /tmp/.cache/pip/wheels/9e/34/13/83d36ecc28837e3c2a5b696542e697538e7c1025382f4ded55
Successfully built meteocalc
Installing collected packages: meteocalc
Successfully installed meteocalc-1.1.0


## Load Data

In [2]:
train_df = pd.read_csv(DATA_PATH + 'train.csv', parse_dates=['timestamp'])

# Remove outliers
train_df = train_df [ train_df['building_id'] != 1099 ]
train_df = train_df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

building_df = pd.read_csv(DATA_PATH + 'building_metadata.csv')
weather_train = pd.read_csv(DATA_PATH+'/weather_train.csv')
weather_test = pd.read_csv(DATA_PATH+'/weather_test.csv')
weather_df = pd.concat([weather_train, weather_test], ignore_index=True)
del weather_train, weather_test
gc.collect()

44

## Utility Functions

In [3]:
# Original code from https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling by @aitude
site_ids_offsets = pd.DataFrame({'site_id': 
                                 {0: 5,
                                  1: 0,
                                  2: 9,
                                  3: 6,
                                  4: 8,
                                  5: 0,
                                  6: 6,
                                  7: 6,
                                  8: 5,
                                  9: 7,
                                  10: 8,
                                  11: 6,
                                  12: 0,
                                  13: 7,
                                  14: 6,
                                  15: 6}})
def fill_weather_dataset(weather_df):
    
    # Find Missing Dates
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(),time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(),time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

    missing_hours = []
    for site_id in range(16):
        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
        new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])
        new_rows['site_id'] = site_id
        weather_df = pd.concat([weather_df,new_rows])

        weather_df = weather_df.reset_index(drop=True)           

    # Add new Features
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df['offset'] = weather_df['site_id'].map(site_ids_offsets['site_id'])
    weather_df['datetime'] = (weather_df['datetime'] - pd.to_timedelta(weather_df['offset'], unit='H'))
    del weather_df['offset']
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["week"] = weather_df["datetime"].dt.week
    weather_df["month"] = weather_df["datetime"].dt.month
    
    # Reset Index for Fast Update
    weather_df = weather_df.set_index(['site_id','day','month'])

    air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
    weather_df.update(air_temperature_filler,overwrite=False)

    # Step 1
    cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()
    # Step 2
    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=["cloud_coverage"])

    weather_df.update(cloud_coverage_filler,overwrite=False)

    due_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=["dew_temperature"])
    weather_df.update(due_temperature_filler,overwrite=False)

    # Step 1
    precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
    # Step 2
    precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])

    weather_df.update(precip_depth_filler,overwrite=False)

    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)
    
    def get_meteorological_features(data):
        def calculate_rh(df):
            df['relative_humidity'] = 100 * (np.exp((17.625 * df['dew_temperature']) / (243.04 + df['dew_temperature'])) / np.exp((17.625 * df['air_temperature'])/(243.04 + df['air_temperature'])))
        def calculate_fl(df):
            flike_final = []
            flike = []
            # calculate Feels Like temperature
            for i in range(len(df)):
                at = df['air_temperature'][i]
                rh = df['relative_humidity'][i]
                ws = df['wind_speed'][i]
                flike.append(feels_like(at, rh, ws))
            for i in range(len(flike)):
                flike_final.append(flike[i].f)
            df['feels_like'] = flike_final
            del flike_final, flike, at, rh, ws
        calculate_rh(data)
        calculate_fl(data)
        return data

    weather_df = get_meteorological_features(weather_df)
    
    return weather_df.drop(['sea_level_pressure', 'wind_direction', 'wind_speed'], axis=1)

In [4]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def features_engineering(df):
    
    # Sort by timestamp
    df.sort_values("timestamp")
    df.reset_index(drop=True)
    
    # Add more features
    df["timestamp"] = pd.to_datetime(df["timestamp"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["timestamp"].dt.hour
    df["weekend"] = df["timestamp"].dt.weekday
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                    "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                    "2017-01-02", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                    "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                    "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                    "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                    "2019-01-01"]
    df["is_holiday"] = (df.timestamp.isin(holidays)).astype(int)
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    # Remove Unused Columns
    drop = ["timestamp"]
    df = df.drop(drop, axis=1)
    gc.collect()
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

## Fill Weather Information

I'm using [this kernel](https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling) to handle missing weather information.

In [5]:
weather_df = fill_weather_dataset(weather_df)

## Memory Reduction

In [6]:
train_df = reduce_mem_usage(train_df,use_float16=True)
building_df = reduce_mem_usage(building_df,use_float16=True)
weather_df['timestamp'] = weather_df['timestamp'].map(pd.to_datetime)
weather_df = reduce_mem_usage(weather_df,use_float16=True)

Memory usage of dataframe is 757.31 MB
Memory usage after optimization is: 435.45 MB
Decreased by 42.5%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%
Memory usage of dataframe is 25.69 MB
Memory usage after optimization is: 8.43 MB
Decreased by 67.2%


## Merge Data

We need to add building and weather information into training dataset.

In [7]:
train_df = train_df.merge(building_df, left_on='building_id',right_on='building_id',how='left')
train_df = train_df.merge(weather_df,how='left',left_on=['site_id','timestamp'],right_on=['site_id','timestamp'])

## Features Engineering

In [8]:
train_df = features_engineering(train_df)

## Features & Target Variables

In [9]:
train_df['meter_reading'] = train_df['meter_reading'].map(np.log1p)
train_df = train_df[~train_df['meter_reading'].isnull()].reset_index(drop=True)
train_df['meter_reading'].isnull().sum()

0

In [10]:
target = train_df["meter_reading"].copy()
features = train_df.drop('meter_reading', axis = 1)
del train_df
gc.collect()

0

##  KFOLD LIGHTGBM Model

In [11]:
categorical_features = ["building_id", "site_id", "meter", "primary_use", "is_holiday", "weekend"]
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

kf = KFold(n_splits=3)
oof = np.zeros(len(features))
models = []
for train_index,test_index in kf.split(features):
    train_features = features.loc[train_index]
    train_target = target.loc[train_index]
    
    test_features = features.loc[test_index]
    test_target = target.loc[test_index]
    
    d_training = lgb.Dataset(train_features, label=train_target,categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical_features, free_raw_data=False)
    
    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    models.append(model)
    oof[test_index] = model.predict(test_features, num_iteration=model.best_iteration)
    del train_features, train_target, test_features, test_target, d_training, d_test
    gc.collect()

Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.08327	valid_1's rmse: 1.23757
[50]	training's rmse: 0.88736	valid_1's rmse: 1.12763
[75]	training's rmse: 0.820613	valid_1's rmse: 1.11684
[100]	training's rmse: 0.782432	valid_1's rmse: 1.11786
[125]	training's rmse: 0.75475	valid_1's rmse: 1.12042
Early stopping, best iteration is:
[84]	training's rmse: 0.803955	valid_1's rmse: 1.11644
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.08208	valid_1's rmse: 1.19458
[50]	training's rmse: 0.894709	valid_1's rmse: 1.06438
[75]	training's rmse: 0.838289	valid_1's rmse: 1.04214
[100]	training's rmse: 0.806538	valid_1's rmse: 1.0349
[125]	training's rmse: 0.781416	valid_1's rmse: 1.03265
[150]	training's rmse: 0.762137	valid_1's rmse: 1.03243
[175]	training's rmse: 0.749642	valid_1's rmse: 1.03298
Early stopping, best iteration is:
[134]	training's rmse: 0.773201	valid_1's rmse: 1.03235
Training until validation scores don

In [12]:
print('oof_RMSE : ' ,np.sqrt(mean_squared_error(oof, target.values)))

oof_RMSE :  1.0978814674236186


In [13]:
del features, target
gc.collect()

0

## Important Features

## Load Test Data

In [14]:
test_df = pd.read_csv(DATA_PATH + 'test.csv', parse_dates=['timestamp'])
row_ids = test_df["row_id"]
test_df = reduce_mem_usage(test_df)

Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 596.49 MB
Decreased by 53.1%


In [15]:
test_df.drop("row_id", axis=1, inplace=True)

In [16]:
test_df = test_df.merge(building_df,left_on='building_id',right_on='building_id',how='left')
del building_df
gc.collect()

test_df = test_df.merge(weather_df,how='left',on=['timestamp','site_id'])
del weather_df
gc.collect()

test_df = features_engineering(test_df)

## Prediction

In [17]:
results = []
for model in models:
    if  results == []:
        results = np.expm1(model.predict(test_df, num_iteration=model.best_iteration)) / len(models)
    else:
        results += np.expm1(model.predict(test_df, num_iteration=model.best_iteration)) / len(models)
    del model
    gc.collect()

In [18]:
del test_df, models
gc.collect()

0

## Submission

In [19]:
results_df = pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(results, 0, a_max=None)})
del row_ids,results
gc.collect()

0

In [20]:
results_df.to_csv("submission.csv", index=False)
results_df

Unnamed: 0,row_id,meter_reading
0,0,158.207503
1,1,75.146335
2,2,14.353804
3,3,276.161465
4,4,1189.815077
...,...,...
41697595,41697595,6.779715
41697596,41697596,4.537068
41697597,41697597,2.853450
41697598,41697598,178.714939
