# Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from tqdm import tqdm
import gc

In [2]:
# Code from https://www.kaggle.com/caesarlupum/ashrae-start-here-a-gentle-introduction 

# Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype spanning their values.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are touched; every other column (object, category, datetime, int8,
    unsigned, ...) is left as is. The frame is modified in place and also
    returned so the call can be used in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        If True, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The target dtype is chosen from the column's min/max only. For floats
    this is a *range* check, not a precision check: values that fit the
    float16/float32 range are still rounded to that dtype's precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's own limit
                # is representable, so it must not force a wider dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (every
                # comparison above is then False): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    # Avoid a 0/0 (NaN + RuntimeWarning) when the frame reports no memory at all.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Read Input
Read only training data to avoid RAM overhead

In [3]:
InputPath = "../input/ashrae-energy-prediction"
# Load the three training-side tables in one pass: meter readings,
# per-building metadata and hourly weather observations.
train_df, building_df, weather_train_df = (
    pd.read_csv(InputPath + csv_name)
    for csv_name in ('/train.csv', '/building_metadata.csv', '/weather_train.csv')
)

# **EDA**

In [4]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


In [5]:
train_df.describe()

Unnamed: 0,building_id,meter,meter_reading
count,20216100.0,20216100.0,20216100.0
mean,799.278,0.6624412,2117.121
std,426.9133,0.9309921,153235.6
min,0.0,0.0,0.0
25%,393.0,0.0,18.3
50%,895.0,0.0,78.775
75%,1179.0,1.0,267.984
max,1448.0,3.0,21904700.0


In [6]:
building_df.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [7]:
building_df.describe()

Unnamed: 0,site_id,building_id,square_feet,year_built,floor_count
count,1449.0,1449.0,1449.0,675.0,355.0
mean,6.952381,724.0,92111.776398,1967.957037,3.740845
std,5.003432,418.434583,110769.950997,31.05403,3.333683
min,0.0,0.0,283.0,1900.0,1.0
25%,3.0,362.0,23012.0,1949.0,1.0
50%,5.0,724.0,57673.0,1970.0,3.0
75%,13.0,1086.0,115676.0,1995.0,5.0
max,15.0,1448.0,875000.0,2017.0,26.0


In [8]:
weather_train_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [9]:
weather_train_df.describe()

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
count,139773.0,139718.0,70600.0,139660.0,89484.0,129155.0,133505.0,139469.0
mean,7.478977,14.418106,2.149306,7.350158,0.983047,1016.158038,180.526632,3.560527
std,4.604744,10.626595,2.59915,9.790235,8.463678,7.629684,111.523629,2.335874
min,0.0,-28.9,0.0,-35.0,-1.0,968.2,0.0,0.0
25%,3.0,7.2,0.0,0.6,0.0,1011.8,80.0,2.1
50%,7.0,15.0,2.0,8.3,0.0,1016.4,190.0,3.1
75%,11.0,22.2,4.0,14.4,0.0,1020.8,280.0,5.0
max,15.0,47.2,9.0,26.1,343.0,1045.5,360.0,19.0


In [10]:
# Downcast the numeric columns of both training frames before merging them
train_df, weather_train_df = (
    reduce_mem_usage(df=frame) for frame in (train_df, weather_train_df)
)

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)


# Merge Datasets

In [11]:

# Attach building metadata to every meter reading. The join key is spelled
# out (instead of relying on "all shared column names") and validated so a
# duplicated building_id in the metadata raises instead of multiplying rows.
train = pd.merge(train_df, building_df, on='building_id', how='left', validate='many_to_one')
# Attach the weather observation for the building's site at the same timestamp.
train = pd.merge(train, weather_train_df, on=['site_id', 'timestamp'], how='left', validate='many_to_one')
print(train.shape)

# The merged frame holds everything needed from here on; free the inputs.
del train_df
del weather_train_df

(20216100, 16)


In [12]:
gc.collect()

11

**Change timestamp to type timestamp**

In [13]:
train['timestamp'] = pd.to_datetime(train.timestamp)

**Extract information from timestamp**

In [14]:
# Extracting date features from timestamp
# Derive one calendar column per datetime component of `timestamp`
for date_part in ('year', 'month', 'day', 'hour', 'dayofweek'):
    train[date_part] = getattr(train['timestamp'].dt, date_part)

In [15]:
#Reduce memory usage
train = reduce_mem_usage(df=train)

Mem. usage decreased to 1156.77 Mb (48.7% reduction)


**Now we can drop timestamp**

In [16]:
train = train.drop('timestamp',axis=1)

In [17]:
gc.collect()

56

**Encode primary_use using LabelEncoder**

In [18]:
# Replace the primary_use strings with integer codes. The fitted encoder
# stays available as `le`, holding the string -> code mapping it learned
# from the training data.
le = LabelEncoder()
train["primary_use"] = le.fit_transform(train["primary_use"])

In [19]:
# Convert to categorical datatype
# The same list is later handed to lgb.Dataset as `categorical_feature`.
# NOTE(review): 'year' looks constant within the training period shown in
# the EDA output — confirm it carries any signal as a categorical feature.
cat_cols = ['meter', 'primary_use', 'site_id', 'building_id', 'year', 'month', 'day', 'hour', 'dayofweek']
for col in cat_cols:
    # Cast one column at a time instead of rebuilding the whole frame.
    train[col] = train[col].astype('category')

**Split train to target and features**

In [20]:
# Model inputs: every column except the raw meter reading
features = train.drop(columns='meter_reading')
# Regression target: log(1 + meter_reading); predictions are mapped back with expm1
target = np.log1p(train["meter_reading"])

**Now we can delete train dataframe**

In [21]:
del train

In [22]:
gc.collect()

0

In [23]:
features = reduce_mem_usage(df=features)

Mem. usage decreased to 771.24 Mb (0.0% reduction)


# **KFOLD LIGHTGBM**

In [24]:
no_splits = 3
# Default KFold does not shuffle, so each fold is a contiguous block of rows.
kf = KFold(no_splits)
LGBM = []  # one trained booster per fold, averaged at prediction time
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}
# NOTE: the loop variables are deliberately still called `train` / `test1`;
# a later cleanup cell deletes them by those names.
for train,test1 in kf.split(features):
    # KFold.split yields *positional* row numbers, so rows must be selected
    # with .iloc. (.loc would look the numbers up as index labels, which only
    # coincides with positions while the index is an untouched RangeIndex.)
    train_features = features.iloc[train]
    train_target = target.iloc[train]

    test_features = features.iloc[test1]
    test_target = target.iloc[test1]

    training = lgb.Dataset(train_features, label=train_target,categorical_feature=cat_cols, free_raw_data=False)
    testing = lgb.Dataset(test_features, label=test_target,categorical_feature=cat_cols, free_raw_data=False)

    # The Datasets now hold the data; drop the pandas slices to free RAM.
    del train_features, train_target, test_features, test_target
    gc.collect()

    # Train for up to 1000 rounds, logging every 25 and stopping once the
    # validation RMSE has not improved for 50 consecutive rounds.
    model = lgb.train(params, train_set=training, num_boost_round=1000, valid_sets=[training,testing], verbose_eval=25, early_stopping_rounds=50)
    LGBM.append(model)

    del training, testing
    gc.collect()





Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.0435	valid_1's rmse: 1.43964
[50]	training's rmse: 0.781126	valid_1's rmse: 1.29559
[75]	training's rmse: 0.685818	valid_1's rmse: 1.25442
[100]	training's rmse: 0.636625	valid_1's rmse: 1.23583
[125]	training's rmse: 0.601631	valid_1's rmse: 1.2262
[150]	training's rmse: 0.576668	valid_1's rmse: 1.22155
[175]	training's rmse: 0.557684	valid_1's rmse: 1.21923
[200]	training's rmse: 0.544121	valid_1's rmse: 1.21742
[225]	training's rmse: 0.533776	valid_1's rmse: 1.21706
[250]	training's rmse: 0.526075	valid_1's rmse: 1.21701
[275]	training's rmse: 0.519648	valid_1's rmse: 1.21729
Early stopping, best iteration is:
[235]	training's rmse: 0.530794	valid_1's rmse: 1.2169
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.03125	valid_1's rmse: 1.49865
[50]	training's rmse: 0.745036	valid_1's rmse: 1.47015
[75]	training's rmse: 0.658357	valid_1's rmse: 1.47022
Early stopping

In [25]:
#delete intermediate dataframes
# Only the trained boosters in LGBM are needed from here on.
del target
del features
# `train` and `test1` are the row-index arrays left bound by the last
# iteration of the KFold loop (the `train` DataFrame was deleted earlier).
del train
del test1
gc.collect()

0

# Read test data

In [26]:
# Load the test-side tables: rows to predict, building metadata and weather
test_df, building_df, weather_test_df = (
    pd.read_csv(InputPath + csv_name)
    for csv_name in ('/test.csv', '/building_metadata.csv', '/weather_test.csv')
)

In [27]:
#drop row_id in test_df (an identifier column, not used as a model input)

test_df = test_df.drop(columns=['row_id'])
# `gc` is already imported in the notebook's import cell; no re-import needed.
gc.collect()

0

In [28]:
# Downcast the numeric columns of both test frames before merging them
test_df, weather_test_df = (
    reduce_mem_usage(df=frame) for frame in (test_df, weather_test_df)
)

Mem. usage decreased to 437.43 Mb (54.2% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)


**Merge datasets**

In [29]:
# Attach building metadata, then weather, to every test row. Join keys are
# explicit and validated as many-to-one: a duplicated key on the right side
# would otherwise silently add rows and misalign the predictions with the
# submission file.
test = pd.merge(test_df, building_df, on='building_id', how='left', validate='many_to_one')
test = pd.merge(test, weather_test_df, on=['site_id', 'timestamp'], how='left', validate='many_to_one')
print(test.shape)

(41697600, 15)


**We don't need these dataframes anymore**

In [30]:
# Everything needed now lives in the merged `test` frame
del test_df, weather_test_df, building_df

In [31]:
gc.collect()

0

**timestamp update on test data**

In [32]:
test['timestamp'] = pd.to_datetime(test.timestamp)

In [33]:
# Derive one calendar column per datetime component of `timestamp`
for date_part in ('year', 'month', 'day', 'hour', 'dayofweek'):
    test[date_part] = getattr(test['timestamp'].dt, date_part)

In [34]:
#Reduce memory usage
test = reduce_mem_usage(df=test)

Mem. usage decreased to 2226.89 Mb (50.4% reduction)


In [35]:
#Drop timestamp from test
test = test.drop('timestamp',axis=1)

In [36]:
gc.collect()

56

**Label Encoder and categorical variables in test dataframe**

In [37]:
# Reuse the encoder that was fitted on the training data instead of fitting
# a fresh one: the models were trained on *its* string -> code mapping, and a
# re-fit on test data could assign different codes if the set of labels
# differs. transform() raises on a label never seen in training, which is
# preferable to silently scoring with mismatched codes.
test["primary_use"] = le.transform(test["primary_use"])

In [38]:
# Convert to categorical datatype
# Same column list as was used for the training features. The categories are
# derived from the test values here.
# NOTE(review): confirm LightGBM maps these onto the categories seen during
# training at predict time (e.g. 'year' takes different values in test).
cat_cols = ['meter', 'primary_use', 'site_id', 'building_id', 'year', 'month', 'day', 'hour', 'dayofweek']
for col in cat_cols:
    # Cast one column at a time instead of rebuilding the whole frame.
    test[col] = test[col].astype('category')

# Prediction on test data

In [39]:
# Score the test set in fixed-size row chunks so only one chunk's
# predictions per model are in flight at a time.
step_size = 50000
result = []
if not LGBM:
    raise ValueError("LGBM is empty: no trained models to predict with")
# Stepping the range by step_size keeps the chunk count and the slice width
# tied to the same constant (ceil(n_rows / step_size) iterations).
for start in tqdm(range(0, test.shape[0], step_size)):
    chunk = test.iloc[start:start + step_size]
    fold_preds = [model.predict(chunk) for model in LGBM]
    # Average over the models actually trained (len(LGBM)), then undo the
    # log1p transform that was applied to the training target.
    result.append(np.expm1(sum(fold_preds) / len(LGBM)))
    gc.collect()

100%|██████████| 834/834 [2:09:26<00:00,  9.31s/it]


In [40]:
result = np.concatenate(result)

**Submit to csv**

In [41]:
# Start from the sample file so row order and column layout match the expected format
submission = pd.read_csv(InputPath+'/sample_submission.csv')
submission['meter_reading'] = result
# A meter reading cannot be negative: replace any negative prediction with 0
is_negative = submission['meter_reading'] < 0
submission['meter_reading'] = submission['meter_reading'].mask(is_negative, 0)
submission.to_csv('submission.csv', index=False)