In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import tools
#import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
init_notebook_mode(connected=True)

In [25]:
train = pd.read_csv('train.csv')
weather_train = pd.read_csv('weather_train.csv')
test = pd.read_csv('test.csv')
weather_test = pd.read_csv('weather_test.csv')
building_metadata = pd.read_csv('building_metadata.csv')
#sample_submission = pd.read_csv('sample_submission.csv')


In [26]:
building_metadata.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [27]:
weather_train.head(3)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0


In [28]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [29]:
##########Reducing memory########################
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
weather_train = reduce_mem_usage(weather_train)
weather_test = reduce_mem_usage(weather_test)
building_meta = reduce_mem_usage(building_metadata)


Mem. usage decreased to 234.13 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)


In [30]:
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


<h2> Merge building_metadata with Train and Test </h2>

In [31]:
train_df = train.merge(building_metadata, on='building_id', how='left')
train = train_df.merge(weather_train, on=['site_id', 'timestamp'], how='left')
train.head(5)
test_df = test.merge(building_metadata, on='building_id', how='left')
test = test_df.merge(weather_test, on=['site_id', 'timestamp'], how='left')
test.head(5)

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0,2017-01-01 00:00:00,0,Education,7432,2008.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
1,1,1,0,2017-01-01 00:00:00,0,Education,2720,2004.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
2,2,2,0,2017-01-01 00:00:00,0,Education,5376,1991.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
3,3,3,0,2017-01-01 00:00:00,0,Education,23685,2002.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
4,4,4,0,2017-01-01 00:00:00,0,Education,116607,1975.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609


In [32]:
train_df.head(3)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,


<h3> Light GB model</h3>

In [33]:
train.drop('timestamp',axis=1,inplace=True)
test.drop('timestamp',axis=1,inplace=True)

In [34]:
train.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
1,1,0,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
2,2,0,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
3,3,0,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
4,4,0,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0


In [39]:
columns = ['air_temperature','cloud_coverage','dew_temperature','precip_depth_1_hr','sea_level_pressure','wind_direction','wind_speed']
train.loc[:, columns] = train.loc[:, columns].interpolate(method ='linear', limit_direction ='forward') 
test.loc[:, columns] = test.loc[:, columns].interpolate(method ='linear', limit_direction ='forward') 

In [40]:
train.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
1,1,0,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
2,2,0,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
3,3,0,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
4,4,0,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0


In [41]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
le = LabelEncoder()

train['meter']= le.fit_transform(train['meter']).astype("uint8")
test['meter']= le.fit_transform(test['meter']).astype("uint8")
train['primary_use']= le.fit_transform(train['primary_use']).astype("uint8")
test['primary_use']= le.fit_transform(test['primary_use']).astype("uint8")

Wall time: 17.5 s


In [42]:
threshold = 0.9

In [43]:
correlation = train.corr().abs()
correlation.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
building_id,1.0,0.222097,0.009418,0.980923,0.058766,0.088186,0.244644,0.349324,0.293812,0.050449,0.065754,0.020894,0.094048,0.02198,0.023768
meter,0.222097,1.0,0.018949,0.253786,0.081953,0.132997,0.003765,0.177082,0.001526,0.07245,0.074484,0.002219,0.056413,0.02468,0.085853
meter_reading,0.009418,0.018949,1.0,0.012837,0.010953,0.02603,0.117408,0.134763,0.005879,0.010583,0.004619,0.000393,0.004157,0.000178,0.006647
site_id,0.980923,0.253786,0.012837,1.0,0.077566,0.091353,0.210556,0.338132,0.28414,0.01945,0.091735,0.021088,0.069183,0.021152,0.017192
primary_use,0.058766,0.081953,0.010953,0.077566,1.0,0.086802,0.069155,0.214756,0.04023,0.076369,0.003373,0.003605,0.033954,0.008688,0.04731


In [44]:
correlation.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
building_id,1.0,0.222097,0.009418,0.980923,0.058766,0.088186,0.244644,0.349324,0.293812,0.050449,0.065754,0.020894,0.094048,0.02198,0.023768
meter,0.222097,1.0,0.018949,0.253786,0.081953,0.132997,0.003765,0.177082,0.001526,0.07245,0.074484,0.002219,0.056413,0.02468,0.085853
meter_reading,0.009418,0.018949,1.0,0.012837,0.010953,0.02603,0.117408,0.134763,0.005879,0.010583,0.004619,0.000393,0.004157,0.000178,0.006647
site_id,0.980923,0.253786,0.012837,1.0,0.077566,0.091353,0.210556,0.338132,0.28414,0.01945,0.091735,0.021088,0.069183,0.021152,0.017192
primary_use,0.058766,0.081953,0.010953,0.077566,1.0,0.086802,0.069155,0.214756,0.04023,0.076369,0.003373,0.003605,0.033954,0.008688,0.04731


In [45]:
test_1 = correlation.where(np.triu(np.ones(correlation.shape), k=1).astype(np.bool))
test_1.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
building_id,,0.222097,0.009418,0.980923,0.058766,0.088186,0.244644,0.349324,0.293812,0.050449,0.065754,0.020894,0.094048,0.02198,0.023768
meter,,,0.018949,0.253786,0.081953,0.132997,0.003765,0.177082,0.001526,0.07245,0.074484,0.002219,0.056413,0.02468,0.085853
meter_reading,,,,0.012837,0.010953,0.02603,0.117408,0.134763,0.005879,0.010583,0.004619,0.000393,0.004157,0.000178,0.006647
site_id,,,,,0.077566,0.091353,0.210556,0.338132,0.28414,0.01945,0.091735,0.021088,0.069183,0.021152,0.017192
primary_use,,,,,,0.086802,0.069155,0.214756,0.04023,0.076369,0.003373,0.003605,0.033954,0.008688,0.04731


In [46]:
threshold=0.9

In [47]:
to_drop = [column for column in test_1.columns if any(test_1[column] > threshold)]

In [48]:
train.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0.0,0,0,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
1,1,0,0.0,0,0,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
2,2,0,0.0,0,0,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
3,3,0,0.0,0,0,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
4,4,0,0.0,0,0,116607,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0


In [49]:
#train.drop(to_drop,axis=1,inplace=True)
test.drop(to_drop,axis=1,inplace=True)
y = train['meter_reading']
train.drop('meter_reading',axis=1,inplace=True)

In [50]:
test.head()

Unnamed: 0,row_id,building_id,meter,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0,0,7432,2008.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
1,1,1,0,0,2720,2004.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
2,2,2,0,0,5376,1991.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
3,3,3,0,0,23685,2002.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
4,4,4,0,0,116607,1975.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609


In [65]:
! pip install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/1f/cb/a8ec24334c35a7d0c87b4e4e056bd2137573c7c1bd81c760b79a2f370254/lightgbm-2.3.1-py2.py3-none-win_amd64.whl (544kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1


You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [51]:
cat_cols = ['building_id', 'primary_use','year_built', 'meter',  'wind_direction']

In [52]:
from sklearn.model_selection import train_test_split,KFold
import lightgbm as lgb
x_train,x_test,y_train,y_test = train_test_split(train,y,test_size=0.25,random_state=42)
print (x_train.shape)
print (y_train.shape)
print (x_test.shape)
print (y_test.shape)

lgb_train = lgb.Dataset(x_train, y_train ,categorical_feature=cat_cols)
lgb_test = lgb.Dataset(x_test, y_test ,categorical_feature=cat_cols)
del x_train, x_test , y_train, y_test

params = {'feature_fraction': 0.75,
          'bagging_fraction': 0.75,
          'objective': 'regression',
          'max_depth': -1,
          'learning_rate': 0.15,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'rmse',
          "verbosity": -1,
          'reg_alpha': 0.5,
          'reg_lambda': 0.5,
          'random_state': 47,
          "num_leaves": 41}

(12275097, 14)
(12275097,)
(4091699, 14)
(4091699,)


In [64]:
reg = lgb.train(params, lgb_train, num_boost_round=3000, valid_sets=[lgb_train, lgb_test], early_stopping_rounds=100, verbose_eval = 100)


Using categorical_feature in Dataset.



Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 63488.9	valid_1's rmse: 88986.8
[200]	training's rmse: 53041.5	valid_1's rmse: 86112.5
[300]	training's rmse: 47558.6	valid_1's rmse: 85376.1
[400]	training's rmse: 41183.1	valid_1's rmse: 84684.6
[500]	training's rmse: 36266.2	valid_1's rmse: 84065.7
[600]	training's rmse: 33090.8	valid_1's rmse: 83617.5
[700]	training's rmse: 31348.1	valid_1's rmse: 83414.7
[800]	training's rmse: 29081.1	valid_1's rmse: 83113.1
[900]	training's rmse: 27056	valid_1's rmse: 82959
[1000]	training's rmse: 25449.1	valid_1's rmse: 82808.4
[1100]	training's rmse: 24168.6	valid_1's rmse: 82747.2
[1200]	training's rmse: 22947	valid_1's rmse: 82746.5
Early stopping, best iteration is:
[1157]	training's rmse: 23352.6	valid_1's rmse: 82667.3


In [53]:
test.drop('row_id',axis=1,inplace=True)

In [54]:
#del lgb_train,lgb_test

In [55]:
Submission_file = pd.DataFrame(test.index,columns=['row_id'])

In [77]:
Submission_file

Unnamed: 0,row_id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [78]:
prediction = []
step = 100000
for i in range(0, len(test), step):
    prediction.extend(np.expm1(reg.predict(test.iloc[i: min(i+step, len(test)), :], num_iteration=reg.best_iteration)))
Submission_file['meter_reading'] = prediction
Submission_file['meter_reading'].clip(lower=0,upper=None,inplace=True)
Submission_file.to_csv("Output.csv",index=None)


overflow encountered in expm1


invalid value encountered in expm1



Wall time: 19min 57s
