In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import random as rn
import pickle
from tqdm import tqdm
%matplotlib inline

In [2]:
import os
os.environ['PYTHONHASHSEED'] = '0'

In [3]:
np.random.seed(42)

In [4]:
rn.seed(42)

In [5]:
pd.options.display.max_columns = None

In [6]:
build=pd.read_csv("../input/ashrae-energy-prediction/building_metadata.csv")
df=pd.read_csv("../input/ashrae-energy-prediction/test.csv")
weather=pd.read_csv("../input/ashrae-energy-prediction/weather_test.csv")

In [7]:
from xgboost import XGBRegressor

In [8]:
with open('../input/ashare-v1/feature_changes','rb') as f:
    feature_changes=pickle.load(f)
with open("../input/ashare-v1/encoders.pkl",'rb') as f:
    encoders=pickle.load(f)
with open("../input/ashare-v1/total_columns.pkl",'rb') as f:
    total_columns=pickle.load(f)
with open('../input/ashare-v1/model_params.json','rb') as f:
    model_params=pickle.load(f)
xgb1=XGBRegressor(**model_params)
xgb1.load_model('../input/ashare-v1/xgboost_0')
xgb2=XGBRegressor(**model_params)
xgb2.load_model('../input/ashare-v1/xgboost_1')
xgb3=XGBRegressor(**model_params)
xgb3.load_model('../input/ashare-v1/xgboost_2')
xgb4=XGBRegressor(**model_params)
xgb4.load_model('../input/ashare-v1/xgboost_3')
xgb5=XGBRegressor(**model_params)
xgb5.load_model('../input/ashare-v1/xgboost_4')

In [9]:
def fill_cloud(site):
    if str(feature_changes['fill_null/weather/cloud_coverage'][site])=='nan':
        return feature_changes['fill_null/weather/cloud_coverage_cloud_fill_nan']
    else:
        return feature_changes['fill_null/weather/cloud_coverage'][site]
def fill_precip(site):
    if str(feature_changes['fill_null/weather/precip_depth_1_hr'][site])=='nan':
        return feature_changes['fill_null/weather/precip_depth_1_hr_precip_fill_nan']
    else:
        return feature_changes['fill_null/weather/precip_depth_1_hr'][site]

In [10]:
df=pd.merge(left=df,right=build,left_on='building_id',right_on='building_id',how='left')

In [11]:
del build
import gc
gc.collect()

26

In [12]:
df=pd.merge(left=df,right=weather,left_on=['site_id','timestamp'],right_on=['site_id','timestamp'],how='left')

In [13]:
del weather
gc.collect()

44

In [14]:
df.isna().sum()

row_id                       0
building_id                  0
meter                        0
timestamp                    0
site_id                      0
primary_use                  0
square_feet                  0
year_built            24598080
floor_count           34444320
air_temperature         221901
cloud_coverage        19542180
dew_temperature         260799
precip_depth_1_hr      7801563
sea_level_pressure     2516826
wind_direction         2978663
wind_speed              302089
dtype: int64

In [15]:
df['timestamp']=pd.to_datetime(df['timestamp'])
df['air_temperature'].fillna(value=feature_changes['fill_null/weather/air_temperature'],inplace=True)
df['sea_level_pressure'].fillna(value=feature_changes['fill_null/weather/sea_level_pressure'],inplace=True)
df['wind_speed'].fillna(value=feature_changes['fill_null/weather/wind_speed'],inplace=True)
df['dew_temperature'].fillna(value=feature_changes['fill_null/weather/dew_temperature'],inplace=True)
df['wind_direction'].fillna(value=feature_changes['fill_null/weather/wind_direction'],inplace=True)

In [16]:
df['day_of_week']=df['timestamp'].dt.weekday
df['hour']=df['timestamp'].dt.hour
df['week_of_year']=df['timestamp'].dt.isocalendar().week
df['week_end']=np.where(df['timestamp'].dt.weekday<=4,0,1)
df.drop('timestamp',axis=1,inplace=True)

In [17]:
df.loc[df['floor_count'].isnull()==True,'floor_count']=df.loc[df['floor_count'].isnull()==True,'primary_use'].map(feature_changes['fill_null/build/floor_count'])

In [18]:
df.drop(feature_changes['remove_cols/build'],axis=1,inplace=True)

In [19]:
df.loc[df['cloud_coverage'].isnull()==True,'cloud_coverage']=df.loc[df['cloud_coverage'].isnull()==True,'site_id'].apply(fill_cloud)

In [20]:
df.loc[df['precip_depth_1_hr'].isnull()==True,'precip_depth_1_hr']=df.loc[df['precip_depth_1_hr'].isnull()==True,'site_id'].apply(fill_precip)

In [21]:
df.isna().sum()

row_id                0
building_id           0
meter                 0
site_id               0
primary_use           0
square_feet           0
floor_count           0
air_temperature       0
cloud_coverage        0
dew_temperature       0
precip_depth_1_hr     0
sea_level_pressure    0
wind_direction        0
wind_speed            0
day_of_week           0
hour                  0
week_of_year          0
week_end              0
dtype: int64

In [22]:
total_columns

['building_id',
 'meter',
 'site_id',
 'primary_use',
 'square_feet',
 'floor_count',
 'air_temperature',
 'cloud_coverage',
 'dew_temperature',
 'precip_depth_1_hr',
 'sea_level_pressure',
 'wind_direction',
 'wind_speed',
 'day_of_week',
 'hour',
 'week_of_year',
 'week_end']

In [35]:
sub=pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')

In [24]:
final_sub=[]

In [25]:
for col,encoder in encoders.items():
    df[col]=encoder.transform(df[col])

In [26]:
df.head()

Unnamed: 0,row_id,building_id,meter,site_id,primary_use,square_feet,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,day_of_week,hour,week_of_year,week_end
0,0,0,0,0,0,7432,2.0,17.8,4.0,11.7,1.382557,1021.4,100.0,3.6,6,0,51,1
1,1,1,0,0,0,2720,2.0,17.8,4.0,11.7,1.382557,1021.4,100.0,3.6,6,0,51,1
2,2,2,0,0,0,5376,2.0,17.8,4.0,11.7,1.382557,1021.4,100.0,3.6,6,0,51,1
3,3,3,0,0,0,23685,2.0,17.8,4.0,11.7,1.382557,1021.4,100.0,3.6,6,0,51,1
4,4,4,0,0,0,116607,2.0,17.8,4.0,11.7,1.382557,1021.4,100.0,3.6,6,0,51,1


In [36]:
sub.head()

Unnamed: 0,row_id,meter_reading
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [38]:
batch_size=1_00_000
n_batches=int(np.ceil(df.shape[0]/batch_size))
b=df.shape[0]
for batch in tqdm(range(n_batches)):
    a=batch*batch_size
    curr_b=(batch+1)*batch_size
    if b<=curr_b:
        curr_b=b
    temp_data=df.iloc[a:curr_b,:].reset_index(drop=True)
    temp_sub=sub.loc[sub['row_id'].isin(temp_data['row_id'].values),:]
    temp_sub.reset_index(drop=True,inplace=True)
    temp_data=temp_data[total_columns].values
    p1=np.expm1(xgb1.predict(temp_data))
    p2=np.expm1(xgb2.predict(temp_data))
    p3=np.expm1(xgb3.predict(temp_data))
    p4=np.expm1(xgb4.predict(temp_data))
    p5=np.expm1(xgb5.predict(temp_data))
    p=(p1+p2+p3+p4+p5)/5.0
    temp_sub['meter_reading']=p
    final_sub.append(temp_sub)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 417/417 [29:08<00:00,  4.19s/it]


In [39]:
final_submission=pd.concat(final_sub,ignore_index=True)

In [40]:
final_submission.head()

Unnamed: 0,row_id,meter_reading
0,0,18.561251
1,1,14.36117
2,2,6.0053
3,3,42.369812
4,4,101.905449


In [None]:
final_submission.isna().sum()

In [41]:
final_submission.to_csv('submission.csv',index=False)