# Train Model based on meter_reading

In [1]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import datetime
from datetime import datetime as dt
import gc

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

# Needed Functions

In [2]:
# get missing data
def missing_statistics(df):    
    statitics = pd.DataFrame(df.isnull().sum()).reset_index()
    statitics.columns=['COLUMN NAME',"MISSING VALUES"]
    statitics['TOTAL ROWS'] = df.shape[0]
    statitics['% MISSING'] = round((statitics['MISSING VALUES']/statitics['TOTAL ROWS'])*100,2)
    return statitics

# fix missing weather data
def fill_weather_dataset(weather_df):
    
    # Find Missing Dates
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(),time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(),time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

    missing_hours = []
    for site_id in range(16):
        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
        new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])
        new_rows['site_id'] = site_id
        weather_df = pd.concat([weather_df,new_rows])

        weather_df = weather_df.reset_index(drop=True)           

    # Add new Features
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["week"] = weather_df["datetime"].dt.week
    weather_df["month"] = weather_df["datetime"].dt.month
    
    # Reset Index for Fast Update
    weather_df = weather_df.set_index(['site_id','day','month'])

    air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
    weather_df.update(air_temperature_filler,overwrite=False)

    # Step 1
    cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()
    # Step 2
    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=["cloud_coverage"])

    weather_df.update(cloud_coverage_filler,overwrite=False)

    due_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=["dew_temperature"])
    weather_df.update(due_temperature_filler,overwrite=False)

    # Step 1
    sea_level_filler = weather_df.groupby(['site_id','day','month'])['sea_level_pressure'].mean()
    # Step 2
    sea_level_filler = pd.DataFrame(sea_level_filler.fillna(method='ffill'),columns=['sea_level_pressure'])

    weather_df.update(sea_level_filler,overwrite=False)

    wind_direction_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_direction'].mean(),columns=['wind_direction'])
    weather_df.update(wind_direction_filler,overwrite=False)

    wind_speed_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_speed'].mean(),columns=['wind_speed'])
    weather_df.update(wind_speed_filler,overwrite=False)

    # Step 1
    precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
    # Step 2
    precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])

    weather_df.update(precip_depth_filler,overwrite=False)

    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)
        
    return weather_df

# reduce memory
def reduce_mem_usage(df, use_float16=False):
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

# Load Training Data

In [3]:
PATH = PATH = './data/'

train_df = pd.read_csv(PATH + 'train.csv')

building_df = pd.read_csv(PATH + 'building_metadata.csv')

weather_df = pd.read_csv(PATH + 'weather_train.csv')

In [4]:
# fix weather data
weather_df = fill_weather_dataset(weather_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [5]:
# Outlier removal
train_df = train_df [ train_df['building_id'] != 1099 ]
train_df = train_df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

# reduce Memory

In [6]:
train_df = reduce_mem_usage(train_df,use_float16=True)
building_df = reduce_mem_usage(building_df,use_float16=True)
weather_df = reduce_mem_usage(weather_df,use_float16=True)

Memory usage of dataframe is 757.31 MB
Memory usage after optimization is: 322.24 MB
Decreased by 57.4%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%
Memory usage of dataframe is 9.65 MB
Memory usage after optimization is: 2.66 MB
Decreased by 72.5%


In [7]:
# join data
train_df = train_df.merge(building_df, left_on='building_id',right_on='building_id',how='left')
train_df = train_df.merge(weather_df,how='left',left_on=['site_id','timestamp'],right_on=['site_id','timestamp'])

In [8]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,105,0,2016-01-01 00:00:00,23.3036,1,Education,50623,,5.0,3.800781,0.0,2.400391,0.0,1021.0,240.0,3.099609
1,106,0,2016-01-01 00:00:00,0.3746,1,Education,5374,,4.0,3.800781,0.0,2.400391,0.0,1021.0,240.0,3.099609
2,106,3,2016-01-01 00:00:00,0.0,1,Education,5374,,4.0,3.800781,0.0,2.400391,0.0,1021.0,240.0,3.099609
3,107,0,2016-01-01 00:00:00,175.184006,1,Education,97532,2005.0,10.0,3.800781,0.0,2.400391,0.0,1021.0,240.0,3.099609
4,108,0,2016-01-01 00:00:00,91.265297,1,Education,81580,1913.0,5.0,3.800781,0.0,2.400391,0.0,1021.0,240.0,3.099609


In [9]:
# holiday calendar import --> some kernels say that all buildings are located in the US
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# feature function
def select_features(df_given, drop_cols, build_lags_for, window=3):
    # copy data
    df = df_given.copy()
    # Sort by timestamp
    df.sort_values("timestamp")
    # reset index
    df.reset_index(drop=True)
    
    # seperate the timestamp into individual features
    df["timestamp"] = pd.to_datetime(df["timestamp"],format="%Y-%m-%d %H:%M:%S")
    
    df['hour'] = np.uint8(df['timestamp'].dt.hour)
    df['day'] = np.uint8(df['timestamp'].dt.day)
    df['weekday'] = np.uint8(df['timestamp'].dt.weekday)
    df['month'] = np.uint8(df['timestamp'].dt.month)
    
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    # build in holiday feature
    min_date = pd.to_datetime(df["timestamp"].iloc[0],format="%Y-%m-%d")
    max_date = pd.to_datetime(df["timestamp"].iloc[-1],format="%Y-%m-%d")
    
    date_range = pd.date_range(start=min_date, end=max_date)
    us_holidays = calendar().holidays(start=date_range.min(), end=date_range.max())
    
    df['is_holiday'] = (df['timestamp'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)
    
    # build weekend classification (weekday starts at 0)
    df['is_weekend'] = np.where( (df["weekday"] == 5) | (df["weekday"] == 6), 1, 0 )
    
    # build lags
    if build_lags_for:
        rolled = df[build_lags_for].rolling(window=window, min_periods=0)
        lag_mean = rolled.mean().reset_index().astype(np.float16)
        lag_std = rolled.std().reset_index().astype(np.float16)
        
        for col in build_lags_for:
            df[f'{col}_mean_lag{window}'] = lag_mean[col]
        
    df = df.drop(drop_cols, axis=1)
    
    # transform the primary usage to a number
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    gc.collect()
    return df

In [10]:
# model build
def build_model(t, f, categorical, params, splits):
    # measure time
    total_start = dt.now()
    
    # t = target
    # f = features
    # categorical = categorical features of the dataset
    # params = model parameters --> used to change to find best fit
    # splits = kfolf number of splits
    
    kf = KFold(n_splits=splits)
    # collect models
    models = []
    
    for train_index,test_index in kf.split(f):
        model_start = dt.now()
        # train
        train_features = f.loc[train_index]
        train_target = t.loc[train_index]
        
        # test
        test_features = f.loc[test_index]
        test_target = t.loc[test_index]
        
        d_train = lgb.Dataset(train_features, label=train_target, categorical_feature=categorical, free_raw_data=False)
        d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical, free_raw_data=False)
        # explanation:
        '''
            label: (list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None)) – Label of the data.
            loading numpy array in the datasettt
            free_raw_data: saves memory if set to true --> i´m not concerned on my pc
            details:
            https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Dataset.html
        '''
        model = lgb.train(
             params, 
             train_set=d_train, 
             num_boost_round=1000, 
             valid_sets=[d_train,d_test], 
             verbose_eval=25, 
             early_stopping_rounds=50
        )
        # append model to list
        models.append(model)
        del train_features, train_target, test_features, test_target, d_train, d_test
        gc.collect()
        
        model_end = dt.now()
        print('KFold time:', model_end - model_start)
        
    total_end = dt.now()
    print('Total time:', total_end - total_start)
    
    return models

In [11]:
# build features
train_df_features = select_features(
    train_df,
    ["timestamp","sea_level_pressure", "wind_direction", "wind_speed","year_built","floor_count",
     'precip_depth_1_hr', 'is_weekend', 'cloud_coverage', 'weekday'],
    False
)

In [12]:
train_df_features.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,air_temperature,dew_temperature,hour,day,month,is_holiday
0,105,0,23.3036,1,0,10.832181,3.800781,2.400391,0,1,1,1
1,106,0,0.3746,1,0,8.589514,3.800781,2.400391,0,1,1,1
2,106,3,0.0,1,0,8.589514,3.800781,2.400391,0,1,1,1
3,107,0,175.184006,1,0,11.487946,3.800781,2.400391,0,1,1,1
4,108,0,91.265297,1,0,11.309352,3.800781,2.400391,0,1,1,1


In [13]:
# seperate data into meter types
train_df_features_0 = train_df_features[ (train_df_features['meter'] == 0) ]
train_df_features_1 = train_df_features[ (train_df_features['meter'] == 1) ]
train_df_features_2 = train_df_features[ (train_df_features['meter'] == 2) ]
train_df_features_3 = train_df_features[ (train_df_features['meter'] == 3) ]

In [14]:
# params
categorical_features = [
    "building_id", "site_id", 'primary_use', 'hour', 'day', 'month', 'meter','is_holiday'
]

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 70,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}


In [15]:
# model meter = 0
target_0 = np.log1p(train_df_features_0["meter_reading"])
features_0 = train_df_features_0.drop('meter_reading', axis = 1)

test_0 = build_model(target_0, features_0, categorical_features, params, 3)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user

Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.774465	valid_1's rmse: 0.796547
[50]	training's rmse: 0.437352	valid_1's rmse: 0.513269
[75]	training's rmse: 0.361814	valid_1's rmse: 0.480464
[100]	training's rmse: 0.32707	valid_1's rmse: 0.475139
[125]	training's rmse: 0.30493	valid_1's rmse: 0.472645
[150]	training's rmse: 0.290375	valid_1's rmse: 0.472929
[175]	training's rmse: 0.278727	valid_1's rmse: 0.473615
Early stopping, best iteration is:
[128]	training's rmse: 0.303427	valid_1's rmse: 0.472555
KFold time: 0:00:57.623770
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.789774	valid_1's rmse: 0.749478
[50]	training's rmse: 0.459188	valid_1's rmse: 0.436302
[75]	training's rmse: 0.382549	valid_1's rmse: 0.396007
[100]	training's rmse: 0.345054	valid_1's rmse: 0.391902
[125]	training's rmse: 0.320411	valid_1's rmse: 0.389116
[150]	training's rmse: 0.303798	valid_1's rmse: 0.388411
[175]	training's rmse: 0.2

In [17]:
# model meter = 1
target_1 = np.log1p(train_df_features_1["meter_reading"])
features_1 = train_df_features_1.drop('meter_reading', axis = 1)

test_1 = build_model(target_1, features_1, categorical_features, params, 3)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user

Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.655676	valid_1's rmse: 0.607735
[50]	training's rmse: 0.409242	valid_1's rmse: 0.515308
[75]	training's rmse: 0.347318	valid_1's rmse: 0.509879
[100]	training's rmse: 0.323444	valid_1's rmse: 0.508482
[125]	training's rmse: 0.307851	valid_1's rmse: 0.508375
[150]	training's rmse: 0.29746	valid_1's rmse: 0.508265
Early stopping, best iteration is:
[106]	training's rmse: 0.319243	valid_1's rmse: 0.507956
KFold time: 0:00:16.789942
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.646679	valid_1's rmse: 0.632934
[50]	training's rmse: 0.417553	valid_1's rmse: 0.470837
[75]	training's rmse: 0.361799	valid_1's rmse: 0.449158
[100]	training's rmse: 0.338581	valid_1's rmse: 0.444437
[125]	training's rmse: 0.322761	valid_1's rmse: 0.441989
[150]	training's rmse: 0.311782	valid_1's rmse: 0.440381
[175]	training's rmse: 0.301519	valid_1's rmse: 0.438865
[200]	training's rmse: 0.

In [18]:
# model meter = 2
target_2 = np.log1p(train_df_features_2["meter_reading"])
features_2 = train_df_features_2.drop('meter_reading', axis = 1)

test_2 = build_model(target_2, features_2, categorical_features, params, 3)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user

Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.703757	valid_1's rmse: 0.788235
[50]	training's rmse: 0.344217	valid_1's rmse: 0.479491
[75]	training's rmse: 0.273145	valid_1's rmse: 0.442246
[100]	training's rmse: 0.251825	valid_1's rmse: 0.437213
[125]	training's rmse: 0.23807	valid_1's rmse: 0.436948
[150]	training's rmse: 0.227956	valid_1's rmse: 0.434573
[175]	training's rmse: 0.222068	valid_1's rmse: 0.433653
[200]	training's rmse: 0.217391	valid_1's rmse: 0.433152
[225]	training's rmse: 0.213292	valid_1's rmse: 0.432893
[250]	training's rmse: 0.209685	valid_1's rmse: 0.432756
[275]	training's rmse: 0.206174	valid_1's rmse: 0.432638
[300]	training's rmse: 0.203205	valid_1's rmse: 0.432352
[325]	training's rmse: 0.200295	valid_1's rmse: 0.432229
[350]	training's rmse: 0.197876	valid_1's rmse: 0.432144
[375]	training's rmse: 0.19582	valid_1's rmse: 0.4321
[400]	training's rmse: 0.193744	valid_1's rmse: 0.431889
[425]	training's rmse: 0.191521	va

In [19]:
# model meter = 3
target_3 = np.log1p(train_df_features_3["meter_reading"])
features_3 = train_df_features_3.drop('meter_reading', axis = 1)

test_3 = build_model(target_3, features_3, categorical_features, params, 3)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user

Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.438294	valid_1's rmse: 0.43614
[50]	training's rmse: 0.248263	valid_1's rmse: 0.291504
[75]	training's rmse: 0.208918	valid_1's rmse: 0.278918
[100]	training's rmse: 0.193178	valid_1's rmse: 0.278378
[125]	training's rmse: 0.182221	valid_1's rmse: 0.278574
[150]	training's rmse: 0.175375	valid_1's rmse: 0.278243
[175]	training's rmse: 0.169775	valid_1's rmse: 0.278188
[200]	training's rmse: 0.165154	valid_1's rmse: 0.278507
Early stopping, best iteration is:
[161]	training's rmse: 0.172386	valid_1's rmse: 0.27813
KFold time: 0:00:04.990121
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 0.437489	valid_1's rmse: 0.44987
[50]	training's rmse: 0.249931	valid_1's rmse: 0.284946
[75]	training's rmse: 0.212139	valid_1's rmse: 0.263482
[100]	training's rmse: 0.197353	valid_1's rmse: 0.259568
[125]	training's rmse: 0.187636	valid_1's rmse: 0.257743
[150]	training's rmse: 0.18

# OK, this looks better
# Load test data

In [22]:
test_df = pd.read_csv(PATH + 'test.csv')

# split data
test_df_0 = test_df[ (test_df['meter'] == 0) ]
test_df_0_row_ids = test_df_0['row_id']
test_df_0.drop("row_id", axis=1, inplace=True)

test_df_1 = test_df[ (test_df['meter'] == 1) ]
test_df_1_row_ids = test_df_1['row_id']
test_df_1.drop("row_id", axis=1, inplace=True)

test_df_2 = test_df[ (test_df['meter'] == 2) ]
test_df_2_row_ids = test_df_2['row_id']
test_df_2.drop("row_id", axis=1, inplace=True)

test_df_3 = test_df[ (test_df['meter'] == 3) ]
test_df_3_row_ids = test_df_3['row_id']
test_df_3.drop("row_id", axis=1, inplace=True)


test_df_0 = reduce_mem_usage(test_df_0)
test_df_1 = reduce_mem_usage(test_df_1)
test_df_2 = reduce_mem_usage(test_df_2)
test_df_3 = reduce_mem_usage(test_df_3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Memory usage of dataframe is 755.49 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Memory usage after optimization is: 307.67 MB
Decreased by 59.3%
Memory usage of dataframe is 266.26 MB
Memory usage after optimization is: 108.93 MB
Decreased by 59.1%
Memory usage of dataframe is 173.23 MB
Memory usage after optimization is: 71.13 MB
Decreased by 58.9%
Memory usage of dataframe is 77.53 MB
Memory usage after optimization is: 32.25 MB
Decreased by 58.4%


In [24]:
# merge data
test_df_0 = test_df_0.merge(building_df,left_on='building_id',right_on='building_id',how='left')
test_df_1 = test_df_1.merge(building_df,left_on='building_id',right_on='building_id',how='left')
test_df_2 = test_df_2.merge(building_df,left_on='building_id',right_on='building_id',how='left')
test_df_3 = test_df_3.merge(building_df,left_on='building_id',right_on='building_id',how='left')

In [25]:
# get test weather and fill it
weather_df = pd.read_csv(PATH + 'weather_test.csv')

weather_df = fill_weather_dataset(weather_df)

weather_df = reduce_mem_usage(weather_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Memory usage of dataframe is 19.25 MB
Memory usage after optimization is: 9.05 MB
Decreased by 53.0%


In [26]:
# merge with weather
test_df_0 = test_df_0.merge(weather_df,how='left',on=['timestamp','site_id'])
test_df_1 = test_df_1.merge(weather_df,how='left',on=['timestamp','site_id'])
test_df_2 = test_df_2.merge(weather_df,how='left',on=['timestamp','site_id'])
test_df_3 = test_df_3.merge(weather_df,how='left',on=['timestamp','site_id'])

In [29]:
# build features
test_df_0_features = select_features(
    test_df_0,
    ["timestamp","sea_level_pressure", "wind_direction", "wind_speed","year_built","floor_count",
     'precip_depth_1_hr', 'is_weekend', 'cloud_coverage', 'weekday'],
    False
)

test_df_1_features = select_features(
    test_df_1,
    ["timestamp","sea_level_pressure", "wind_direction", "wind_speed","year_built","floor_count",
     'precip_depth_1_hr', 'is_weekend', 'cloud_coverage', 'weekday'],
    False
)

test_df_2_features = select_features(
    test_df_2,
    ["timestamp","sea_level_pressure", "wind_direction", "wind_speed","year_built","floor_count",
     'precip_depth_1_hr', 'is_weekend', 'cloud_coverage', 'weekday'],
    False
)

test_df_3_features = select_features(
    test_df_3,
    ["timestamp","sea_level_pressure", "wind_direction", "wind_speed","year_built","floor_count",
     'precip_depth_1_hr', 'is_weekend', 'cloud_coverage', 'weekday'],
    False
)

# Prediction for each meter

In [30]:
prediction_start = dt.now()

results_0 = []
for model in test_0:
    if  results_0 == []:
        results_0 = np.expm1(model.predict(test_df_0_features, num_iteration=model.best_iteration)) / len(test_0)
    else:
        results_0 += np.expm1(model.predict(test_df_0_features, num_iteration=model.best_iteration)) / len(test_0)
    del model
    gc.collect()
    
print('Prediction Time:', dt.now() - prediction_start)

  """


Prediction Time: 0:02:59.425484


In [31]:
prediction_start = dt.now()

results_1 = []
for model in test_1:
    if  results_1 == []:
        results_1 = np.expm1(model.predict(test_df_1_features, num_iteration=model.best_iteration)) / len(test_1)
    else:
        results_1 += np.expm1(model.predict(test_df_1_features, num_iteration=model.best_iteration)) / len(test_1)
    del model
    gc.collect()
    
print('Prediction Time:', dt.now() - prediction_start)

  """


Prediction Time: 0:00:54.007087


In [32]:
prediction_start = dt.now()

results_2 = []
for model in test_2:
    if  results_2 == []:
        results_2 = np.expm1(model.predict(test_df_2_features, num_iteration=model.best_iteration)) / len(test_2)
    else:
        results_2 += np.expm1(model.predict(test_df_2_features, num_iteration=model.best_iteration)) / len(test_2)
    del model
    gc.collect()
    
print('Prediction Time:', dt.now() - prediction_start)

  """


Prediction Time: 0:00:44.699859


In [33]:
prediction_start = dt.now()

results_3 = []
for model in test_3:
    if  results_3 == []:
        results_3 = np.expm1(model.predict(test_df_3_features, num_iteration=model.best_iteration)) / len(test_3)
    else:
        results_3 += np.expm1(model.predict(test_df_3_features, num_iteration=model.best_iteration)) / len(test_3)
    del model
    gc.collect()
    
print('Prediction Time:', dt.now() - prediction_start)

  """


Prediction Time: 0:00:13.699037


In [34]:
# create result dataframes
results_df_0 = pd.DataFrame({"row_id": test_df_0_row_ids, "meter_reading": np.clip(results_0, 0, a_max=None)})
results_df_1 = pd.DataFrame({"row_id": test_df_1_row_ids, "meter_reading": np.clip(results_1, 0, a_max=None)})
results_df_2 = pd.DataFrame({"row_id": test_df_2_row_ids, "meter_reading": np.clip(results_2, 0, a_max=None)})
results_df_3 = pd.DataFrame({"row_id": test_df_3_row_ids, "meter_reading": np.clip(results_3, 0, a_max=None)})

In [56]:
results_df = pd.concat(
    [results_df_0, results_df_1, results_df_2, results_df_3]
)

In [59]:
results_df = results_df.sort_values(by=['row_id'])

In [64]:
# check duplicates
t = results_df[results_df['row_id'].duplicated()]
t.head()

Unnamed: 0,row_id,meter_reading


In [65]:
results_df.to_csv("submission_03.csv", index=False)