In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from keras.layers import Input, Dense, Dropout, Activation
from keras.models import Model
from keras.optimizers import SGD, Adam
from keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

Let's see the sample submission 

In [0]:
# Load the sample submission and show its first rows to see the expected output layout.
sample = pd.read_csv('sample_submission.csv')
print(sample.head(n=5))

Importing test data

In [0]:
# Read the test rows, then parse the 'timestamp' strings into datetimes.
test = pd.read_csv('test.csv')
test['timestamp'] = test['timestamp'].pipe(pd.to_datetime)
print(test.head(n=5))

Importing weather data for test

In [0]:
# Read the weather observations for the test period and parse their timestamps.
weather_test = pd.read_csv('weather_test.csv')
weather_test['timestamp'] = weather_test['timestamp'].pipe(pd.to_datetime)

In [0]:
# Keep the column labels; the imputed values are re-wrapped with them in the next cell.
weather_cols = weather_test.columns
# Replace every missing value with the most frequent value of its column.
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
weather_test = imp.fit(weather_test).transform(weather_test)

In [0]:
# Re-wrap the imputed values with the original column labels.
# The imputed values carry no per-column dtypes, so let pandas re-infer them.
# NOTE(review): without this, 'timestamp' is presumably left as object dtype and
# later merges against a datetime 'timestamp' column would not line up — confirm.
weather_test = pd.DataFrame(data=weather_test, columns=weather_cols).infer_objects()

Scaling the weather_test dataset

In [0]:
# Standardise the numeric weather columns of the test-period weather table.
# NOTE(review): this scaler is fitted on the test weather itself, separately from the
# scaler fitted on the training weather further down — confirm that is intended.
numeric_weather_cols = [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]
scaler = StandardScaler()
weather_test[numeric_weather_cols] = scaler.fit_transform(weather_test[numeric_weather_cols])

In [0]:
# Show the first rows of the imputed and scaled test weather table.
print(weather_test.head(n=5))

Importing training and weather_train data

In [0]:
# Load the training rows and the weather observations for the training period.
train = pd.read_csv(filepath_or_buffer='train.csv')
weather_train = pd.read_csv(filepath_or_buffer='weather_train.csv')

Importing metadata

In [0]:
# Load the per-building metadata table.
metadata = pd.read_csv(filepath_or_buffer='building_metadata.csv')

Typecasting of column 'timestamp'

In [0]:
# Convert the 'timestamp' strings of both training tables into datetimes, in place.
for frame in (train, weather_train):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])


In [0]:
# Keep the column labels so the imputed values can be turned back into a DataFrame.
weather_cols = weather_train.columns
# Fill each missing value with the most frequent value of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
weather_train = imp.fit_transform(weather_train)
# The imputed values carry no per-column dtypes, so let pandas re-infer them.
# NOTE(review): without this, 'timestamp' is presumably left as object dtype and the
# merge with the datetime 'timestamp' of the training rows would not line up — confirm.
weather_train = pd.DataFrame(data=weather_train, columns=weather_cols).infer_objects()

Scaling

In [0]:
# Standardise the numeric weather columns of the training-period weather table.
numeric_weather_cols = [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]
scaler = StandardScaler()
weather_train[numeric_weather_cols] = scaler.fit_transform(weather_train[numeric_weather_cols])

One-hot encoding the categorical columns of the metadata

In [0]:
# Expand the categorical metadata columns into indicator (dummy) columns.
metadata = pd.get_dummies(data=metadata)

In [0]:
# Fill missing metadata values with each column's most frequent value, then
# restore the column labels on the imputed result.
metadata_cols = metadata.columns
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
metadata = pd.DataFrame(data=imp.fit_transform(metadata), columns=metadata_cols)

Scaling metadata

In [0]:
# Standardise the two building-size columns of the metadata.
size_cols = ['square_feet', 'floor_count']
scaler = StandardScaler()
metadata[size_cols] = scaler.fit_transform(metadata[size_cols])

Joining weather_train and metadata

In [0]:
# Attach the building metadata to every training weather row of the same site.
weather_metadata_train = weather_train.merge(metadata, how='left', on='site_id')

Generating mini batches for training 

In [0]:
def get_batches(batch_size, data, weather_metadata):
    """Yield (features, target) mini-batches built from consecutive slices of `data`.

    Each slice of `data` is left-joined with `weather_metadata` on
    ('timestamp', 'building_id'). An 'age' feature is derived as the year of
    'timestamp' minus 'year_built', the key/helper columns are dropped, and the
    remaining gaps are filled with the per-batch column median.

    Parameters
    ----------
    batch_size : int
        Maximum number of rows of `data` per batch; must be positive.
    data : pandas.DataFrame
        Rows to batch. Must contain 'timestamp', 'building_id' and 'meter_reading'.
    weather_metadata : pandas.DataFrame
        Table joined onto every batch. Must contain 'timestamp', 'building_id',
        'site_id' and 'year_built'.

    Yields
    ------
    (pandas.DataFrame, pandas.Series)
        The imputed feature frame and the matching 'meter_reading' values.

    Raises
    ------
    ValueError
        If `batch_size` is not positive (raised when iteration starts).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    join_keys = ['timestamp', 'building_id']
    # Walk over `data` itself (not a notebook-level global) and include the
    # final, possibly shorter, batch so that no rows are silently skipped.
    for start in range(0, len(data), batch_size):
        batch = data.iloc[start:start + batch_size, :]
        # Join this slice with the combined weather/metadata table.
        batch = pd.merge(batch, weather_metadata, how='left', on=join_keys)

        features = batch.drop(columns=['meter_reading'])
        # Building age = year of the reading minus the construction year.
        years = pd.DatetimeIndex(features['timestamp']).year
        features['age'] = np.array(years) - np.array(features['year_built'])
        features = features.drop(columns=['timestamp', 'year_built', 'site_id', 'building_id'])

        # The imputer is re-fitted on every batch, so medians are per-batch.
        feature_cols = features.columns
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        features = pd.DataFrame(data=imp.fit_transform(features), columns=feature_cols)

        yield features, batch['meter_reading']

In [0]:
# Stream the training rows through the model in mini-batches of 40000 rows.
mini_batches = get_batches(40000, train, weather_metadata_train)

# Squared error is SGDRegressor's default loss. Its string name differs between
# scikit-learn releases ('squared_loss' vs. 'squared_error'), so the default is
# relied on here instead of spelling the name out.
model = SGDRegressor(penalty='l2')

for count, (train_X, train_y) in enumerate(mini_batches):
    print(count)
    # Incrementally update the model with this batch only.
    model.partial_fit(train_X, train_y)
    # Score on the batch the model has just been updated with.
    print(model.score(train_X, train_y))

# Score on the last batch seen.
print(model.score(train_X, train_y))

Joining metadata and weather_test data

In [0]:
# Attach the building metadata to every test weather row of the same site.
weather_metadata_test = weather_test.merge(metadata, how='left', on='site_id')

In [0]:
def get_batches_test(batch_size, data, weather_metadata):
    """Yield feature mini-batches built from consecutive slices of `data`.

    Each slice of `data` is left-joined with `weather_metadata` on
    ('timestamp', 'building_id'). An 'age' feature is derived as the year of
    'timestamp' minus 'year_built', the key/helper columns are dropped, and the
    remaining gaps are filled with the per-batch column median.

    NOTE(review): every column of `data` other than the dropped keys is passed
    through as a feature — confirm `data` has exactly the feature columns the
    model was trained on.

    Parameters
    ----------
    batch_size : int
        Maximum number of rows of `data` per batch; must be positive.
    data : pandas.DataFrame
        Rows to batch. Must contain 'timestamp' and 'building_id'.
    weather_metadata : pandas.DataFrame
        Table joined onto every batch. Must contain 'timestamp', 'building_id',
        'site_id' and 'year_built'.

    Yields
    ------
    pandas.DataFrame
        The imputed feature frame for one batch.

    Raises
    ------
    ValueError
        If `batch_size` is not positive (raised when iteration starts).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    join_keys = ['timestamp', 'building_id']
    # Walk over `data` itself (not the training frame) and include the final,
    # possibly shorter, batch so that every input row gets a feature row.
    for start in range(0, len(data), batch_size):
        batch = data.iloc[start:start + batch_size, :]
        # The merged slice is the feature frame; there is no target column to split off.
        features = pd.merge(batch, weather_metadata, how='left', on=join_keys)

        # Building age = year of the reading minus the construction year.
        years = pd.DatetimeIndex(features['timestamp']).year
        features['age'] = np.array(years) - np.array(features['year_built'])
        features = features.drop(columns=['timestamp', 'year_built', 'site_id', 'building_id'])

        # The imputer is re-fitted on every batch, so medians are per-batch.
        feature_cols = features.columns
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        features = pd.DataFrame(data=imp.fit_transform(features), columns=feature_cols)

        yield features

In [0]:
# Predict the test rows batch by batch and collect the predictions in order.
mini_batches = get_batches_test(40000, test, weather_metadata_test)
preds = []
for count, test_X in enumerate(mini_batches):
    print(count)
    # extend() appends in place; `preds + list(...)` would copy the whole list each batch.
    preds.extend(model.predict(test_X))