In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np 
import pandas as pd 
import zipfile
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, SGDRegressor
import xgboost as xgb
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation
from keras.optimizers import SGD, Adam
from keras.utils import to_categorical
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [3]:
# Colab-only helper: opens a browser file picker. The later shell cell copies a
# file named kaggle.json, so that is presumably what is uploaded here.
from google.colab import files
files.upload()

{}

In [0]:
# Copy the uploaded kaggle.json into ~/.kaggle (presumably where the kaggle CLI
# reads its API token from - confirm against the CLI docs).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Make the token file readable and writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the ashrae-energy-prediction competition files into the working directory.
!kaggle competitions download -c ashrae-energy-prediction

test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
weather_train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
weather_test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
building_metadata.csv: Skipping, found more recently modified local copy (use --force to force download)


Let's see the sample submission 

In [2]:
# Read the sample submission out of its archive. The context managers close
# the zip handle (and the member stream) once the frame has been parsed, and
# `sample` only ever refers to the DataFrame.
with zipfile.ZipFile('sample_submission.csv.zip') as sample_archive:
    with sample_archive.open('sample_submission.csv') as sample_csv:
        sample = pd.read_csv(sample_csv)

print(sample.head())

   row_id  meter_reading
0       0              0
1       1              0
2       2              0
3       3              0
4       4              0


Importing test data

In [3]:
# Read the test rows out of their archive; the context managers close the zip
# handle once parsing is done, and `test` only ever refers to the DataFrame.
with zipfile.ZipFile('test.csv.zip') as test_archive:
    with test_archive.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv)

# Parse the timestamp strings into real datetimes.
test['timestamp'] = pd.to_datetime(test['timestamp'])
print(test.head())

   row_id  building_id  meter  timestamp
0       0            0      0 2017-01-01
1       1            1      0 2017-01-01
2       2            2      0 2017-01-01
3       3            3      0 2017-01-01
4       4            4      0 2017-01-01


Importing weather data for test

In [0]:
# Read the test-period weather out of its archive; the context managers close
# the zip handle once parsing is done.
with zipfile.ZipFile('weather_test.csv.zip') as weather_test_archive:
    with weather_test_archive.open('weather_test.csv') as weather_test_csv:
        weather_test = pd.read_csv(weather_test_csv)

# Parse the timestamp strings into real datetimes.
weather_test['timestamp'] = pd.to_datetime(weather_test['timestamp'])

In [0]:
# Fill missing weather measurements with each column's most frequent value.
# Only the measurement columns go through the imputer: the `site_id` and
# `timestamp` keys are left untouched, and `weather_test` stays a DataFrame
# instead of being replaced by a bare array.
weather_cols = weather_test.columns
weather_value_cols = [col for col in weather_cols if col not in ('site_id', 'timestamp')]
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
weather_test[weather_value_cols] = imp.fit_transform(weather_test[weather_value_cols])

In [0]:
# Wrap the imputed weather values in a DataFrame carrying the original column labels.
weather_test = pd.DataFrame(weather_test, columns=weather_cols)

Scaling the weather_test dataset

In [0]:
# Standardise the numeric weather measurements in place.
# NOTE(review): this scaler is fitted on the test-period weather itself; the
# training weather is fitted with its own scaler further down - confirm that
# two independent fits are intended.
weather_scaled_cols = [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]
scaler = StandardScaler()
weather_test[weather_scaled_cols] = scaler.fit_transform(weather_test[weather_scaled_cols])

In [8]:
# Show the first rows of the imputed and scaled test-period weather.
print(weather_test.head())

  site_id           timestamp  ...  wind_direction  wind_speed
0       0 2017-01-01 00:00:00  ...       -0.633707    0.023114
1       0 2017-01-01 01:00:00  ...       -0.371033   -0.192775
2       0 2017-01-01 02:00:00  ...       -0.283475   -0.192775
3       0 2017-01-01 03:00:00  ...       -0.283475   -0.192775
4       0 2017-01-01 04:00:00  ...       -0.371033   -0.408664

[5 rows x 9 columns]


Importing training and weather_train data

In [0]:
# Read the training rows and the training-period weather out of their
# archives. The context managers close each zip handle once parsing is done,
# and `train` / `weather_train` only ever refer to DataFrames.
with zipfile.ZipFile('train.csv.zip') as train_archive:
    with train_archive.open('train.csv') as train_csv:
        train = pd.read_csv(train_csv)

with zipfile.ZipFile('weather_train.csv.zip') as weather_train_archive:
    with weather_train_archive.open('weather_train.csv') as weather_train_csv:
        weather_train = pd.read_csv(weather_train_csv)

Importing metadata

In [0]:
# Building metadata is downloaded as a plain CSV, so it is read directly.
metadata = pd.read_csv(filepath_or_buffer='building_metadata.csv')

Typecasting of column 'timestamp'

In [0]:
# Parse the timestamp strings of both training frames into real datetimes.
for frame in (train, weather_train):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])


In [0]:
# Fill missing weather measurements with each column's most frequent value.
# Only the measurement columns go through the imputer: the `site_id` and
# `timestamp` keys are left untouched, so the frame never has to be rebuilt
# from a bare array.
weather_cols = weather_train.columns
weather_value_cols = [col for col in weather_cols if col not in ('site_id', 'timestamp')]
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
weather_train[weather_value_cols] = imp.fit_transform(weather_train[weather_value_cols])

Scaling the weather_train dataset

In [0]:
# Standardise the numeric weather measurements of the training period in place.
weather_scaled_cols = [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]
scaler = StandardScaler()
weather_train[weather_scaled_cols] = scaler.fit_transform(weather_train[weather_scaled_cols])

To handle categorical data

In [0]:
# Replace the categorical columns of the building metadata with indicator columns.
metadata = pd.get_dummies(data=metadata)

In [0]:
# Fill missing metadata values with each column's most frequent value, then
# put the resulting array back into a DataFrame with the original labels.
metadata_cols = metadata.columns
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
metadata = pd.DataFrame(data=imp.fit_transform(metadata), columns=metadata_cols)

Scaling metadata

In [0]:
# Standardise the two continuous building attributes in place.
metadata_scaled_cols = ['square_feet', 'floor_count']
scaler = StandardScaler()
metadata[metadata_scaled_cols] = scaler.fit_transform(metadata[metadata_scaled_cols])

Joining weather_train and metadata

In [0]:
# Attach the building metadata to every training-period weather row of the same site.
weather_metadata = weather_train.merge(metadata, how='left', on=['site_id'])

Generating mini batches for training 

In [0]:
def get_batches(batch_size, data, weather_metadata):
    """Yield ``(features, target)`` mini-batches built from ``data``.

    Consecutive slices of ``batch_size`` rows are taken from ``data``, each
    slice is left-joined with ``weather_metadata`` on ``timestamp`` and
    ``building_id``, a building ``age`` feature is derived, the key columns
    are dropped, and remaining gaps are filled with the per-batch column
    median.

    Parameters
    ----------
    batch_size : int
        Number of rows of ``data`` per batch; must be positive.
    data : pandas.DataFrame
        Rows to batch. Must provide ``timestamp``, ``building_id`` and
        ``meter_reading``.
    weather_metadata : pandas.DataFrame
        Frame joined onto every batch. Must provide ``timestamp``,
        ``building_id``, ``site_id`` and ``year_built``.

    Yields
    ------
    tuple of (pandas.DataFrame, pandas.Series)
        The imputed feature frame and the matching ``meter_reading`` values.
        The last batch may hold fewer than ``batch_size`` rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    join_keys = ['timestamp', 'building_id']
    # Bound the loop by the frame that was passed in (not the global `train`)
    # and visit every row, so the trailing batch is no longer dropped.
    for start in range(0, len(data), batch_size):
        batch_data = data.iloc[start:start + batch_size, :]
        # Join this mini-batch with the combined weather / building metadata.
        batch_data = pd.merge(batch_data, weather_metadata, how='left', on=join_keys)
        batch_data_X = batch_data.drop(columns=['meter_reading'])
        # Building age = year of the reading minus construction year.
        reading_year = pd.DatetimeIndex(batch_data_X['timestamp']).year
        batch_data_X['age'] = np.array(reading_year) - np.array(batch_data_X.year_built)
        batch_data_X = batch_data_X.drop(columns=['timestamp', 'year_built', 'site_id', 'building_id'])
        batch_data_X_col = batch_data_X.columns
        # Fill whatever is still missing with the column median of this batch.
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        batch_data_X = imp.fit_transform(batch_data_X)
        batch_data_X = pd.DataFrame(data=batch_data_X, columns=batch_data_X_col)
        batch_data_y = batch_data['meter_reading']
        yield batch_data_X, batch_data_y

In [19]:
# Fit a linear model incrementally, one mini-batch of 40000 training rows at a time.
mini_batches = get_batches(40000, train, weather_metadata)

# NOTE(review): newer scikit-learn releases spell this loss 'squared_error' -
# confirm against the installed version before changing the string.
model = SGDRegressor(loss='squared_loss', penalty='l2')

count = 0
for train_X, train_y in mini_batches:
    print(count)
    model.partial_fit(train_X, train_y)
    # Score of the updated model on the batch it has just been fitted on.
    batch_score = model.score(train_X, train_y)
    print(batch_score)
    count += 1

# Score on the last batch once more, after the loop has finished.
print(model.score(train_X, train_y))

0
-8274777425.183967
1
-1169752442.7840047
2
-27689087040.358128
3
-5062099832.065434
4
-951214406.2915838
5
-504433833.0447643
6
-124268642.31890637
7
-60507875.96218524
8
-17478579.258489013
9
-14442439.871650409
10
-10227651.80297901
11
-9055167.889745949
12
-62962737.2124152
13
-7666248157.994316
14
-6895532609.926608
15
-4384421047.795102
16
-3884414284.1043496
17
-11336038285.739119


KeyboardInterrupt: ignored

Joining metadata and weather_test data

In [0]:
# Attach the building metadata to every test-period weather row of the same
# site. This rebinds `weather_metadata`, which held the training join until now.
weather_metadata = weather_test.merge(metadata, how='left', on=['site_id'])

In [0]:
def get_batches_test(batch_size, data, weather_metadata):
    """Yield feature mini-batches built from ``data`` for prediction.

    Consecutive slices of ``batch_size`` rows are taken from ``data``, each
    slice is left-joined with ``weather_metadata`` on ``timestamp`` and
    ``building_id``, a building ``age`` feature is derived, the key columns
    are dropped, and remaining gaps are filled with the per-batch column
    median.

    Parameters
    ----------
    batch_size : int
        Number of rows of ``data`` per batch; must be positive.
    data : pandas.DataFrame
        Rows to batch. Must provide ``timestamp`` and ``building_id``; a
        ``row_id`` column, if present, is dropped from the features.
    weather_metadata : pandas.DataFrame
        Frame joined onto every batch. Must provide ``timestamp``,
        ``building_id``, ``site_id`` and ``year_built``.

    Yields
    ------
    pandas.DataFrame
        The imputed feature frame for one batch. The last batch may hold
        fewer than ``batch_size`` rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    join_keys = ['timestamp', 'building_id']
    # Bound the loop by the frame that was passed in (not the global `train`)
    # and visit every row, so the trailing batch is no longer dropped.
    for start in range(0, len(data), batch_size):
        batch_data = data.iloc[start:start + batch_size, :]
        # The joined frame is the feature frame: there is no target column to
        # split off here, so it is bound to batch_data_X straight away.
        batch_data_X = pd.merge(batch_data, weather_metadata, how='left', on=join_keys)
        # Building age = year of the reading minus construction year.
        reading_year = pd.DatetimeIndex(batch_data_X['timestamp']).year
        batch_data_X['age'] = np.array(reading_year) - np.array(batch_data_X.year_built)
        batch_data_X = batch_data_X.drop(columns=['timestamp', 'year_built', 'site_id', 'building_id'])
        # row_id only identifies submission rows. NOTE(review): assumes the
        # training features contain no row_id column - confirm against train.csv.
        batch_data_X = batch_data_X.drop(columns=['row_id'], errors='ignore')
        batch_data_X_col = batch_data_X.columns
        # Fill whatever is still missing with the column median of this batch.
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        batch_data_X = imp.fit_transform(batch_data_X)
        batch_data_X = pd.DataFrame(data=batch_data_X, columns=batch_data_X_col)
        yield batch_data_X

In [22]:
# Rows left over after splitting the test set into batches of 40000, the
# batch size passed to get_batches_test below (the modulus used to be 20000).
len(test) % 40000

17600

In [23]:
# The submission template must contain exactly one row per test row.
test.shape[0] == sample.shape[0]

True

In [0]:
# Predict the test set batch by batch and write the results into the
# submission frame with one aligned assignment.
mini_batches = get_batches_test(40000, test, weather_metadata)

predictions = np.zeros(len(sample))
count = 0  # number of test rows predicted so far
for test_X in mini_batches:
    print(count)
    batch_preds = model.predict(test_X)
    # Advance by the number of rows actually predicted so that consecutive
    # batches never overlap, whatever the batch size.
    predictions[count:count + len(batch_preds)] = batch_preds
    count += len(batch_preds)

# Every submission row must have received a prediction. A shortfall means the
# batch generator did not cover the whole test set.
if count != len(sample):
    raise ValueError(
        'predicted %d rows but the submission has %d rows' % (count, len(sample))
    )

# Single column assignment instead of chained-indexing slice writes.
sample['meter_reading'] = predictions

In [0]:
# Show the first rows of the submission frame after the prediction cell.
sample.head()