In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; this prompts for an authorization code.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
# Work from the project folder on the mounted Drive so the relative data
# paths used by later cells resolve.
PROJECT_DIR = '/content/drive/My Drive/Colab Notebooks/ML Project'
os.chdir(PROJECT_DIR)

In [0]:
import numpy as np
import pandas as pd
import gc

In [0]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are narrowed to int8/int16/int32/int64 and float columns to
    float16/float32/float64, judged by each column's minimum and maximum.
    Range checks are inclusive, so a column whose extreme equals a dtype limit
    (for example 127 for int8) still receives that dtype.  Narrower float
    dtypes cover the range but store values with lower precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue  # non-numeric (or already minimal) columns are left alone
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this includes all-NaN and infinite columns).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reported no memory to begin with.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


In [16]:
# Read the building metadata, the training-period weather and the meter readings.
building_df = pd.read_csv("ashrae-energy-prediction/building_metadata.csv")
weather_df = pd.read_csv("ashrae-energy-prediction/weather_train.csv")
train = pd.read_csv("ashrae-energy-prediction/train.csv")

# Every reading keeps its row when building metadata is attached (left join).
train = train.merge(building_df, on="building_id", how="left")
# The weather join is an inner join: readings without a matching
# (site_id, timestamp) weather record are dropped from the training set.
train = train.merge(weather_df, on=["site_id", "timestamp"], how="inner")

train = reduce_mem_usage(train)

# building_df is kept for the test-set merge later; the weather table is not needed again.
del weather_df
gc.collect()

Mem. usage decreased to 1036.44 Mb (60.3% reduction)


386

In [0]:
# Parse the timestamp once and derive calendar features from it.
train["timestamp"] = pd.to_datetime(train["timestamp"])
timestamp_parts = train["timestamp"].dt
train["weekday"] = timestamp_parts.weekday.astype(np.uint8)
train["hour"] = timestamp_parts.hour.astype(np.uint8)

# Express the construction year as an offset from 1900 and put floor area on a log scale.
train["year_built"] = train["year_built"].sub(1900)
train["square_feet"] = np.log(train["square_feet"])

In [0]:
def average_imputation(df, column_name):
    """Fill missing values of ``column_name`` with the mean of rows sharing the same timestamp.

    The mean is taken over the non-missing values of ``column_name`` within
    each ``timestamp`` group.  Rows whose whole group is missing stay missing.
    Frames with nothing to fill are returned unchanged, and index labels do not
    have to be unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``timestamp`` column and ``column_name``; the column
        is reassigned on this frame.
    column_name : str
        Name of the column to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # transform('mean') returns one value per row, aligned with df's index,
    # so it can be used directly as the fill source.
    timestamp_mean = df.groupby('timestamp')[column_name].transform('mean')
    df[column_name] = df[column_name].fillna(timestamp_mean)
    return df

In [0]:
# Fill gaps in both wind columns from the per-timestamp averages.
for wind_column in ('wind_speed', 'wind_direction'):
    train = average_imputation(train, wind_column)

# Each entry is (scale, lower bound inclusive, upper bound exclusive) on wind_speed.
beaufort = [
    (0, 0, 0.3), (1, 0.3, 1.6), (2, 1.6, 3.4), (3, 3.4, 5.5),
    (4, 5.5, 8), (5, 8, 10.8), (6, 10.8, 13.9), (7, 13.9, 17.2),
    (8, 17.2, 20.8), (9, 20.8, 24.5), (10, 24.5, 28.5), (11, 28.5, 33),
    (12, 33, 200),
]

# Rows whose wind_speed falls in no band keep a missing beaufort_scale.
for scale, lower, upper in beaufort:
    in_band = (train['wind_speed'] >= lower) & (train['wind_speed'] < upper)
    train.loc[in_band, 'beaufort_scale'] = scale


In [0]:
# The timestamp has been replaced by weekday/hour and is no longer needed;
# remove it in place to avoid copying the frame.
train.drop(columns=["timestamp"], inplace=True)

In [0]:
def degToCompass(num):
    """Convert a bearing in degrees to a compass-sector index in 0..15.

    The circle is cut into 16 sectors of 22.5 degrees each; adding 0.5 before
    truncating picks the nearest sector for non-negative bearings, and the
    modulo wraps values that reach a full turn back to sector 0.
    """
    sector = int(num / 22.5 + 0.5)
    return sector % 16

In [0]:
# Replace the wind bearing by its 16-sector compass index.
train['wind_direction'] = train['wind_direction'].apply(degToCompass)

# These columns only hold small non-negative integers, so store them as uint8.
for small_int_column in ('beaufort_scale', 'wind_direction', 'meter', 'site_id'):
    train[small_int_column] = train[small_int_column].astype(np.uint8)

# NOTE: year_built (minus 1900) and square_feet (log) are already transformed
# in the cell that derives weekday/hour. Repeating them here applied both
# transforms twice to train but only once to test, so the model was scored on
# features from a different scale than it was fitted on.

In [0]:
# Columns removed from the feature set; the same list is applied to the test frame later.
drop_cols = ["sea_level_pressure", "wind_speed"]
train = train.drop(columns=drop_cols)

In [0]:
# Fill the remaining gaps with each column's median (missing values are skipped
# when the median is computed). The medians are kept in module-level names
# because the test frame is filled with the same training medians later.
floor_count_median = train["floor_count"].median()
year_built_median = train["year_built"].median()
air_temperature_median = train["air_temperature"].median()
cloud_coverage_median = train["cloud_coverage"].median()
dew_temperature_median = train["dew_temperature"].median()
precip_depth_1_hr_median = train["precip_depth_1_hr"].median()

train_fill_values = {
    "floor_count": floor_count_median,
    "year_built": year_built_median,
    "air_temperature": air_temperature_median,
    "cloud_coverage": cloud_coverage_median,
    "dew_temperature": dew_temperature_median,
    "precip_depth_1_hr": precip_depth_1_hr_median,
}

# Assign the filled column back explicitly: calling fillna(inplace=True) on a
# column obtained through attribute access is a chained in-place operation
# that does not update the parent frame under pandas Copy-on-Write.
for fill_column, fill_value in train_fill_values.items():
    train[fill_column] = train[fill_column].fillna(fill_value)



In [0]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Learn an integer code for every primary_use label and apply it in one step;
# the fitted encoder is kept so the test frame can be encoded identically.
le = LabelEncoder()
train["primary_use"] = le.fit_transform(train["primary_use"])


In [0]:
# Feature columns that hold category-like codes rather than measurements.
categoricals = [
    "site_id",
    "building_id",
    "primary_use",
    "hour",
    "weekday",
    "meter",
    "wind_direction",
]

In [0]:
# Feature columns that hold measurements or ordinal values.
numericals = [
    "square_feet",
    "year_built",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    'precip_depth_1_hr',
    'floor_count',
    'beaufort_scale',
]

# Full feature list: categorical columns first, then numerical ones.
feat_cols = categoricals + numericals

In [0]:
# Take the raw meter readings out of the feature frame to use as the target.
# (A log1p-transformed target was tried at some point and is left disabled.)
target = train.pop("meter_reading")

In [0]:
#divide training and test data
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    train,
    target,
    test_size=0.30,
    random_state=42,
)

In [0]:
def RMSLE(pred,act): 
    """Return the root mean squared logarithmic error between ``pred`` and ``act``.

    Both arguments must support element-wise ``+ 1`` (e.g. NumPy arrays or
    pandas Series).  The squared differences of ``log(value + 1)`` are summed
    and divided by ``len(act)`` before the square root is taken.
    """
    log_gap = np.log(pred + 1) - np.log(act + 1)
    squared_total = np.sum(log_gap ** 2)
    return np.sqrt(squared_total / len(act))

In [0]:
from sklearn.ensemble import RandomForestRegressor
# Fit a 100-tree random forest with depth capped at 30. The seed makes the
# bootstrap samples and split choices repeatable, matching the seeded
# train/test split, so reruns reproduce the same scores.
model = RandomForestRegressor(max_depth=30, n_estimators=100, random_state=42)
model.fit(Xtrain, Ytrain)



In [0]:
import pickle
# Serialize the fitted model to a bytes object held in `s`; nothing is written to disk here.
s = pickle.dumps(model)

In [0]:
# Predictions on the rows the model was fitted on (used for the training-set score).
y_predictTrain = model.predict(Xtrain)

In [0]:
# Predictions on the held-out 30% split (used for the validation score).
y_predictTest = model.predict(Xtest)

In [0]:
# RMSLE on the training split; displayed as the cell's output.
res = RMSLE(pred=y_predictTrain, act=Ytrain)
res

0.39485883575233166

In [0]:
# RMSLE on the held-out split; displayed as the cell's output.
res = RMSLE(pred=y_predictTest, act=Ytest)
res

0.7858377344799493

In [0]:
# Load the competition test rows and attach building metadata and weather.
test = pd.read_csv("ashrae-energy-prediction/test.csv")
weather_test = pd.read_csv("ashrae-energy-prediction/weather_test.csv")
n_test_rows = len(test)

test = test.merge(building_df, on="building_id", how="left")
# Use a left join for weather as well: the default inner join silently drops
# every test row without a matching (site_id, timestamp) weather record, and
# those row_ids would then be missing from the submission. Weather gaps are
# filled by the imputation steps that follow.
test = test.merge(weather_test, on=["site_id", "timestamp"], how="left")
assert len(test) == n_test_rows, "merging must not change the number of test rows"

test = reduce_mem_usage(test)
del weather_test
del building_df
gc.collect()

In [0]:
# Same calendar and scale features as the training frame.
test["timestamp"] = pd.to_datetime(test["timestamp"])
test_timestamp_parts = test["timestamp"].dt
test["weekday"] = test_timestamp_parts.weekday.astype(np.uint8)
test["hour"] = test_timestamp_parts.hour.astype(np.uint8)

# Construction year as an offset from 1900; floor area on a log scale.
test["year_built"] = test["year_built"].sub(1900)
test["square_feet"] = np.log(test["square_feet"])

In [0]:
# Fill gaps in both wind columns from the per-timestamp averages.
for wind_column in ('wind_speed', 'wind_direction'):
    test = average_imputation(test, wind_column)

# Each entry is (scale, lower bound inclusive, upper bound exclusive) on wind_speed.
beaufort = [
    (0, 0, 0.3), (1, 0.3, 1.6), (2, 1.6, 3.4), (3, 3.4, 5.5),
    (4, 5.5, 8), (5, 8, 10.8), (6, 10.8, 13.9), (7, 13.9, 17.2),
    (8, 17.2, 20.8), (9, 20.8, 24.5), (10, 24.5, 28.5), (11, 28.5, 33),
    (12, 33, 200),
]

# Rows whose wind_speed falls in no band keep a missing beaufort_scale.
for scale, lower, upper in beaufort:
    in_band = (test['wind_speed'] >= lower) & (test['wind_speed'] < upper)
    test.loc[in_band, 'beaufort_scale'] = scale

# The timestamp is only needed for the imputation above.
del test["timestamp"]

In [0]:
# Replace the wind bearing by its 16-sector compass index.
test['wind_direction'] = test['wind_direction'].apply(degToCompass)

# Store the small non-negative integer columns as uint8, as done for train.
for small_int_column in ('beaufort_scale', 'wind_direction', 'meter', 'site_id'):
    test[small_int_column] = test[small_int_column].astype(np.uint8)

test = test.drop(columns=drop_cols)

# Reuse the encoder fitted on the training labels.
test["primary_use"] = le.transform(test["primary_use"])

In [0]:
# Fill gaps in the test frame with the medians computed on the training frame.
test_fill_values = {
    "floor_count": floor_count_median,
    "year_built": year_built_median,
    "air_temperature": air_temperature_median,
    "cloud_coverage": cloud_coverage_median,
    "dew_temperature": dew_temperature_median,
    "precip_depth_1_hr": precip_depth_1_hr_median,
}

# Assign the filled column back explicitly: calling fillna(inplace=True) on a
# column obtained through attribute access is a chained in-place operation
# that does not update the parent frame under pandas Copy-on-Write.
for fill_column, fill_value in test_fill_values.items():
    test[fill_column] = test[fill_column].fillna(fill_value)

In [0]:
# Keep a full copy of the prepared test frame; row_id is read from it when the submission is built.
org_test = test.copy()

In [0]:
# row_id is an identifier, not a feature; remove it in place before predicting.
del test['row_id']

In [0]:
# Predict a meter reading for every row of the prepared test frame.
predictions = model.predict(test)

In [0]:
# Pair each row_id with its prediction and write the result as CSV text
# (the output file is named "submission", without an extension).
submission = org_test[['row_id']].assign(meter_reading=predictions)
submission.to_csv("submission", index=False)