In [1]:
import numpy as np
import pandas as pd
import os, gc, warnings
import random
import datetime
import warnings

from tqdm.notebook import tqdm

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
import sklearn
import category_encoders

import missingno as msno

import lightgbm as lgb

import pickle

warnings.filterwarnings('ignore')

In [2]:
path = 'E:/pasca/Documents/Test/'
#show datapath
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

E:/pasca/Documents/Test/building_metadata.csv
E:/pasca/Documents/Test/sample_submission.csv
E:/pasca/Documents/Test/test.csv
E:/pasca/Documents/Test/train.csv
E:/pasca/Documents/Test/weather_test.csv
E:/pasca/Documents/Test/weather_train.csv


In [3]:
#data import
train_df = pd.read_csv(path + 'train.csv')
building_df = pd.read_csv(path + 'building_metadata.csv')
weather_df = pd.read_csv(path + 'weather_train.csv')

In [4]:
def missing_statistics(df):
    #show missing values
    statitics = pd.DataFrame(df.isnull().sum()).reset_index()
    statitics.columns=['COLUMN NAME',"MISSING VALUES"]
    statitics['TOTAL ROWS'] = df.shape[0]
    statitics['% MISSING'] = round((statitics['MISSING VALUES']/statitics['TOTAL ROWS'])*100,2)
    return statitics

In [5]:
# Original code from https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling by @aitude

def fill_weather_dataset(weather_df):
    
    # Find Missing Dates
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(),time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(),time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

    missing_hours = []
    for site_id in range(16):
        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
        new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])
        new_rows['site_id'] = site_id
        weather_df = pd.concat([weather_df,new_rows])

        weather_df = weather_df.reset_index(drop=True)           

    # Add new Features
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["week"] = weather_df["datetime"].dt.week
    weather_df["month"] = weather_df["datetime"].dt.month
    
    # Reset Index for Fast Update
    weather_df = weather_df.set_index(['site_id','day','month'])

    air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
    weather_df.update(air_temperature_filler,overwrite=False)

    # Step 1
    cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()
    # Step 2
    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=["cloud_coverage"])

    weather_df.update(cloud_coverage_filler,overwrite=False)

    due_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=["dew_temperature"])
    weather_df.update(due_temperature_filler,overwrite=False)

    # Step 1
    sea_level_filler = weather_df.groupby(['site_id','day','month'])['sea_level_pressure'].mean()
    # Step 2
    sea_level_filler = pd.DataFrame(sea_level_filler.fillna(method='ffill'),columns=['sea_level_pressure'])

    weather_df.update(sea_level_filler,overwrite=False)

    wind_direction_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_direction'].mean(),columns=['wind_direction'])
    weather_df.update(wind_direction_filler,overwrite=False)

    wind_speed_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_speed'].mean(),columns=['wind_speed'])
    weather_df.update(wind_speed_filler,overwrite=False)

    # Step 1
    precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
    # Step 2
    precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])

    weather_df.update(precip_depth_filler,overwrite=False)

    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime','day','week'],axis=1)
        
    return weather_df

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def features_engineering(df):
    
    # Sort by timestamp
    df.sort_values("timestamp")
    df.reset_index(drop=True)
    
    # Add more features
    df["timestamp"] = pd.to_datetime(df["timestamp"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["timestamp"].dt.hour
    df["weekend"] = df["timestamp"].dt.weekday
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                    "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                    "2017-01-02", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                    "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                    "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                    "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                    "2019-01-01"]
    df["is_holiday"] = (df.timestamp.isin(holidays)).astype(int)
    df['square_feet'] =  np.log1p(df['square_feet']**0.5)
    
    # Remove Unused Columns
    #drop = ["timestamp","sea_level_pressure", "wind_direction", "wind_speed","year_built","floor_count"]
    #df = df.drop(drop, axis=1)
    #gc.collect()
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [6]:
weather_df = fill_weather_dataset(weather_df)

In [7]:
train_df = reduce_mem_usage(train_df,use_float16=True)
building_df = reduce_mem_usage(building_df,use_float16=True)
weather_df = reduce_mem_usage(weather_df,use_float16=True)

Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.84 MB
Decreased by 71.8%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.9%
Memory usage of dataframe is 10.72 MB
Memory usage after optimization is: 2.73 MB
Decreased by 74.5%


In [8]:
train_df = train_df.merge(building_df, left_on='building_id',right_on='building_id',how='left')
train_df = train_df.merge(weather_df,how='left',left_on=['site_id','timestamp'],right_on=['site_id','timestamp'])

In [9]:
train_df = features_engineering(train_df)

In [10]:
train_df.drop(train_df.loc[train_df['building_id']== 1099].index, inplace=True)
train_df.drop(train_df.loc[train_df['building_id']== 778].index, inplace=True)

In [11]:
del train_df["year_built"]
del train_df["floor_count"]

In [12]:
mask = train_df["meter"] == 0
train_df_0 = train_df[mask]

mask = train_df["meter"] == 1
train_df_1 = train_df[mask]

mask = train_df["meter"] == 2
train_df_2 = train_df[mask]

mask = train_df["meter"] == 3
train_df_3 = train_df[mask]

Model:

In [13]:
x = train_df_0["air_temperature"].values.astype(float)
x = x.reshape((-1, 1))
#y = np.log1p(train_df_0["meter_reading"])
y = train_df_0["meter_reading"]

In [14]:
#target = train_df_0["meter_reading"]
target = np.log1p(train_df_0["meter_reading"])
train = train_df_0.drop(["building_id", 'meter', 'meter_reading', 'timestamp','hour', "weekend", "is_holiday"], axis = 1)
train.head()

Unnamed: 0,site_id,primary_use,square_feet,month,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,4.468308,1,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0
1,0,0,3.973186,1,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0
2,0,0,4.308396,1,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0
3,0,0,5.042775,1,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0
4,0,0,5.836206,1,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0


In [15]:
#categorical_features = ["building_id", "site_id", "primary_use"]
#ce = category_encoders.CountEncoder(cols=categorical_features)
#ce.fit(train)
#train = ce.transform(train)
#train.sample()
#train.dtypes

train_1 = pd.get_dummies(train, columns = ["site_id", "primary_use",'month'])

In [16]:
#N_train = train.shape[0]
#for feature in categorical_features:
    #train[feature] = train[feature]/N_train
    
train_1.head()

Unnamed: 0,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,site_id_0,site_id_1,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,4.468308,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,3.973186,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,4.308396,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,5.042775,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5.836206,25.0,6.0,20.0,-0.17395,1019.5,0.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
model = LinearRegression().fit(train_1, target)

In [18]:
r_sq = model.score(train_1, target)

In [19]:
print(r_sq)

0.4753870674960864


In [20]:
output = "{:.7f}".format(r_sq)
print(output)

0.4753871


In [21]:
print(model.intercept_)

111777398.12684114


In [22]:
print(model.coef_)

[ 1.68447024e+00 -7.09486718e-03  5.40549966e-04 -9.64376995e-04
 -1.72094914e-04 -3.73314574e-03 -4.13765064e-04  6.28261904e-03
 -4.34521358e+07 -4.34521350e+07 -4.34521352e+07 -4.34521352e+07
 -4.34521355e+07 -4.34521365e+07 -4.34521351e+07 -4.34521346e+07
 -4.34521352e+07 -4.34521355e+07 -4.34521355e+07 -4.34521348e+07
 -4.34521356e+07 -4.34521353e+07 -4.34521352e+07 -4.34521355e+07
 -7.05675519e+07 -7.05675519e+07 -7.05675516e+07 -7.05675516e+07
 -7.05675522e+07 -7.05675513e+07 -7.05675520e+07 -7.05675522e+07
 -7.05675536e+07 -7.05675519e+07 -7.05675529e+07 -7.05675519e+07
 -7.05675527e+07 -7.05675513e+07 -7.05675508e+07 -7.05675524e+07
  2.24228777e+06  2.24228781e+06  2.24228779e+06  2.24228783e+06
  2.24228797e+06  2.24228831e+06  2.24228832e+06  2.24228839e+06
  2.24228838e+06  2.24228818e+06  2.24228824e+06  2.24228817e+06]
