In [1]:
# Import dependencies

# data analysis and wrangling

import pandas as pd
import numpy as np
import math
import os
from metpy import calc
from datetime import datetime, timedelta
from statistics import mean

# visualization

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning

In [2]:
# Set pandas options

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Read data files

data_files = ['train.csv', 'test.csv', 'weather_train.csv', 'weather_test.csv', 'building_metadata.csv']
raw_df = {}

for file in data_files:
    raw_df[file[:-4]] = pd.read_csv(os.path.join(os.getcwd(), 'data', file)

SyntaxError: unexpected EOF while parsing (Temp/ipykernel_2852/2249592806.py, line 7)

# Classifying
We may want to classify or categorize our samples. We may also want to understand the implications or correlation of different classes with our solution goal.

In [None]:
# Function to examine and return missing data from a dataframe as a percentage

def missingdata(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    ms=pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    ms= ms[ms["Percent"] > 0]

    return ms

In [None]:
# Preview the data

# Train & Test data

train_df = raw_df['train'].copy()
test_df = raw_df['test'].copy()

train_df.info(verbose=True, show_counts=True)
print('-'*40)
test_df.info(verbose=True, show_counts=True)

In [None]:
train_df

In [None]:
missingdata(train_df)

In [None]:
missingdata(test_df)

In [None]:
# Weather train & Weather test data

weather_train_df = raw_df['weather_train'].copy()
weather_test_df = raw_df['weather_test'].copy()

weather_train_df.info(verbose=True, show_counts=True)
print('-'*40)
weather_test_df.info(verbose=True, show_counts=True)

In [None]:
weather_train_df

In [None]:
missingdata(weather_train_df)

In [None]:
missingdata(weather_test_df)

In [None]:
# Building metadata

building_df = raw_df['building_metadata'].copy()

building_df.info(verbose=True, show_counts=True)

In [None]:
building_df

In [None]:
missingdata(building_df)

# Charting
How to select the right visualization plots and charts depending on nature of the data and the solution goals.

# Correcting
We may also analyze the given training dataset for errors or possibly inaccurate values within features and try to correct these values or exclude the samples containing the errors. One way to do this is to detect any outliers among our samples or features. We may also completely discard a feature if it is not contributing to the analysis or may significantly skew the results.

In [None]:
# Correcting by dropping features with a lot of missing data

weather_drop = ['cloud_coverage', 'precip_depth_1_hr', 'sea_level_pressure']
building_drop = ['floor_count', 'year_built']

weather_train_df.drop(weather_drop, axis=1, inplace=True)
weather_test_df.drop(weather_drop, axis=1, inplace=True)
building_df.drop(building_drop, axis=1, inplace=True)

# Completing
Data preparation may also require us to estimate any missing values within a feature. Model algorithms may work best when there are no missing values.

In [None]:
# Function to estimate weather null values based on the averages of 
# the previous and post known values

def estimate_weather(df, col):
    null_dex = list(df.loc[df[col].isna()]['timestamp'].index)
    for dex in null_dex:
        if math.isnan(df[col][dex+1]):
            df[col][dex] = df[col][dex-1].copy()
        else:
            df[col][dex] = round(mean([df[col][dex-1],df[col][dex+1]]), 1).copy()
    return df[col]

In [None]:
# Estimating the values for weather data using the average from the previous & following hour

weather_combined = [weather_train_df, weather_test_df]
col_to_complete = ['wind_direction', 'wind_speed', 'dew_temperature', 'air_temperature']

for w_df in weather_combined:
    for c in col_to_complete:
        w_df[c] = estimate_weather(w_df, c)

# Creating
Can we create new features based on an existing feature or a set of features, such that the new feature follows the correlation, conversion, completeness goals.

In [None]:
# Creating new humidity feature calculated from air & dew temperature

# Calculation formula source: https://www.weather.gov/media/epz/wxcalc/vaporPressure.pdf

weather_combined = [weather_train_df, weather_test_df]

for w_df in weather_combined:
    rh_list = []
    for i in range(0,len(w_df)):
        e_s = 6.11 * 10 * ((7.5 * w_df['air_temperature'][i])/(237.3 + w_df['air_temperature'][i]))
        e = 6.11 * 10 * ((7.5 * w_df['dew_temperature'][i])/(237.3 + w_df['dew_temperature'][i]))

        rh_list.append(round(e / e_s * 100, 1))
    w_df['relative_humidity'] = rh_list

In [None]:
metpy.calc.relative_humidity_from_dewpoint(['18'], ['15'])

In [None]:
weather_combined = [weather_train_df, weather_test_df]

for w_df in weather_combined:
    rh = metpy.calc.relative_humidity_from_dewpoint(str(w_df['air_temperature'][0]), str(w_df['dew_temperature'][0]))

# Converting
For modeling stage, one needs to prepare the data. Depending on the choice of model algorithm one may require all features to be converted to numerical equivalent values. So for instance converting text categorical values to numeric values.

In [None]:
# Converting timestamp from object to datetime

combined = [train_df, test_df, weather_train_df, weather_test_df]

for df in combined:
    df['timestamp'] = pd.to_datetime(df['timestamp'], infer_datetime_format=True)

In [None]:
# Mapping primary_use category in building metadata

primary_use_map = {'Education': 1, 'Office': 2, 'Entertainment/public assembly': 3, 'Lodging/residential': 4,
                   'Public services': 5, 'Healthcare': 6, 'Other': 7, 'Parking': 8, 'Manufacturing/industrial': 9,
                   'Food sales and service': 10, 'Retail': 11, 'Warehouse/storage': 12, 'Services': 13, 
                   'Technology/science': 14, 'Utility': 15, 'Religious worship': 16}

building_df['primary_use'] = building_df['primary_use'].map(primary_use_map).astype(np.int64)

# Correlating
One can approach the problem based on available features within the training dataset. Which features within the dataset contribute significantly to our solution goal? Statistically speaking is there a correlation among a feature and solution goal? As the feature values change does the solution state change as well, and visa-versa? This can be tested both for numerical and categorical features in the given dataset. We may also want to determine correlation among features other than survival for subsequent goals and workflow stages. Correlating certain features may help in creating, completing, or correcting features.

In [None]:
# Merge dataframes

train_merged = train_df.merge(building_df, on='building_id', how='left')
train_merged = train_merged.merge(weather_train_df, on=['site_id', 'timestamp'], how='left')

test_merged = test_df.merge(building_df, on='building_id', how='left')
test_merged = test_merged.merge(weather_test_df, on=['site_id', 'timestamp'], how='left')

# Model, predict and solve
Now we are ready to train a model and predict the required solution. There are 60+ predictive modelling algorithms to choose from. We must understand the type of problem and solution requirement to narrow down to a select few models which we can evaluate. Our problem is a classification and regression problem. We are also performing a category of machine learning which is called supervised learning as we are training our model with a given dataset. With these two criteria - Supervised Learning plus Classification and Regression, we can narrow down our choice of models to a few.