In [2]:
import math, datetime, time, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import itertools
import tqdm


def load_data(data_path):
    """Read the CSV file at ``data_path`` and return it as a pandas DataFrame.

    ``data_path`` is handed straight to ``pd.read_csv``, so anything that
    function accepts (a path string, a file-like object, ...) works here.
    """
    return pd.read_csv(data_path)

inbound = load_data("inbound_loads.csv")
# NOTE(review): "laods" looks like a typo for "loads". The name is kept as-is
# because the file on disk may really be spelled this way — confirm.
outbound = load_data("outbound_laods.csv")
weather = load_data("weather.csv")

# The pallet history is spread over ten numbered files ([0] .. [9]).
# Read them all first and concatenate once: calling pd.concat inside a loop
# re-copies the ever-growing frame on every iteration.
pallet = pd.concat(
    [load_data(f"Pallet_history_Gold_Spike[{part}].csv") for part in range(10)]
)
pallet = pallet.drop(columns=['lot_code',
                              'tran_type',
                              'final_pallet_code',
                              'warehouse_facility_id',
                              'source_system_id'])

# The demand file holds both partitions back to back; the first TRAIN_ROWS
# rows go to `train`, the remainder to `test`.
# NOTE(review): 273988 is a hard-coded row position — confirm it still matches
# the file if the data is ever regenerated.
TRAIN_ROWS = 273988
trainentest = load_data("demand_kWtrain_val.csv")
train = trainentest.iloc[:TRAIN_ROWS, :]
test = trainentest.iloc[TRAIN_ROWS:, :]

In [3]:
def weekend(x):
    """Return True when the row's 'weekday' value is greater than 4, else False.

    ``x`` is anything indexable by the key ``'weekday'`` (for example a
    DataFrame row). With the pandas ``dt.weekday`` numbering used in this
    notebook, values above 4 are Saturday and Sunday.
    """
    return bool(x['weekday'] > 4)
    

def addtimecol(df, colname):
    """Add calendar feature columns derived from the datetime column ``colname``.

    The column is first converted with ``pd.to_datetime``; then the columns
    'year', 'month', 'weekday', 'weekend', 'day', 'hour' and 'minute' are
    written onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    colname : str
        Name of the column holding the timestamps.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    df[colname] = pd.to_datetime(df[colname])
    stamps = df[colname].dt
    df['year'] = stamps.year
    df['month'] = stamps.month
    df['weekday'] = stamps.weekday
    # Vectorised comparison instead of a row-wise apply: same True/False
    # values (weekday > 4), but without a Python-level call per row.
    df['weekend'] = df['weekday'] > 4
    df['day'] = stamps.day
    df['hour'] = stamps.hour
    df['minute'] = stamps.minute
    return df


# --- Demand data -----------------------------------------------------------
# Work on a copy of the training partition, without the leftover CSV index
# column, and parse the local timestamp.
base_df = train.copy().drop(columns=['Unnamed: 0'])
base_df['datetime_local'] = pd.to_datetime(base_df['datetime_local'])

# Add the calendar feature columns, then index the frame by timestamp.
base_df = addtimecol(base_df, 'datetime_local')
base_df = base_df.set_index('datetime_local')

# --- Weather data ----------------------------------------------------------
# Same treatment: copy, drop the leftover index columns, parse the timestamp
# and rename it so both frames share the index name 'datetime_local'.
base_weather = weather.copy().drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
base_weather['localstrptime'] = pd.to_datetime(base_weather['localstrptime'])
base_weather = (
    base_weather
    .rename(columns={'localstrptime': 'datetime_local'})
    .set_index('datetime_local')
)

# --- Combine ---------------------------------------------------------------
# Column-wise concatenation aligns the two frames on their datetime index.
base_df = pd.concat([base_df, base_weather], axis=1)

# Rows with NaN values are intentionally kept at this stage.
#base_df.dropna(subset=['demand_kW', 'Temperature'])

In [24]:
# Turn the datetime index back into a column, then remove the raw time
# columns in a single drop.
dummy_df = (
    base_df
    .copy()
    .reset_index()
    .drop(columns=['hour', 'datetime', 'datetime_local', 'datetime_UTC'])
)

# Still drop demand_kW NaNs: rows without a target value are not kept.
dummy_normalized_df = dummy_df.dropna(subset=['demand_kW'], axis=0)

### Dataframe Column preprocessing functions
- Interpolate columns with missing values
- Normalizing columns (Min-max Normalization)
- One-hot encoding categorical columns

The order matters: normalization is applied to the `*_interpolated` columns, so interpolation has to run first.

In [5]:
def normalize_column(df, columnname):
    """Return a new DataFrame in which ``columnname`` is min-max normalized.

    The original column is removed and a column named
    ``f'{columnname}_normalized'`` holding ``(x - min) / (max - min)`` is
    appended. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame (not modified).
    columnname : str
        Name of the numeric column to scale.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``columnname`` and with the normalized column.

    Notes
    -----
    When every value in the column is identical, ``max - min`` is zero and
    the plain formula would yield 0/0 = NaN for every row. In that case the
    normalized column is set to 0.0 instead (missing values stay missing).
    """
    values = df[columnname]
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant column: avoid dividing by zero.
        scaled = (values - lowest).astype(float)
    else:
        scaled = (values - lowest) / span
    # drop() + assign() both return new frames, so the caller's df is unchanged.
    return df.drop(columns=[columnname]).assign(**{f'{columnname}_normalized': scaled})

def add_one_hot_encoder(df, colname):
    """Return a new DataFrame where ``colname`` is replaced by one-hot columns.

    One indicator column is created per distinct value in ``colname``; each is
    named ``f'{colname}_{value}'`` and appended after the remaining columns.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    colname : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        Frame with the same rows as ``df``, without ``colname`` and with the
        indicator columns added.
    """
    # Encoding through get_dummies(columns=...) keeps the rows exactly as they
    # are. Building the dummies separately and joining them back aligns on
    # index labels, which multiplies rows whenever the index has duplicates.
    return pd.get_dummies(df, columns=[colname], prefix=colname)

def interpolate_column(df, colname):
    """Return a new DataFrame where gaps in ``colname`` are linearly interpolated.

    The original column is removed and a column named
    ``f'{colname}_interpolated'`` is appended. The input frame is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame (not modified).
    colname : str
        Name of the numeric column whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``colname`` and with the interpolated column.

    Notes
    -----
    ``Series.interpolate(method='linear')`` is used with its default
    arguments. NOTE(review): with those defaults, missing values before the
    first valid observation are expected to stay NaN — confirm against the
    pandas documentation if that matters downstream.
    """
    filled = df[colname].interpolate(method='linear')
    # drop() + assign() both return new frames, so the caller's df is unchanged.
    return df.drop(columns=[colname]).assign(**{f'{colname}_interpolated': filled})

In [6]:
# Columns handled by each preprocessing step.
interpolate = ['Temperature', 'Relative Humidity']
to_normalize = ['Relative Humidity_interpolated', 'Temperature_interpolated']
add_one_hot_encoding = ['weekday', 'year', 'month']

# Run the steps in a fixed order: interpolation produces the
# '*_interpolated' columns that the normalization step then consumes;
# one-hot encoding comes last.
preprocessing_steps = (
    (interpolate_column, interpolate),
    (normalize_column, to_normalize),
    (add_one_hot_encoder, add_one_hot_encoding),
)
for transform, columns in preprocessing_steps:
    for column in columns:
        dummy_normalized_df = transform(dummy_normalized_df, column)

# Model Building

First we build the skeleton:
- Divide into train/test
- Set target column
- Measure the prediction error (mean absolute error)

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor



# Define the train and test sets.
# A fixed random_state makes the shuffled split (and therefore every score
# computed below) reproducible across kernel restarts.
# Note: this rebinds the names `train` and `test` that were first assigned
# when the raw demand file was loaded.
train, test = train_test_split(dummy_normalized_df, shuffle=True, random_state=42)

# Features are every column except the target; the target is demand_kW.
X_train = train.drop(columns=['demand_kW'])
Y_train = train['demand_kW']
X_test = test.drop(columns=['demand_kW'])
Y_test = test['demand_kW']

def test_model(model, data, debug=False):
    """Fit ``model`` on the training split and return its test-set MAE.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    data : sequence
        Four items, unpacked as ``(xtrain, ytrain, xtest, ytest)``.
    debug : bool, optional
        When true, a progress line is printed before each stage.

    Returns
    -------
    The value of ``mean_absolute_error`` between ``ytest`` and the model's
    predictions for ``xtest``.
    """
    def announce(message):
        # Progress output is opt-in so parameter sweeps stay quiet.
        if debug:
            print(message)

    xtrain, ytrain, xtest, ytest = data

    announce("Fitting model...")
    model.fit(xtrain, ytrain)

    announce('Predicting...')
    predictions = model.predict(xtest)

    announce('Calculating mean absolute error...')
    return mean_absolute_error(list(ytest), predictions)

In [8]:
# # Random Forest Regression
# rfr = RandomForestRegressor(
#     verbose=1,
#     n_estimators=100
#     )
# rfr.fit(X_train, Y_train)
# rfr_acc = rfr.predict(X_test)
# print(r2_score(list(Y_test), rfr_acc))
# print(mean_absolute_error(list(Y_test), rfr_acc))

In [9]:
# from sklearn.neighbors import KNeighborsRegressor

# knn = KNeighborsRegressor(n_neighbors=4)
# knn.fit(X_train, Y_train)
# knn_acc = knn.predict(X_test)
# print(r2_score(list(Y_test), knn_acc))
# print(mean_absolute_error(list(Y_test), knn_acc))

In [10]:
# featureImportance = rfr.feature_importances_
# for x in range(len(featureImportance)):
#     print(f'{dummy_normalized_df.columns[1:][x]} has a feature importance of: {featureImportance[x]}.')

# From here, model testing. Delete at your own discretion.

### ~~Failed~~ Successful attempt to automate the parameter search!
Add every parameter you want to test to the dict below; each combination of values is evaluated so you can see which one performs best.

In [20]:
# Search grid: every combination of the listed values is evaluated.
# Uncomment the extra entries to widen the search.
parameter_settings = dict(
    n_estimators=[10, 20, 50],
    criterion=['squared_error', 'friedman_mse', 'poisson'],
    # max_depth=[2, 4, 6],
    # min_samples_split=[2, 4, 8],
)

# Data bundle in the order test_model unpacks it: xtrain, ytrain, xtest, ytest.
data = [X_train, Y_train, X_test, Y_test]

# Estimator class (not an instance) to be searched.
model = RandomForestRegressor

100.47547995272093


In [22]:
def model_optimizer(data, model, params):
    """Evaluate ``model`` for every combination of the values in ``params``.

    Parameters
    ----------
    data : sequence
        Passed unchanged to ``test_model``.
    model : callable
        Estimator class (or factory); it is called with one keyword argument
        per key in ``params``.
    params : dict
        Maps a parameter name to the list of values to try for it.

    Returns
    -------
    dict
        Keyed by the string form of each parameter combination; every value
        is the list ``[mae, seconds_spent]``.
    """
    names = list(params.keys())
    results = {}
    for combination in itertools.product(*params.values()):
        started = time.time()
        params_dict = dict(zip(names, combination))
        print('Testing model with parameters: ' + str(params_dict))
        mae = test_model(model(**params_dict), data, debug=False)
        print(f'Mean Absolute Error = {mae}')
        # Wall-clock time covering model construction, fit and scoring.
        elapsed = time.time() - started
        print(f'Time spent: {elapsed} seconds. \n')
        results[str(params_dict)] = [mae, elapsed]
    return results
        
# Run the parameter search. The function defined in this cell is called
# `model_optimizer`, so that is the name to call here.
# Note: this rebinds `test`, which until now held the test DataFrame; the
# feature/target frames X_test and Y_test are separate objects.
test = model_optimizer(data, RandomForestRegressor, parameter_settings)

Testing model with parameters: {'n_estimators': 10, 'criterion': 'squared_error'}
Fitting model...
Predicting...
Calculating mean absolute error...
Mean Absolute Error = 102.86142692044685
Time spent: 15.495572566986084 seconds. 

Testing model with parameters: {'n_estimators': 10, 'criterion': 'friedman_mse'}
Fitting model...
Predicting...
Calculating mean absolute error...
Mean Absolute Error = 102.56604595680301
Time spent: 15.941078424453735 seconds. 

Testing model with parameters: {'n_estimators': 10, 'criterion': 'poisson'}
Fitting model...
Predicting...
Calculating mean absolute error...
Mean Absolute Error = 102.95391555404576
Time spent: 18.838756799697876 seconds. 

Testing model with parameters: {'n_estimators': 20, 'criterion': 'squared_error'}
Fitting model...
Predicting...
Calculating mean absolute error...
Mean Absolute Error = 101.51565639709797
Time spent: 27.55039882659912 seconds. 

Testing model with parameters: {'n_estimators': 20, 'criterion': 'friedman_mse'}
Fit

In [26]:
# Print one summary per tested parameter combination.
# Each stored value is a pair: index 0 is the MAE, index 1 the seconds spent.
for setting, outcome in test.items():
    print(f'Setting: {setting}. \nMAE: {outcome[0]}. \nTime Spent: {outcome[1]}.\n')

Setting: {'n_estimators': 10, 'criterion': 'squared_error'}. 
MAE: 102.86142692044685. 
Time Spent: 15.495572566986084.

Setting: {'n_estimators': 10, 'criterion': 'friedman_mse'}. 
MAE: 102.56604595680301. 
Time Spent: 15.941078424453735.

Setting: {'n_estimators': 10, 'criterion': 'poisson'}. 
MAE: 102.95391555404576. 
Time Spent: 18.838756799697876.

Setting: {'n_estimators': 20, 'criterion': 'squared_error'}. 
MAE: 101.51565639709797. 
Time Spent: 27.55039882659912.

Setting: {'n_estimators': 20, 'criterion': 'friedman_mse'}. 
MAE: 101.46939692994685. 
Time Spent: 31.56662893295288.

Setting: {'n_estimators': 20, 'criterion': 'poisson'}. 
MAE: 101.7119281639997. 
Time Spent: 35.3441481590271.

Setting: {'n_estimators': 50, 'criterion': 'squared_error'}. 
MAE: 100.64288624669018. 
Time Spent: 74.1656174659729.

Setting: {'n_estimators': 50, 'criterion': 'friedman_mse'}. 
MAE: 100.75376390801475. 
Time Spent: 69.79285168647766.

Setting: {'n_estimators': 50, 'criterion': 'poisson'}. 