In [1]:
import math, datetime, time, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re


def load_data(data_path):
  data = pd.read_csv(data_path)  
  return data

inbound = load_data("inbound_loads.csv")
outbound = load_data("outbound_laods.csv")
weather = load_data("weather.csv")
door = load_data("feature_inbound_outbound_door_open.csv")

#For loop to ensure that all pallet data is in the same dataframe
pallet = load_data("Pallet_history_Gold_Spike[0].csv")
for x in range(1, 10):
    pallet = pd.concat([pallet, load_data(f"Pallet_history_Gold_Spike[{x}].csv")])
trainentest = load_data("demand_kWtrain_val.csv")
train = trainentest.iloc[:273988,:]
test = trainentest.iloc[273988:, :]

## Plan de campagne:

- Process the weather data into workable data for the algo
- Calculate on a minute-to-minute basis how many products are 'new' in the warehouse.
- Try to process the inbound and outbound data to make the amounts of incoming and outgoing products available
- Try to estimate on a minute-to-minute basis how many doors are open at any point in time.
    - This should be done on a percentage-based scale (how much percent of the minute was a door open)

In [2]:
# for data visualization
import math, datetime, time, random
import matplotlib.pyplot as plt
import missingno

### Concatting door open from inbound and outbound

In [None]:
#door_outbound = door_outbound.rename(columns={"counts":"count"})
#base_door = pd.concat([door_inbound, door_outbound['count']], axis=1)
#base_door.set_index('datetime_local', inplace=True)
#total = base_door['counts'] + base_door['count']  
#base_door = base_door.assign(total=total)
#base_door = base_door.drop(['counts', 'count'], axis=1)


In [None]:
#base_door = base_door.reset_index(level=[0])
#base_door['datetime_local'] = pd.to_datetime(base_door['datetime_local'])
#base_door['datetime_local'] = pd.Series(test.datetime_local.dt.to_pydatetime(), dtype='O')
#type(base_door['datetime_local'][0])
#type(base_door.datetime_local.iloc[3])

In [3]:
def addtimecol(df, colname): ####input df and colname 
    df[colname] = pd.to_datetime(df[colname])         
    df['year'] = df[colname].dt.year
    df['month'] = df[colname].dt.month
    df['weekday'] = df[colname].dt.weekday
    df['day'] = df[colname].dt.day
    df['hour'] = df[colname].dt.hour
    df['minute'] = df[colname].dt.minute        
    return df

#Create new dummy dfs
base_df = train.copy()
base_weather = weather.copy()
base_door = door.copy()
#base_outbound_door = door_outbound.copy()

#Remove unnecessary columns
base_df = base_df.drop('Unnamed: 0', axis=1)
base_weather = base_weather.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
base_weather["localstrptime"]= pd.to_datetime(base_weather["localstrptime"])
base_df['datetime_local'] = pd.to_datetime(base_df['datetime_local'])
base_door['datetime_local'] = pd.to_datetime(base_door['datetime_local'])
base_weather = base_weather.rename(columns={'localstrptime':'datetime_local'})
#base_weather['datetime_local'] = [datetime.datetime.strptime(x, 'yyyy/MM/dd HH:mm:SS') for x in base_weather['localstrptime']]

#Add time columns.
addtimecol(base_df, 'datetime_local')

#Set index to datetime
base_df.set_index('datetime_local', inplace=True)
base_weather.set_index('datetime_local', inplace=True)
base_door.set_index('datetime_local', inplace=True)

#Concatenate the weather DataFrame to the base DataFrame
base_df = pd.concat([base_df, base_weather], axis=1)
base_df = pd.concat([base_df, base_door], axis=1)


# Drop all NaN values\n"
base_df.dropna(subset=['demand_kW', 'Temperature'])

base_df

Unnamed: 0_level_0,demand_kW,year,month,weekday,day,hour,minute,datetime,Relative Humidity,Temperature,datetime_UTC,hour,total
datetime_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-12-31 18:00:00,,,,,,,,2018-12-31 18:00:00-06:00,50.37,53.6,2019-01-01 00:00:00,18.0,
2018-12-31 18:05:00,,,,,,,,2018-12-31 18:05:00-06:00,50.37,53.6,2019-01-01 00:05:00,18.0,
2018-12-31 18:10:00,,,,,,,,2018-12-31 18:10:00-06:00,50.37,53.6,2019-01-01 00:10:00,18.0,
2018-12-31 18:15:00,,,,,,,,2018-12-31 18:15:00-06:00,50.37,53.6,2019-01-01 00:15:00,18.0,
2018-12-31 18:20:00,,,,,,,,2018-12-31 18:20:00-06:00,50.37,53.6,2019-01-01 00:20:00,18.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-04 03:53:00,,,,,,,,,,,,,3.0
2022-01-04 03:54:00,,,,,,,,,,,,,3.0
2022-01-04 03:55:00,,,,,,,,,,,,,3.0
2022-01-04 03:56:00,,,,,,,,,,,,,2.0


In [None]:
#base_door = base_door.reset_index(level=[0])
base_door['datetime_local'] = pd.to_datetime(base_door['datetime_local'])
#base_door['datetime_local'] = pd.Series(test.datetime_local.dt.to_pydatetime(), dtype='O')
#type(base_door['datetime_local'][0])
type(base_door.datetime_local.iloc[3])

In [None]:
dummy_df = base_df.dropna(subset=['demand_kW', 'Temperature', 'Relative Humidity'])
dummy_df = dummy_df.reset_index()
dummy_df = dummy_df.drop(['hour'], axis=1)
dummy_df = dummy_df.drop(['datetime'], axis=1)
dummy_df = dummy_df.drop(['datetime_local'], axis=1)
dummy_df = dummy_df.drop(['datetime_UTC'], axis=1)
dummy_df


In [None]:
dummy_df.to_csv

# Model Building

First we build the skeleton:
- Divide into train/test
- Set target column
- Get Accuracy

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


#Define train, test sets\n",
train, test = train_test_split(dummy_df)
X_train = train.copy().drop(['demand_kW'], axis=1)
Y_train = train['demand_kW']
X_test = test.copy().drop(['demand_kW'], axis=1)
Y_test = test['demand_kW']

## Support Vector Regression

In [None]:
svm = SVR()
svm.fit(X_train, Y_train)

In [None]:
acc = svm.predict(X_test)
result = r2_score(list(Y_test), acc)
result

In [None]:
mse_svr = mean_squared_error(list(Y_test), acc)
mae_svr = mean_absolute_error(list(Y_test), acc)

print("mse", mse_svr)
print("mae", mae_svr)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 200)
rf.fit(X_train, Y_train)



In [None]:
acc_rf = rf.predict(X_test)

result = r2_score(list(Y_test), acc_rf)
result

Random Forest - 100 trees = 0.9464962000675328
Random Forest - 200 trees = 0.9471820046457805



In [None]:
mse = mean_squared_error(list(Y_test), acc_rf)
mae = mean_absolute_error(list(Y_test), acc_rf)

print("mse", mse)
print("mae", mae)

In [None]:
plt.plot(list(Y_test))
plt.plot(acc_rf)




In [None]:
diff = abs(list(Y_test) - acc_rf)
diff

y_axis = range(0,len(Y_test))
y_axis

plt.figure(figsize=(20,20))

plt.scatter(y_axis,diff)
plt.show()



In [None]:
plt.figure(figsize=(20,20))

plt.scatter(y_axis,Y_test)
plt.scatter(y_axis,acc_rf)

plt.show()


In [None]:
Y_test