### Import Dependencies

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

### Load Data

In [2]:
file = 'preprocessed_data.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0,Company Name,City,State,Zip,Shipping Service,Item/Bottle Count,Year,Month,Day
0,3,7981,9,33325,14,3.0,2020,1,1
1,3,5188,4,94549,14,10.0,2019,12,31
2,3,6939,34,10022,1,8.0,2020,1,1
3,3,6939,34,10006,1,6.0,2020,1,2
4,3,10666,20,21793,4,12.0,2019,12,31


### Split Data into Training and Test sets
- Option 1: Split data by taking random samples from each month of the year (ex: use train_test_split on January data, then February data, then March data, etc.) And then combine all the samples into training data representing all months, and test data representing all months.

- Option 2: Split data by using the 2019 and 2020 data for training and the 2021 data for testing.

***Note: RandomForestRegressor does not require data to be scaled***


In [5]:
# Option 1: split data by month
jan_df = df.loc[df['Month'] == 1]
feb_df = df.loc[df['Month'] == 2]
mar_df = df.loc[df['Month'] == 3]
apr_df = df.loc[df['Month'] == 4]
may_df = df.loc[df['Month'] == 5]
june_df = df.loc[df['Month'] == 6]
july_df = df.loc[df['Month'] == 7]
aug_df = df.loc[df['Month'] == 8]
sep_df = df.loc[df['Month'] == 9]
oct_df = df.loc[df['Month'] == 10]
nov_df = df.loc[df['Month'] == 11]
dec_df = df.loc[df['Month'] == 12]


In [6]:
# Option 1 (cont.): get random training data from each month

# January
X_jan = jan_df.drop(columns=['Item/Bottle Count'], axis=1)
y_jan = jan_df['Item/Bottle Count']
X_train_jan, X_test_jan, y_train_jan, y_test_jan = train_test_split(X_jan, y_jan, random_state=1)

# February
X_feb = feb_df.drop(columns=['Item/Bottle Count'], axis=1)
y_feb = feb_df['Item/Bottle Count']
X_train_feb, X_test_feb, y_train_feb, y_test_feb = train_test_split(X_feb, y_feb, random_state=1)

# March
X_mar = mar_df.drop(columns=['Item/Bottle Count'], axis=1)
y_mar = mar_df['Item/Bottle Count']
X_train_mar, X_test_mar, y_train_mar, y_test_mar = train_test_split(X_mar, y_mar, random_state=1)

# April
X_apr = apr_df.drop(columns=['Item/Bottle Count'], axis=1)
y_apr = apr_df['Item/Bottle Count']
X_train_apr, X_test_apr, y_train_apr, y_test_apr = train_test_split(X_apr, y_apr, random_state=1)

# May
X_may = may_df.drop(columns=['Item/Bottle Count'], axis=1)
y_may = may_df['Item/Bottle Count']
X_train_may, X_test_may, y_train_may, y_test_may = train_test_split(X_may, y_may, random_state=1)

# June
X_june = june_df.drop(columns=['Item/Bottle Count'], axis=1)
y_june = june_df['Item/Bottle Count']
X_train_june, X_test_june, y_train_june, y_test_june = train_test_split(X_june, y_june, random_state=1)

# July
X_july = july_df.drop(columns=['Item/Bottle Count'], axis=1)
y_july = july_df['Item/Bottle Count']
X_train_july, X_test_july, y_train_july, y_test_july = train_test_split(X_july, y_july, random_state=1)

# August
X_aug = aug_df.drop(columns=['Item/Bottle Count'], axis=1)
y_aug = aug_df['Item/Bottle Count']
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(X_aug, y_aug, random_state=1)

# September
X_sep = sep_df.drop(columns=['Item/Bottle Count'], axis=1)
y_sep = sep_df['Item/Bottle Count']
X_train_sep, X_test_sep, y_train_sep, y_test_sep = train_test_split(X_sep, y_sep, random_state=1)

# October
X_oct = oct_df.drop(columns=['Item/Bottle Count'], axis=1)
y_oct = oct_df['Item/Bottle Count']
X_train_oct, X_test_oct, y_train_oct, y_test_oct = train_test_split(X_oct, y_oct, random_state=1)

# November
X_nov = nov_df.drop(columns=['Item/Bottle Count'], axis=1)
y_nov = nov_df['Item/Bottle Count']
X_train_nov, X_test_nov, y_train_nov, y_test_nov = train_test_split(X_nov, y_nov, random_state=1)

# December
X_dec = dec_df.drop(columns=['Item/Bottle Count'], axis=1)
y_dec = dec_df['Item/Bottle Count']
X_train_dec, X_test_dec, y_train_dec, y_test_dec = train_test_split(X_dec, y_dec, random_state=1)


In [18]:
# Option 1 (cont.): combine training and test data from all months into X_train_all, X_test_all, y_train_all, y_test_all
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

X_train_all = pd.concat([X_train_jan,
    X_train_feb,
    X_train_mar,
    X_train_apr,
    X_train_may,
    X_train_june,
    X_train_july,
    X_train_aug,
    X_train_sep,
    X_train_oct,
    X_train_nov,
    X_train_dec])

X_test_all = pd.concat([X_test_jan,
    X_test_feb,
    X_test_mar,
    X_test_apr,
    X_test_may,
    X_test_june,
    X_test_july,
    X_test_aug,
    X_test_sep,
    X_test_oct,
    X_test_nov,
    X_test_dec])

y_train_all = pd.concat([y_train_jan,
    y_train_feb,
    y_train_mar,
    y_train_apr,
    y_train_may,
    y_train_june,
    y_train_july,
    y_train_aug,
    y_train_sep,
    y_train_oct,
    y_train_nov,
    y_train_dec])

y_test_all = pd.concat([y_test_jan,
    y_test_feb,
    y_test_mar,
    y_test_apr,
    y_test_may,
    y_test_june,
    y_test_july,
    y_test_aug,
    y_test_sep,
    y_test_oct,
    y_test_nov,
    y_test_dec])

print(X_train_all.shape, y_train_all.shape,
    X_test_all.shape, y_test_all.shape)

(176997, 6) (176997,) (59006, 6) (59006,)


In [11]:
# # CANNOT USE THIS OPTION UNLESS ALL DATA IS LOADED FROM DATABASE
# # --------------------------------------------------------------
# # Option 2: Train with 2019 and 2020, Test with 2021
# train_2019_2020 = df.loc[(df['Year'] == 2019) | (df['Year'] == 2020)]
# test_2021 = df.loc[df['Year'] == 2021]

# X_train_2019_2020 = train_2019_2020.drop(columns=['Item/Bottle Count'], axis=1)
# y_train_2019_2020 = train_2019_2020['Item/Bottle Count']

# X_test_2021 = test_2021.drop(columns=['Item/Bottle Count'], axis=1)
# y_test_2021 = test_2021['Item/Bottle Count']

### Build RandomForestRegressor Model

In [19]:
# Build model with different values for max_depth

regr_1 = RandomForestRegressor(max_depth=5)
regr_2 = RandomForestRegressor(max_depth=10)
regr_3 = RandomForestRegressor(max_depth=20)
regr_4 = RandomForestRegressor(max_depth=None)

regr_1.fit(X_train_all,y_train_all)
regr_2.fit(X_train_all,y_train_all)
regr_3.fit(X_train_all,y_train_all)
regr_4.fit(X_train_all,y_train_all)


# regr_5 = RandomForestRegressor(max_depth=5)
# regr_6 = RandomForestRegressor(max_depth=10)
# regr_7 = RandomForestRegressor(max_depth=20)
# regr_8 = RandomForestRegressor(max_depth=None)

# regr_5.fit(X_train_2019_2020,y_train_2019_2020)
# regr_6.fit(X_train_2019_2020,y_train_2019_2020)
# regr_7.fit(X_train_2019_2020,y_train_2019_2020)
# regr_8.fit(X_train_2019_2020,y_train_2019_2020)

# WILL TAKE ABOUT 6 MINUTES TO COMPLETE

In [20]:
y_pred1 = regr_1.predict(X_test_all)
y_pred2 = regr_2.predict(X_test_all)
y_pred3 = regr_3.predict(X_test_all)
y_pred4 = regr_4.predict(X_test_all)


# y_pred5 = regr_5.predict(X_test_2021)
# y_pred6 = regr_6.predict(X_test_2021)
# y_pred7 = regr_7.predict(X_test_2021)
# y_pred8 = regr_8.predict(X_test_2021)

### Evaluate the Model: How well does the model predict future outcomes?
- Mean Squared Error (MSE): mean or average of the squared differences between predicted and target values.
    - unit of measurement = squared 'Item/Bottle Count'
    - MSE = mean_squared_error(expected, predicted)
- Root Mean Squared Error (RMSE): square root of MSE.
    - unit of measurement = 'Item/Bottle Count'
    - RMSE = mean_squared_error(y_expected, y_predicted, squared=False)
- Mean Absolute Error (MAE): mean or average of the absolute error or difference between predicted and target values.
    - unit of measurement = 'Item/Bottle Count'
    - MAE = mean_absolute_error(y_expected, y_predicted)

In [21]:
# Scores for model with Option 1 Train/Test data

print("----------------------------------------")
print("Evaluate model")
print("----------------------------------------")
# Training Score
print("Training Score Max Depth=5: ", regr_1.score(X_train_all,y_train_all))
print("Training Score Max Depth=10: ", regr_2.score(X_train_all,y_train_all))
print("Training Score Max Depth=20: ", regr_3.score(X_train_all,y_train_all))
print("Training Score Max Depth=None: ", regr_4.score(X_train_all,y_train_all))

print("----------------------------------------")
# Test Score
print("Test Score Max Depth=5: ", regr_1.score(X_test_all, y_test_all))
print("Test Score Max Depth=10: ", regr_2.score(X_test_all, y_test_all))
print("Test Score Max Depth=20: ", regr_3.score(X_test_all, y_test_all))
print("Test Score Max Depth=None: ", regr_4.score(X_test_all, y_test_all))

print("----------------------------------------")
# Mean Squared Error
print("Mean Squared Error: ", mean_squared_error(y_test_all, y_pred1))
print("Mean Squared Error: ", mean_squared_error(y_test_all, y_pred2))
print("Mean Squared Error: ", mean_squared_error(y_test_all, y_pred3))
print("Mean Squared Error: ", mean_squared_error(y_test_all, y_pred4))

print("----------------------------------------")
# Root Mean Squared Error
print("Root Mean Squared Error: ", mean_squared_error(y_test_all, y_pred1, squared=False))
print("Root Mean Squared Error: ", mean_squared_error(y_test_all, y_pred2, squared=False))
print("Root Mean Squared Error: ", mean_squared_error(y_test_all, y_pred3, squared=False))
print("Root Mean Squared Error: ", mean_squared_error(y_test_all, y_pred4, squared=False))

print("----------------------------------------")
# Mean Absolute Error
print("Mean Absolute Error: ", mean_absolute_error(y_test_all, y_pred1))
print("Mean Absolute Error: ", mean_absolute_error(y_test_all, y_pred2))
print("Mean Absolute Error: ", mean_absolute_error(y_test_all, y_pred3))
print("Mean Absolute Error: ", mean_absolute_error(y_test_all, y_pred4))

----------------------------------------
Evaluate model
----------------------------------------
Training Score Max Depth=5:  0.6918098226796197
Training Score Max Depth=10:  0.7807273914132054
Training Score Max Depth=20:  0.8180447574116149
Training Score Max Depth=None:  0.8678585886574459
----------------------------------------
Test Score Max Depth=5:  0.08779231496079287
Test Score Max Depth=10:  0.03379880325854623
Test Score Max Depth=20:  0.019923696399207214
Test Score Max Depth=None:  0.023785957628622345
----------------------------------------
Mean Squared Error:  151.40763942532567
Mean Squared Error:  160.36944744908672
Mean Squared Error:  162.6724286789109
Mean Squared Error:  162.03137306714572
----------------------------------------
Root Mean Squared Error:  12.304781161212322
Root Mean Squared Error:  12.663705912926387
Root Mean Squared Error:  12.754310200042609
Root Mean Squared Error:  12.729154452167894
----------------------------------------
Mean Absolute Er

In [None]:
# # Scores for model with Option 2 Train/Test data

# print("----------------------------------------")
# print("Evaluate model")
# print("----------------------------------------")
# # Training Score
# print("Training Score Max Depth=5: ", regr_5.score(X_train_2019_2020,y_train_2019_2020))
# print("Training Score Max Depth=10: ", regr_6.score(X_train_2019_2020,y_train_2019_2020))
# print("Training Score Max Depth=20: ", regr_7.score(X_train_2019_2020,y_train_2019_2020))
# print("Training Score Max Depth=None: ", regr_8.score(X_train_2019_2020,y_train_2019_2020))

# print("----------------------------------------")
# # Test Score
# print("Test Score Max Depth=5: ", regr_5.score(X_test_2021, y_test_2021))
# print("Test Score Max Depth=10: ", regr_6.score(X_test_2021, y_test_2021))
# print("Test Score Max Depth=20: ", regr_7.score(X_test_2021, y_test_2021))
# print("Test Score Max Depth=None: ", regr_8.score(X_test_2021, y_test_2021))

# print("----------------------------------------")
# # Mean Squared Error
# print("Mean Squared Error: ", mean_squared_error(y_test_2021, y_pred5))
# print("Mean Squared Error: ", mean_squared_error(y_test_2021, y_pred6))
# print("Mean Squared Error: ", mean_squared_error(y_test_2021, y_pred7))
# print("Mean Squared Error: ", mean_squared_error(y_test_2021, y_pred8))

# print("----------------------------------------")
# # Root Mean Squared Error
# print("Root Mean Squared Error: ", mean_squared_error(y_test_2021, y_pred5, squared=False))
# print("Root Mean Squared Error: ", mean_squared_error(y_test_2021, y_pred6, squared=False))
# print("Root Mean Squared Error: ", mean_squared_error(y_test_2021, y_pred7, squared=False))
# print("Root Mean Squared Error: ", mean_squared_error(y_test_2021, y_pred8, squared=False))

# print("----------------------------------------")
# # Mean Absolute Error
# print("Mean Absolute Error: ", mean_absolute_error(y_test_2021, y_pred5))
# print("Mean Absolute Error: ", mean_absolute_error(y_test_2021, y_pred6))
# print("Mean Absolute Error: ", mean_absolute_error(y_test_2021, y_pred7))
# print("Mean Absolute Error: ", mean_absolute_error(y_test_2021, y_pred8))

In [12]:
# # Plot 'Item/Bottle Count' with 'Month'

# # True Test Dataset
# plt.scatter(X_test_all['Month'].values, y_test_all, color='black')

# # Max Depth = None
# plt.scatter(X_test_all['Month'].values, y_pred4)
