### Import Dependencies

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

### Load Data

In [21]:
# Location of the preprocessed dataset, relative to the notebook's working directory.
file = 'preprocessed_data.csv'
# Read the CSV into the DataFrame that the later cells slice into train/test sets.
df = pd.read_csv(filepath_or_buffer=file)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Company Name,City,State,Shipping Service,Item/Bottle Count,Year,Month,Day
0,3,9646,9,14,3.0,2020,1,1
1,3,6269,4,14,10.0,2019,12,31
2,3,8382,34,1,8.0,2020,1,1
3,3,8382,34,1,6.0,2020,1,2
4,3,12968,20,4,12.0,2019,12,31


### Split Data into Training and Test sets
- Option 1: Split data by taking random samples from each month of the year (e.g., use `train_test_split` on January data, then February data, then March data, etc.), and then combine all the samples into training data representing all months and test data representing all months.

- Option 2: Split data by using the 2019 and 2020 data for training and the 2021 data for testing.

***Note: RandomForestRegressor does not require data to be scaled***


In [22]:
# Option 1: split data by month
# Slice the full frame into one sub-frame per value of the 'Month' column
# (1 -> jan_df, 2 -> feb_df, ..., 12 -> dec_df). Row order and index labels are
# left exactly as they are in df.
(
    jan_df, feb_df, mar_df, apr_df, may_df, june_df,
    july_df, aug_df, sep_df, oct_df, nov_df, dec_df,
) = (df.loc[df['Month'] == month_number] for month_number in range(1, 13))


In [23]:
# Option 1 (cont.): get random training data from each month

def _split_month(month_df):
    """Split one month's rows into features/target and then into train/test sets.

    Parameters
    ----------
    month_df : DataFrame
        Rows belonging to a single month; must contain 'Item/Bottle Count'.

    Returns
    -------
    tuple
        (X, y, X_train, X_test, y_train, y_test), where y is the
        'Item/Bottle Count' column and X is every other column. The split uses
        train_test_split's default proportions with random_state=1.
    """
    features = month_df.drop(columns=['Item/Bottle Count'])
    target = month_df['Item/Bottle Count']
    return (features, target) + tuple(train_test_split(features, target, random_state=1))


# January
(X_jan, y_jan,
 X_train_jan, X_test_jan, y_train_jan, y_test_jan) = _split_month(jan_df)

# February
(X_feb, y_feb,
 X_train_feb, X_test_feb, y_train_feb, y_test_feb) = _split_month(feb_df)

# March
(X_mar, y_mar,
 X_train_mar, X_test_mar, y_train_mar, y_test_mar) = _split_month(mar_df)

# April
(X_apr, y_apr,
 X_train_apr, X_test_apr, y_train_apr, y_test_apr) = _split_month(apr_df)

# May
(X_may, y_may,
 X_train_may, X_test_may, y_train_may, y_test_may) = _split_month(may_df)

# June
(X_june, y_june,
 X_train_june, X_test_june, y_train_june, y_test_june) = _split_month(june_df)

# July
(X_july, y_july,
 X_train_july, X_test_july, y_train_july, y_test_july) = _split_month(july_df)

# August
(X_aug, y_aug,
 X_train_aug, X_test_aug, y_train_aug, y_test_aug) = _split_month(aug_df)

# September
(X_sep, y_sep,
 X_train_sep, X_test_sep, y_train_sep, y_test_sep) = _split_month(sep_df)

# October
(X_oct, y_oct,
 X_train_oct, X_test_oct, y_train_oct, y_test_oct) = _split_month(oct_df)

# November
(X_nov, y_nov,
 X_train_nov, X_test_nov, y_train_nov, y_test_nov) = _split_month(nov_df)

# December
(X_dec, y_dec,
 X_train_dec, X_test_dec, y_train_dec, y_test_dec) = _split_month(dec_df)


In [24]:
# Option 1 (cont.): combine training and test data from all months into X_train_all, X_test_all, y_train_all, y_test_all
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

# Monthly pieces are stacked row-wise in calendar order (January first, December last).
monthly_X_train = [
    X_train_jan, X_train_feb, X_train_mar, X_train_apr,
    X_train_may, X_train_june, X_train_july, X_train_aug,
    X_train_sep, X_train_oct, X_train_nov, X_train_dec,
]
X_train_all = pd.concat(monthly_X_train)

monthly_X_test = [
    X_test_jan, X_test_feb, X_test_mar, X_test_apr,
    X_test_may, X_test_june, X_test_july, X_test_aug,
    X_test_sep, X_test_oct, X_test_nov, X_test_dec,
]
X_test_all = pd.concat(monthly_X_test)

monthly_y_train = [
    y_train_jan, y_train_feb, y_train_mar, y_train_apr,
    y_train_may, y_train_june, y_train_july, y_train_aug,
    y_train_sep, y_train_oct, y_train_nov, y_train_dec,
]
y_train_all = pd.concat(monthly_y_train)

monthly_y_test = [
    y_test_jan, y_test_feb, y_test_mar, y_test_apr,
    y_test_may, y_test_june, y_test_july, y_test_aug,
    y_test_sep, y_test_oct, y_test_nov, y_test_dec,
]
y_test_all = pd.concat(monthly_y_test)

# Show the four shapes on one line: X train, y train, X test, y test.
print(*(combined.shape for combined in (X_train_all, y_train_all, X_test_all, y_test_all)))

(358986, 7) (358986,) (119668, 7) (119668,)


In [25]:
# CANNOT USE THIS OPTION UNLESS ALL DATA IS LOADED FROM DATABASE
# --------------------------------------------------------------
# Option 2: Train with 2019 and 2020, Test with 2021
# Rows are assigned to train or test purely by the value in the 'Year' column.
in_training_years = df['Year'].isin([2019, 2020])
train_2019_2020 = df.loc[in_training_years]
test_2021 = df.loc[df['Year'].eq(2021)]

# Target is 'Item/Bottle Count'; every remaining column is used as a feature.
y_train_2019_2020 = train_2019_2020['Item/Bottle Count']
X_train_2019_2020 = train_2019_2020.drop(columns=['Item/Bottle Count'])

y_test_2021 = test_2021['Item/Bottle Count']
X_test_2021 = test_2021.drop(columns=['Item/Bottle Count'])

### Build RandomForestRegressor Model

In [26]:
# Build model with different values for max_depth
# WILL TAKE ABOUT 6 MINUTES TO COMPLETE
#
# Every forest gets an explicit random_state so that bootstrap sampling and
# feature selection are repeatable: re-running the notebook from a fresh kernel
# then produces the same fitted models and the same scores. The seed (1) matches
# the random_state already used for train_test_split in the Option 1 split.

# Option 1 models: trained on the per-month random split.
regr_1 = RandomForestRegressor(max_depth=5, random_state=1)
regr_2 = RandomForestRegressor(max_depth=10, random_state=1)
regr_3 = RandomForestRegressor(max_depth=20, random_state=1)
regr_4 = RandomForestRegressor(max_depth=None, random_state=1)

for forest in (regr_1, regr_2, regr_3, regr_4):
    forest.fit(X_train_all, y_train_all)

# Option 2 models: trained on the 2019-2020 rows.
regr_5 = RandomForestRegressor(max_depth=5, random_state=1)
regr_6 = RandomForestRegressor(max_depth=10, random_state=1)
regr_7 = RandomForestRegressor(max_depth=20, random_state=1)
regr_8 = RandomForestRegressor(max_depth=None, random_state=1)

for forest in (regr_5, regr_6, regr_7, regr_8):
    forest.fit(X_train_2019_2020, y_train_2019_2020)

In [27]:
# Option 1: predictions of each forest on the combined per-month test set.
y_pred1, y_pred2, y_pred3, y_pred4 = (
    forest.predict(X_test_all) for forest in (regr_1, regr_2, regr_3, regr_4)
)

# Option 2: predictions of each forest on the 2021 test set.
y_pred5, y_pred6, y_pred7, y_pred8 = (
    forest.predict(X_test_2021) for forest in (regr_5, regr_6, regr_7, regr_8)
)

### Evaluate the Model: How well does the model predict future outcomes?
- Mean Squared Error (MSE): mean or average of the squared differences between predicted and target values.
    - unit of measurement = squared 'Item/Bottle Count'
    - MSE = mean_squared_error(y_expected, y_predicted)
- Root Mean Squared Error (RMSE): square root of MSE.
    - unit of measurement = 'Item/Bottle Count'
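    - RMSE = mean_squared_error(y_expected, y_predicted, squared=False) (note: the `squared` argument is deprecated since scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE gives the same value on every version)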
- Mean Absolute Error (MAE): mean or average of the absolute error or difference between predicted and target values.
    - unit of measurement = 'Item/Bottle Count'
    - MAE = mean_absolute_error(y_expected, y_predicted)

In [30]:
# Scores for model with Option 1 Train/Test data
#
# RMSE is computed as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, so the explicit square root keeps this
# cell working on both old and new versions.

# (max_depth label, fitted forest, test-set predictions) for each Option 1 model.
option_1_runs = [
    ("5", regr_1, y_pred1),
    ("10", regr_2, y_pred2),
    ("20", regr_3, y_pred3),
    ("None", regr_4, y_pred4),
]
divider = "----------------------------------------"

print(divider)
print("Evaluate model: Option 1 Train/Test data")
print(divider)
# Training Score (R^2 on the data each forest was fitted on)
for depth, forest, preds in option_1_runs:
    print("Training Score Max Depth={}: ".format(depth), forest.score(X_train_all, y_train_all))

print(divider)
# Test Score (R^2 on the held-out test data)
for depth, forest, preds in option_1_runs:
    print("Test Score Max Depth={}: ".format(depth), forest.score(X_test_all, y_test_all))

print(divider)
# Mean Squared Error
for depth, forest, preds in option_1_runs:
    print("Mean Squared Error Max Depth={}: ".format(depth), mean_squared_error(y_test_all, preds))

print(divider)
# Root Mean Squared Error (square root of the MSE)
for depth, forest, preds in option_1_runs:
    print("Root Mean Squared Error Max Depth={}: ".format(depth), mean_squared_error(y_test_all, preds) ** 0.5)

print(divider)
# Mean Absolute Error
for depth, forest, preds in option_1_runs:
    print("Mean Absolute Error Max Depth={}: ".format(depth), mean_absolute_error(y_test_all, preds))

----------------------------------------
Evaluate model: Option 1 Train/Test data
----------------------------------------
Training Score Max Depth=5:  0.41799973996294515
Training Score Max Depth=10:  0.7068096593528773
Training Score Max Depth=20:  0.7524609209454094
Training Score Max Depth=None:  0.7916640434142472
----------------------------------------
Test Score Max Depth=5:  0.25485260617660177
Test Score Max Depth=10:  0.2698024836923968
Test Score Max Depth=20:  0.2656228097527007
Test Score Max Depth=None:  0.2314309730510855
----------------------------------------
Mean Squared Error Max Depth=5:  60.38003131232445
Mean Squared Error Max Depth=10:  59.16862793092437
Mean Squared Error Max Depth=20:  59.507311049789266
Mean Squared Error Max Depth=None:  62.27790944117102
----------------------------------------
Root Mean Squared Error Max Depth=5:  7.770458886856326
Root Mean Squared Error Max Depth=10:  7.692114659241915
Root Mean Squared Error Max Depth=20:  7.7140982006

In [31]:
# Scores for model with Option 2 Train/Test data
#
# RMSE is computed as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, so the explicit square root keeps this
# cell working on both old and new versions.

# (max_depth label, fitted forest, test-set predictions) for each Option 2 model.
option_2_runs = [
    ("5", regr_5, y_pred5),
    ("10", regr_6, y_pred6),
    ("20", regr_7, y_pred7),
    ("None", regr_8, y_pred8),
]
divider = "----------------------------------------"

print(divider)
print("Evaluate model: Option 2 Train/Test data")
print(divider)
# Training Score (R^2 on the 2019-2020 data each forest was fitted on)
for depth, forest, preds in option_2_runs:
    print("Training Score Max Depth={}: ".format(depth), forest.score(X_train_2019_2020, y_train_2019_2020))

print(divider)
# Test Score (R^2 on the 2021 data)
for depth, forest, preds in option_2_runs:
    print("Test Score Max Depth={}: ".format(depth), forest.score(X_test_2021, y_test_2021))

print(divider)
# Mean Squared Error
for depth, forest, preds in option_2_runs:
    print("Mean Squared Error Max Depth={}: ".format(depth), mean_squared_error(y_test_2021, preds))

print(divider)
# Root Mean Squared Error (square root of the MSE)
for depth, forest, preds in option_2_runs:
    print("Root Mean Squared Error Max Depth={}: ".format(depth), mean_squared_error(y_test_2021, preds) ** 0.5)

print(divider)
# Mean Absolute Error
for depth, forest, preds in option_2_runs:
    print("Mean Absolute Error Max Depth={}: ".format(depth), mean_absolute_error(y_test_2021, preds))

----------------------------------------
Evaluate model: Option 2 Train/Test data
----------------------------------------
Training Score Max Depth=5:  0.5574286989188226
Training Score Max Depth=10:  0.6034232696785777
Training Score Max Depth=20:  0.6451802895359999
Training Score Max Depth=None:  0.6713198992686655
----------------------------------------
Test Score Max Depth=5:  -4.886126551373792
Test Score Max Depth=10:  -5.881360271429575
Test Score Max Depth=20:  -3.001394405890035
Test Score Max Depth=None:  -4.11551642837502
----------------------------------------
Mean Squared Error Max Depth=5:  207.26432449066684
Mean Squared Error Max Depth=10:  242.30883855221913
Mean Squared Error Max Depth=20:  140.8984841421675
Mean Squared Error Max Depth=None:  180.12933423944085
----------------------------------------
Root Mean Squared Error Max Depth=5:  14.396677550416515
Root Mean Squared Error Max Depth=10:  15.566272468135045
Root Mean Squared Error Max Depth=20:  11.87006672