### Import Dependencies

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

### Load Data

In [13]:
# Location of the preprocessed dataset (resolved against the notebook's working directory).
file = 'preprocessed_data.csv'

# Load the CSV into the DataFrame that the following cells operate on.
df = pd.read_csv(filepath_or_buffer=file)

# Show the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Company Name,City,State,Shipping Service,Created Date,Item/Bottle Count
0,0,3,7981,9,14,1/1/20,3.0
1,1,3,5188,4,14,12/31/19,10.0
2,2,3,6939,34,1,1/1/20,8.0
3,3,3,6939,34,1,1/2/20,6.0
4,4,3,10666,20,4,12/31/19,12.0


### Split Data into Training and Test sets
- RandomForest cannot extrapolate values outside of what is in the training data, so we must include random samples from every month of the year in the training set.

- We will not scale the data as it is unnecessary for RandomForest models.

In [14]:
# Convert 'Created Date' to datetime format.
#
# Most rows hold month/day/year text such as '1/1/20', but some rows (seen in
# May 2020) hold bare numbers such as 43940.85348, which a plain
# pd.to_datetime() call cannot parse as calendar dates.
# NOTE(review): those numbers look like spreadsheet serial dates (days counted
# from 1899-12-30) -- confirm against the raw export before trusting them.
created_raw = df['Created Date']

# Anything that parses as a number is treated as a serial date; text dates become NaN here.
created_serial = pd.to_numeric(created_raw, errors='coerce')
is_serial = created_serial.notna()

# Parse the text dates on their own (numeric rows are blanked out so they cannot
# break the parser); genuinely malformed text still raises instead of being hidden.
from_text = pd.to_datetime(created_raw.where(~is_serial))

# Interpret the numeric rows as fractional days since the spreadsheet epoch.
from_serial = pd.to_datetime(created_serial, unit='D', origin='1899-12-30')

# Merge both sources: each row comes from whichever parser applied to it.
df['created_date_converted'] = from_text.fillna(from_serial)
df['created_date_converted']


In [None]:
# Split the data by the calendar month of 'created_date_converted'.
# The month number is extracted once and reused for all twelve masks instead of
# being recomputed on every line. Rows whose date is missing match no month and
# therefore appear in none of the frames.
created_month = df['created_date_converted'].dt.month
(
    jan_df, feb_df, mar_df, apr_df, may_df, june_df,
    july_df, aug_df, sep_df, oct_df, nov_df, dec_df,
) = (df.loc[created_month == month_number] for month_number in range(1, 13))


In [None]:
# Draw a random train/test split from January's rows.
# Target: the 'Item/Bottle Count' column; features: every other column of jan_df.
target_column = 'Item/Bottle Count'
y_jan = jan_df[target_column]
X_jan = jan_df.drop(columns=[target_column])

# random_state is pinned so the split is reproducible across runs.
X_train_jan, X_test_jan, y_train_jan, y_test_jan = train_test_split(
    X_jan,
    y_jan,
    random_state=1,
)


In [15]:
# # split into features and target
# X = df.drop(columns=['Item/Bottle Count'], axis=1)
# y = df['Item/Bottle Count']

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Build RandomForestRegressor Model

In [16]:
# # Fit regressor models with different max_depth
# regr_1 = RandomForestRegressor(max_depth=5)
# regr_2 = RandomForestRegressor(max_depth=10)
# regr_3 = RandomForestRegressor(max_depth=20)
# regr_4 = RandomForestRegressor(max_depth=None)

# regr_1.fit(X_train_scaled,y_train)
# regr_2.fit(X_train_scaled,y_train)
# regr_3.fit(X_train_scaled,y_train)
# regr_4.fit(X_train_scaled,y_train)

In [17]:
# y_pred1 = regr_1.predict(X_test_scaled)
# y_pred2 = regr_2.predict(X_test_scaled)
# y_pred3 = regr_3.predict(X_test_scaled)
# y_pred4 = regr_4.predict(X_test_scaled)

### Print R-Squared Scores (how well does the model predict the target variable)

In [18]:
# # Training Score
# print("Training Score Max Depth=5: ", regr_1.score(X_train_scaled,y_train))
# print("Training Score Max Depth=10: ", regr_2.score(X_train_scaled,y_train))
# print("Training Score Max Depth=20: ", regr_3.score(X_train_scaled,y_train))
# print("Training Score Max Depth=None: ", regr_4.score(X_train_scaled,y_train))

# # Test Score
# print("Test Score Max Depth=5: ", regr_1.score(X_test_scaled, y_test))
# print("Test Score Max Depth=10: ", regr_2.score(X_test_scaled, y_test))
# print("Test Score Max Depth=20: ", regr_3.score(X_test_scaled, y_test))
# print("Test Score Max Depth=None: ", regr_4.score(X_test_scaled, y_test))