### Import Dependencies

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

### Load Data

In [2]:
# Path of the preprocessed dataset, relative to the notebook's working directory.
file = 'preprocessed_data.csv'
# Load the CSV into a DataFrame; later cells derive the features and target from `df`.
df = pd.read_csv(file)
# Last expression of the cell: displays the first rows as a quick sanity check.
df.head()

Unnamed: 0,Company Name,Ship Date,City,State,Shipping Service,Created Date,Weight,Item/Bottle Count
0,1,43952.87139,1555,34,5,43940.85348,10.5,3
1,1,43952.87149,5113,44,1,43943.52123,20.2,6
2,1,43952.87149,1341,5,9,43945.43794,20.2,6
3,1,43952.87148,4881,4,4,43945.60456,10.5,3
4,1,43952.87148,4984,44,1,43947.43795,39.0,12


### Scale and Split Data
##### The 'Weight' column is dropped completely because it contains float values and cannot be used in the regression model without additional preprocessing

In [3]:
# Target: the item/bottle count of each row.
y = df['Item/Bottle Count']

# Features: every remaining column except the target itself and 'Weight'.
X = df.drop(columns=['Weight', 'Item/Bottle Count'])

# Hold out a test split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [4]:
# Min-max scale the features. The scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both the train and test splits.
scaler = MinMaxScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

### Build RandomForestRegressor Model

In [6]:
# Fit random-forest *regressor* models that differ only in max_depth
# (None lets the trees grow without a depth limit).
#
# random_state is fixed (same seed as the train/test split) so that the
# randomness inside each forest is repeatable and the scores printed further
# down can be reproduced after a kernel restart.
regr_1 = RandomForestRegressor(max_depth=5, random_state=1)
regr_2 = RandomForestRegressor(max_depth=10, random_state=1)
regr_3 = RandomForestRegressor(max_depth=20, random_state=1)
regr_4 = RandomForestRegressor(max_depth=None, random_state=1)

# Train every model on the same scaled training features and target.
regr_1.fit(X_train_scaled, y_train)
regr_2.fit(X_train_scaled, y_train)
regr_3.fit(X_train_scaled, y_train)
regr_4.fit(X_train_scaled, y_train)

RandomForestRegressor()

In [7]:
# Predict the target for the scaled test features with each fitted model,
# in the same order the models were defined (depth 5, 10, 20, None).
y_pred1, y_pred2, y_pred3, y_pred4 = (
    forest.predict(X_test_scaled)
    for forest in (regr_1, regr_2, regr_3, regr_4)
)

### Print R-Squared Scores (how well does the model predict the target variable)

In [8]:
# (max_depth label, fitted model) pairs, in the order the models were built.
_depth_models = (
    ("5", regr_1),
    ("10", regr_2),
    ("20", regr_3),
    ("None", regr_4),
)

# Training Score: score() of each model on the data it was fitted on.
for _depth, _regr in _depth_models:
    print(f"Training Score Max Depth={_depth}: ", _regr.score(X_train_scaled, y_train))

# Test Score: score() of each model on the held-out split.
for _depth, _regr in _depth_models:
    print(f"Test Score Max Depth={_depth}: ", _regr.score(X_test_scaled, y_test))

Training Score Max Depth=5:  0.5588446031272207
Training Score Max Depth=10:  0.6315532982823922
Training Score Max Depth=20:  0.8515773928355911
Training Score Max Depth=None:  0.9320415869860483
Test Score Max Depth=5:  0.5454299090285035
Test Score Max Depth=10:  0.5623558180748012
Test Score Max Depth=20:  0.5639402566209031
Test Score Max Depth=None:  0.5528493726055184
