In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
import altair as alt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from pyarrow import feather

In [7]:
# load in feather datasets 
# training dataset
avocado = pd.read_feather("../data/train.feather")
avocado.head()
# test dataset
# avocado_test = pd.read_feather("test.feather")
# avocado_test.head()

Unnamed: 0,X1,date,average_price,total_volume,PLU_4046,PLU_4225,PLU_4770,total_bags,small_bags,large_bags,xlarge_bags,type,year,region,month,year_month
0,0.0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015.0,Albany,Dec,2015-12
1,1.0,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015.0,Albany,Dec,2015-12
2,2.0,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015.0,Albany,Dec,2015-12
3,3.0,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015.0,Albany,Dec,2015-12
4,4.0,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015.0,Albany,Nov,2015-11


In [8]:
# Which features do we want to keep?
avocado.columns
# want month, region, type as our features

Index(['X1', 'date', 'average_price', 'total_volume', 'PLU_4046', 'PLU_4225',
       'PLU_4770', 'total_bags', 'small_bags', 'large_bags', 'xlarge_bags',
       'type', 'year', 'region', 'month', 'year_month'],
      dtype='object')

In [10]:
# Split the data into target and features
avocado_x = avocado[['region', 'type', 'month']]
avocado_y = avocado['average_price']

# avocado_test_x = avocado_test[['region', 'type', 'month']]
# avocado_test_y = avocado['AveragePrice']

# need to convert categorical to numerical using one-hot-encoding
categorical_features = ['region', 'type', 'month']

preprocessor = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(drop="first"), categorical_features)
])

# applying one hot encoding to the training features
avocado_x = preprocessor.fit_transform(avocado_x)

# apply the same transformation to the test features (but don't fit!)

# avocado_test_x = pd.DataFrame(preprocessor.transform(avocado_test_x),
#                              index=avocado_test_x.index,
#                              columns=avocado_x.columns)

In [19]:
# fit rfr model
rfr = RandomForestRegressor()
rfr.fit(avocado_x, avocado_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [20]:
fold_accuracies = cross_val_score(estimator=rfr, X=avocado_x, y=avocado_y, cv=5)
print(fold_accuracies)
print(np.mean(fold_accuracies))
print(np.std(fold_accuracies))
# the standard deviation is much greater than the mean accuracy, 
# this is probably not a good model...

[-0.99710882  0.16182704  0.52131528 -0.19385108  0.17763216]
-0.0660370854441678
0.5175926113359542


In [21]:
# find optimal hyperparameters

rfr_parameters = {'max_depth': range(1, 20),
                  'n_estimators': range(1, 100)}

random_rfr = RandomizedSearchCV(rfr, rfr_parameters, n_iter=10,
             cv=5, scoring='neg_mean_squared_error')

random_rfr.fit(avocado_x, avocado_y)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [22]:
features = pd.get_dummies(avocado[['region', 'type', 'month']])
feature_list = list(features.columns)

In [23]:
list(zip(feature_list, rfr.feature_importances_))

[('region_Albany', 0.0037480012284520563),
 ('region_Atlanta', 0.004090309689735909),
 ('region_BaltimoreWashington', 0.005346279151496791),
 ('region_Boise', 0.005042642589697402),
 ('region_Boston', 0.006002577347143965),
 ('region_BuffaloRochester', 0.0033752913384945313),
 ('region_California', 0.010965908961202194),
 ('region_Charlotte', 0.0060603543127698755),
 ('region_Chicago', 0.006084255012448982),
 ('region_CincinnatiDayton', 0.004417577482803759),
 ('region_Columbus', 0.014726094731566905),
 ('region_DallasFtWorth', 0.006027220205557338),
 ('region_Denver', 0.0041828767594310435),
 ('region_Detroit', 0.003576106990033672),
 ('region_GrandRapids', 0.002218941532252344),
 ('region_GreatLakes', 0.0035509209437789447),
 ('region_HarrisburgScranton', 0.03158065434778951),
 ('region_HartfordSpringfield', 0.018350575895272194),
 ('region_Houston', 0.0025961573059661534),
 ('region_Indianapolis', 0.005038896901106402),
 ('region_Jacksonville', 0.007683326499878723),
 ('region_LasVe

In [24]:
rfr.score(avocado_x, avocado_y)

0.7098740329320563

In [25]:
lr = LinearRegression()
lr.fit(avocado_x, avocado_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
lr.score(avocado_x, avocado_y)

0.6093976419611092

In [27]:
list(zip(feature_list, lr.coef_))

[('region_Albany', -0.21956689704181856),
 ('region_Atlanta', -0.02063048139350373),
 ('region_BaltimoreWashington', -0.2252418053543965),
 ('region_Boise', -0.03542732401517944),
 ('region_Boston', -0.04607431124402879),
 ('region_BuffaloRochester', -0.16470506225733672),
 ('region_California', 0.06159784503280273),
 ('region_Charlotte', 5.4598699459447435e-05),
 ('region_Chicago', -0.35467143646478966),
 ('region_CincinnatiDayton', -0.3076291833538522),
 ('region_Columbus', -0.47154125332270175),
 ('region_DallasFtWorth', -0.33197930758110905),
 ('region_Denver', -0.27899749448048494),
 ('region_Detroit', -0.059215353412605545),
 ('region_GrandRapids', -0.21923596071127344),
 ('region_GreatLakes', -0.05203421669515846),
 ('region_HarrisburgScranton', 0.2673703795934898),
 ('region_HartfordSpringfield', -0.5060012988056523),
 ('region_Houston', -0.2397526611836631),
 ('region_Indianapolis', -0.04520831253474802),
 ('region_Jacksonville', -0.1752177232435257),
 ('region_LasVegas', -0.3