In [88]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
import altair as alt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [89]:
# Load the wrangled and split data
avocado = pd.read_csv("avocado_train_clean.csv")
avocado2 = pd.read_csv("avocado_train_raw.csv")
# must load in avocado_test 

In [20]:
# Which features do we want to keep?
avocado.columns
# want month, region, type

Index(['X1', 'Date', 'AveragePrice', 'Total Volume', 'Total Bags', 'type',
       'year', 'region', 'PLU', 'no_sold', 'bag_size', 'bags_sold', 'month',
       'year_month'],
      dtype='object')

In [80]:
# Split the data into target and features
avocado_y = avocado['AveragePrice']
avocado_x = avocado[['region', 'type', 'month']]

# need to convert categorical to numerical using one-hot-encoding
categorical_features = ['region', 'type', 'month']

preprocessor = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(drop="first"), categorical_features)
])

avocado_x = preprocessor.fit_transform(avocado_x)

# must also transform the x_test data

#avocado_test_x = pd.DataFrame(preprocessor.transform(avocado_test_x),
#                              index=avocado_test_x.index,
#                              columns=avocado_x.columns)

In [81]:
# fit rfr model
rfr = RandomForestRegressor()
rfr.fit(avocado_x, avocado_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [83]:
# find optimal hyperparameters

rfr_parameters = {'max_depth': range(1, 20),
                  'n_estimators': range(1, 100)}

RandomizedSearchCV(rfr, rfr_parameters,
             cv=5).fit(avocado_x, avocado_y)

In [62]:
avocado

Unnamed: 0,X1,Date,AveragePrice,Total Volume,Total Bags,type,year,region,PLU,no_sold,bag_size,bags_sold,month,year_month
0,1,2015-12-20,1.35,54876.98,9505.56,conventional,2015,Albany,4046,674.28,Small Bags,9408.07,Dec,2015-12
1,2,2015-12-13,0.93,118220.22,8145.35,conventional,2015,Albany,4046,794.70,Small Bags,8042.21,Dec,2015-12
2,3,2015-12-06,1.08,78992.15,5811.16,conventional,2015,Albany,4046,1132.00,Small Bags,5677.40,Dec,2015-12
3,4,2015-11-29,1.28,51039.60,6183.95,conventional,2015,Albany,4046,941.48,Small Bags,5986.26,Nov,2015-11
4,6,2015-11-15,0.99,83453.76,8318.86,conventional,2015,Albany,4046,1368.92,Small Bags,8196.81,Nov,2015-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131390,5,2018-02-18,1.56,17597.12,13776.71,organic,2018,WestTexNewMexico,4770,0.00,XLarge Bags,0.00,Feb,2018-02
131391,7,2018-02-04,1.63,17074.83,13498.67,organic,2018,WestTexNewMexico,4770,0.00,XLarge Bags,0.00,Feb,2018-02
131392,8,2018-01-28,1.71,13888.04,9264.84,organic,2018,WestTexNewMexico,4770,0.00,XLarge Bags,0.00,Jan,2018-01
131393,9,2018-01-21,1.87,13766.76,9394.11,organic,2018,WestTexNewMexico,4770,727.94,XLarge Bags,0.00,Jan,2018-01


In [36]:
rfr.feature_importances_

array([0.00362001, 0.00381387, 0.00384277, 0.00516104, 0.00582373,
       0.0028342 , 0.0092982 , 0.00523171, 0.00535618, 0.00417197,
       0.01473986, 0.00590664, 0.00360519, 0.00296486, 0.00206718,
       0.00399987, 0.03170659, 0.0191726 , 0.00260061, 0.00394234,
       0.00665036, 0.00702894, 0.00326149, 0.00253224, 0.00108631,
       0.00534748, 0.00234568, 0.01821714, 0.00870506, 0.00230928,
       0.00347041, 0.01011997, 0.01764205, 0.00307753, 0.00142268,
       0.0051306 , 0.00650568, 0.00286029, 0.00415068, 0.01079123,
       0.00434373, 0.03107275, 0.00538699, 0.00150416, 0.01383668,
       0.00138741, 0.00671488, 0.00244471, 0.00512981, 0.00185128,
       0.00193968, 0.00539364, 0.0124447 , 0.5384703 , 0.01407611,
       0.00382699, 0.00866664, 0.00576042, 0.00806428, 0.00438672,
       0.00279491, 0.00322644, 0.00741946, 0.02649319, 0.02084961])

In [85]:
list(zip(feature_list, rfr.feature_importances_))

[('region_Albany', 0.003642420271223255),
 ('region_Atlanta', 0.003862730438812623),
 ('region_BaltimoreWashington', 0.0038297064142683783),
 ('region_Boise', 0.005211331234078844),
 ('region_Boston', 0.005849615589820036),
 ('region_BuffaloRochester', 0.002807831695679179),
 ('region_California', 0.009253725572062157),
 ('region_Charlotte', 0.005220019578309617),
 ('region_Chicago', 0.005256480444419881),
 ('region_CincinnatiDayton', 0.004116928739247078),
 ('region_Columbus', 0.01475884662263401),
 ('region_DallasFtWorth', 0.006011350009515482),
 ('region_Denver', 0.003532694562228657),
 ('region_Detroit', 0.0030726286216170486),
 ('region_GrandRapids', 0.00203895261746701),
 ('region_GreatLakes', 0.004060034011414796),
 ('region_HarrisburgScranton', 0.03177927521409502),
 ('region_HartfordSpringfield', 0.01919939355328656),
 ('region_Houston', 0.0025913293447321047),
 ('region_Indianapolis', 0.003987324014118162),
 ('region_Jacksonville', 0.006580194074187059),
 ('region_LasVegas', 

In [54]:
features = pd.get_dummies(avocado[['region', 'type', 'month']])
feature_list = list(features.columns)

['region_Albany', 'region_Atlanta', 'region_BaltimoreWashington', 'region_Boise', 'region_Boston', 'region_BuffaloRochester', 'region_California', 'region_Charlotte', 'region_Chicago', 'region_CincinnatiDayton', 'region_Columbus', 'region_DallasFtWorth', 'region_Denver', 'region_Detroit', 'region_GrandRapids', 'region_GreatLakes', 'region_HarrisburgScranton', 'region_HartfordSpringfield', 'region_Houston', 'region_Indianapolis', 'region_Jacksonville', 'region_LasVegas', 'region_LosAngeles', 'region_Louisville', 'region_MiamiFtLauderdale', 'region_Midsouth', 'region_Nashville', 'region_NewOrleansMobile', 'region_NewYork', 'region_Northeast', 'region_NorthernNewEngland', 'region_Orlando', 'region_Philadelphia', 'region_PhoenixTucson', 'region_Pittsburgh', 'region_Plains', 'region_Portland', 'region_RaleighGreensboro', 'region_RichmondNorfolk', 'region_Roanoke', 'region_Sacramento', 'region_SanDiego', 'region_SanFrancisco', 'region_Seattle', 'region_SouthCarolina', 'region_SouthCentral', 

In [69]:
rfr.score(avocado_x, avocado_y)

0.7043800061235217

In [74]:
lr = LinearRegression()
lr.fit(avocado_x, avocado_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [75]:
lr.score(avocado_x, avocado_y)

0.6078288201088087

In [79]:
list(zip(feature_list, lr.coef_))

[('region_Albany', -0.22398985024759535),
 ('region_Atlanta', -0.023940270098505547),
 ('region_BaltimoreWashington', -0.20713715989451348),
 ('region_Boise', -0.031748084799057115),
 ('region_Boston', -0.04403079844011538),
 ('region_BuffaloRochester', -0.16575368048122055),
 ('region_California', 0.045763506763110896),
 ('region_Charlotte', -0.005320359078595685),
 ('region_Chicago', -0.34970535113860535),
 ('region_CincinnatiDayton', -0.31197654961569043),
 ('region_Columbus', -0.47345230908200653),
 ('region_DallasFtWorth', -0.3438154205246774),
 ('region_Denver', -0.2848558613854846),
 ('region_Detroit', -0.06140603371257497),
 ('region_GrandRapids', -0.2215630838009708),
 ('region_GreatLakes', -0.04433344065372997),
 ('region_HarrisburgScranton', 0.2574802522876479),
 ('region_HartfordSpringfield', -0.5133138647905581),
 ('region_Houston', -0.242619410540572),
 ('region_Indianapolis', -0.05266871296219488),
 ('region_Jacksonville', -0.18104589278440525),
 ('region_LasVegas', -0.3

In [94]:
# Split the data into target and features
avocado_train_y2 = avocado2['AveragePrice']
avocado_train_x2 = avocado2[['region', 'type', 'month']]

# need to convert categorical to numerical using one-hot-encoding
categorical_features = ['region', 'type', 'month']

preprocessor = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(drop="first"), categorical_features)
])

avocado_train_x2 = preprocessor.fit_transform(avocado_train_x2)

# must also transform the x_test data

#avocado_test_x = pd.DataFrame(preprocessor.transform(avocado_test_x),
#                              index=avocado_test_x.index,
#                              columns=avocado_x.columns)

In [95]:
rfr.fit(avocado_train_x2, avocado_train_y2)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [96]:
# find optimal hyperparameters
RandomizedSearchCV(rfr, rfr_parameters,
             cv=5).fit(avocado_train_x2, avocado_train_y2)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [97]:
list(zip(feature_list, rfr.feature_importances_))

[('region_Albany', 0.00432286639227454),
 ('region_Atlanta', 0.0034503022922991533),
 ('region_BaltimoreWashington', 0.004651477422238678),
 ('region_Boise', 0.0055698639337435196),
 ('region_Boston', 0.006170938087110058),
 ('region_BuffaloRochester', 0.003155223078158226),
 ('region_California', 0.009594271839611245),
 ('region_Charlotte', 0.00577901037843646),
 ('region_Chicago', 0.0061827270818456655),
 ('region_CincinnatiDayton', 0.004770331973116497),
 ('region_Columbus', 0.01556173126070064),
 ('region_DallasFtWorth', 0.007072410509158438),
 ('region_Denver', 0.003883927798539314),
 ('region_Detroit', 0.004056556003938128),
 ('region_GrandRapids', 0.0025887281316199764),
 ('region_GreatLakes', 0.0032210161835153047),
 ('region_HarrisburgScranton', 0.030582435756993192),
 ('region_HartfordSpringfield', 0.017961054243405255),
 ('region_Houston', 0.0028062642573921845),
 ('region_Indianapolis', 0.004593657956603099),
 ('region_Jacksonville', 0.006498824299481722),
 ('region_LasVega