In [3]:
import pandas as pd
import numpy as np
import bamboolib

In [4]:
cali = pd.read_csv('cali_2011_2018.csv', dtype={'geo_id': str})

In [5]:
features_test = cali.loc[cali['Date'] == 2018]
features_train = cali.loc[cali['Date'] != 2018]

In [6]:
#drop columns with 100% na (for test)
features_test = features_test.drop(columns=['households_retirement_income','commute_5_9_mins','amerindian_including_hispanic', 'asian_including_hispanic', 'black_including_hispanic', 'commute_35_39_mins', 'commute_40_44_mins', 'commute_60_89_mins', 'commute_90_more_mins'])
features_train = features_train.drop(columns=['households_retirement_income','commute_5_9_mins','amerindian_including_hispanic', 'asian_including_hispanic', 'black_including_hispanic', 'commute_35_39_mins', 'commute_40_44_mins', 'commute_60_89_mins', 'commute_90_more_mins'])

In [7]:
#A better way to drop columns with 100% na..
features_test = features_test.dropna(axis=1, how='all')
features_train = features_train.dropna(axis=1, how='all')

In [8]:
#drop columns with over 10% missing values (for train)
features_train = features_train.drop(columns=['bachelors_degree', 'associates_degree'])
features_test = features_test.drop(columns=['bachelors_degree', 'associates_degree'])

In [9]:
#drop zip codes 
features_train = features_train.drop(columns=['RegionName'])
features_test = features_test.drop(columns=['RegionName'])

In [10]:
#converting inf values to na
features_train[features_train == np.inf] = np.nan
features_test[features_test == np.inf] = np.nan

In [11]:
#fill missing values with the column mean 
features_test = features_test.fillna(features_test.mean())
features_train = features_train.fillna(features_train.mean())

In [12]:
# Labels are the values we want to predict
labels_train = features_train['avg_rent']
labels_test = features_test['avg_rent']

In [13]:
# Remove the labels from the features
# axis 1 refers to the columns
features_train = features_train.drop('avg_rent', axis = 1)
features_test = features_test.drop('avg_rent', axis = 1)

In [14]:
# Saving feature names for later use
feature_train_list = list(features_train.columns)
feature_test_list = list(features_test.columns)

# Convert to numpy array
#features_train = np.array(features_train)
#features_test = np.array(features_test)

In [15]:
features_train.sum()

Date                             1.633958e+07
RegionID                         7.995578e+08
SizeRank                         3.769665e+07
aggregate_travel_time_to_work    3.197904e+09
amerindian_pop                   8.625172e+05
                                     ...     
white_male_45_54                 8.012285e+06
white_male_55_64                 7.555049e+06
white_pop                        9.995323e+07
worked_at_home                   5.916451e+06
workers_16_and_over              1.135857e+08
Length: 242, dtype: float64

In [16]:
features_test.sum() #problem! we have more columns in train than in test

Date                             2.338862e+06
RegionID                         1.142225e+08
SizeRank                         5.385236e+06
aggregate_travel_time_to_work    5.142971e+08
amerindian_pop                   1.169139e+05
                                     ...     
white_male_45_54                 1.026238e+06
white_male_55_64                 1.106979e+06
white_pop                        1.409066e+07
worked_at_home                   9.799335e+05
workers_16_and_over              1.740324e+07
Length: 229, dtype: float64

In [17]:
#problem solved. 
features_train = features_train.drop(columns=[col for col in features_train if col not in feature_test_list])
features_train.sum()

Date                             1.633958e+07
RegionID                         7.995578e+08
SizeRank                         3.769665e+07
aggregate_travel_time_to_work    3.197904e+09
amerindian_pop                   8.625172e+05
                                     ...     
white_male_45_54                 8.012285e+06
white_male_55_64                 7.555049e+06
white_pop                        9.995323e+07
worked_at_home                   5.916451e+06
workers_16_and_over              1.135857e+08
Length: 229, dtype: float64

In [18]:
labels_test

8113    2317.250000
8114    2300.333333
8115    2225.916667
8116    2778.083333
8117    2764.250000
           ...     
9267    2942.000000
9268    2375.111111
9269    2276.583333
9270    1946.750000
9271    2400.416667
Name: avg_rent, Length: 1159, dtype: float64

In [21]:
features_train.to_csv('features_train_ca.csv', index=False)
features_test.to_csv('features_test_ca.csv', index=False)
labels_train.to_csv('labels_train_ca.csv', index=False)
labels_test.to_csv('labels_test_ca.csv', index=False)

In [22]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model 
rf = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Train the model on training data
rf.fit(features_train, labels_train);

In [23]:
rf.score(features_train, labels_train)

0.9936552184628115

In [24]:
rf.score(features_test, labels_test)

0.9452651979297022

In [25]:
print(rf.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1000, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [26]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 50 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(features_train, labels_train)

In [None]:
rf_random.score(features_train, labels_train)

In [None]:
rf_random.score(features_test, labels_test)

In [None]:
rf_random.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [120, 130, 140, None],
    'max_features': ['sqrt', 100, 200, 230, 240],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [1, 2, 3],
    'n_estimators': [300, 350, 400, 450]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_search.fit(features_train, labels_train)
grid_search.set_params(verbose=1)
grid_search.best_params_

In [None]:
grid_search.score(features_train, labels_train)

In [None]:
grid_search.score(features_test, labels_test)

In [None]:
# get importance
importance = grid_search.coef_[0]
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()