In [76]:
import pandas as pd
import numpy as np
import bamboolib

In [236]:
cali = pd.read_csv('cali_2011_2018.csv', dtype={'geo_id': str})

In [237]:
features_test = cali.loc[cali['Date'] == 2018]
features_train = cali.loc[cali['Date'] != 2018]

In [238]:
#drop columns with 100% na (for test)
features_test = features_test.drop(columns=['households_retirement_income','commute_5_9_mins','amerindian_including_hispanic', 'asian_including_hispanic', 'black_including_hispanic', 'commute_35_39_mins', 'commute_40_44_mins', 'commute_60_89_mins', 'commute_90_more_mins'])
features_train = features_train.drop(columns=['households_retirement_income','commute_5_9_mins','amerindian_including_hispanic', 'asian_including_hispanic', 'black_including_hispanic', 'commute_35_39_mins', 'commute_40_44_mins', 'commute_60_89_mins', 'commute_90_more_mins'])

In [239]:
#drop columns with over 10% missing values (for train)
features_train = features_train.drop(columns=['bachelors_degree', 'associates_degree'])
features_test = features_test.drop(columns=['bachelors_degree', 'associates_degree'])

In [240]:
#drop zip codes 
features_train = features_train.drop(columns=['RegionName'])
features_test = features_test.drop(columns=['RegionName'])

In [241]:
#converting inf values to na
features_train[features_train == np.inf] = np.nan
features_test[features_test == np.inf] = np.nan

In [242]:
#fill missing values with the column mean 
features_test = features_test.fillna(features_test.mean())
features_train = features_train.fillna(features_train.mean())

In [243]:
# Labels are the values we want to predict
labels_train = features_train['avg_rent']
labels_test = features_test['avg_rent']

In [244]:
# Remove the labels from the features
# axis 1 refers to the columns
features_train = features_train.drop('avg_rent', axis = 1)
features_test = features_test.drop('avg_rent', axis = 1)

In [231]:
labels_test

8113    2317.250000
8114    2300.333333
8115    2225.916667
8116    2778.083333
8117    2764.250000
           ...     
9267    2942.000000
9268    2375.111111
9269    2276.583333
9270    1946.750000
9271    2400.416667
Name: avg_rent, Length: 1159, dtype: float64

In [191]:
# Saving feature names for later use
feature_train_list = list(features_train.columns)
feature_test_list = list(features_test.columns)

# Convert to numpy array
#features_train = np.array(features_train)
#features_test = np.array(features_test)

In [194]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model 
rf = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Train the model on training data
rf.fit(features_train, labels_train);

In [195]:
rf.score(features_train, labels_train)

0.9936359331129521

In [232]:
labels_test.isnull().sum()

0

In [234]:
labels_test

8113    2317.250000
8114    2300.333333
8115    2225.916667
8116    2778.083333
8117    2764.250000
           ...     
9267    2942.000000
9268    2375.111111
9269    2276.583333
9270    1946.750000
9271    2400.416667
Name: avg_rent, Length: 1159, dtype: float64

In [208]:
features_test.replace([np.inf, -np.inf], np.nan, inplace=True)
labels_test.replace([np.inf, -np.inf], np.nan, inplace=True)

In [245]:
rf.score(features_test, labels_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [203]:
print(rf.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1000, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(features_train, labels_train)