In [1]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [2]:
#  import csv
df = pd.read_csv('../Resources/final_data.csv')
df.head()

Unnamed: 0,county,year,violent_crime,murder,rape,robbery,aggravated_assault,property_crime,burglary,larceny_theft,...,frm_15,points_15,median_hh_income,median_hh_inc_moe,poverty_count,poverty_count_moe,poverty_rate,poverty_rate_moe,county_fips,price
0,Atlantic,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,62678,2822,29057,4251,1.6,11.3,1,196067.42
1,Bergen,2019,2,0,2,0,0,46,2,44,...,3.391731,0.475,107971,3025,52980,7662,0.8,5.7,3,494018.42
2,Burlington,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,88443,3233,24961,4374,1.0,5.7,5,238593.67
3,Camden,2019,2,0,0,0,2,44,4,38,...,3.391731,0.475,73168,2374,53641,7048,1.4,10.7,7,181980.75
4,Cape May,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,66565,4753,8853,1981,2.2,9.8,9,389294.58


In [3]:
#  drop nulls
df.dropna(how='any', inplace = True)

In [4]:
le = LabelEncoder()
df['county'] = le.fit_transform(df['county'])

In [5]:
# df= pd.get_dummies(df)
# df.head()

In [6]:
df.columns

Index(['county', 'year', 'violent_crime', 'murder', 'rape', 'robbery',
       'aggravated_assault', 'property_crime', 'burglary', 'larceny_theft',
       'motor_vehicle_theft', 'arson', 'frm_30', 'points_30', 'frm_15',
       'points_15', 'median_hh_income', 'median_hh_inc_moe', 'poverty_count',
       'poverty_count_moe', 'poverty_rate', 'poverty_rate_moe', 'county_fips',
       'price'],
      dtype='object')

In [7]:
X = df.drop(["price", 'county_fips'], axis=1)
y = df["price"]
print(X.shape, y.shape)

(164, 22) (164,)


In [8]:
# Split data into training and testing groups and scale data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
X_train.columns

Index(['county', 'year', 'violent_crime', 'murder', 'rape', 'robbery',
       'aggravated_assault', 'property_crime', 'burglary', 'larceny_theft',
       'motor_vehicle_theft', 'arson', 'frm_30', 'points_30', 'frm_15',
       'points_15', 'median_hh_income', 'median_hh_inc_moe', 'poverty_count',
       'poverty_count_moe', 'poverty_rate', 'poverty_rate_moe'],
      dtype='object')

In [10]:
# Fit the data into model
rfm = RandomForestRegressor(n_estimators=200)
rfm.fit(X_train, y_train)

RandomForestRegressor(n_estimators=200)

In [11]:
sorted(zip(rfm.feature_importances_, X.columns), reverse=True)

[(0.2846869087676386, 'median_hh_income'),
 (0.2576220744093777, 'poverty_rate'),
 (0.1862823012622801, 'poverty_count_moe'),
 (0.09178812681399347, 'county'),
 (0.0832059721096721, 'poverty_count'),
 (0.04637015146810329, 'poverty_rate_moe'),
 (0.010138760142277247, 'median_hh_inc_moe'),
 (0.009646855508658085, 'year'),
 (0.005028514723692544, 'points_15'),
 (0.004672610189353079, 'points_30'),
 (0.0039097272471618994, 'violent_crime'),
 (0.0025156223610249113, 'aggravated_assault'),
 (0.0022883369753584275, 'property_crime'),
 (0.0022232107466449345, 'larceny_theft'),
 (0.0020645250788658928, 'frm_30'),
 (0.001871823648046364, 'frm_15'),
 (0.0016238847975872845, 'burglary'),
 (0.001356990361020993, 'motor_vehicle_theft'),
 (0.0009456655389491987, 'arson'),
 (0.0007632861497231665, 'rape'),
 (0.0006640094135786903, 'robbery'),
 (0.00033064228699207043, 'murder')]

In [12]:
predictions = rfm.predict(X_test)
base_train_accuracy = round(rfm.score(X_train, y_train)*100,3)
base_test_accuracy = round(rfm.score(X_test, y_test)*100,3)
print(f"Training Data Score: {rfm.score(X_train, y_train)}")
print(f"Testing Data Score: {rfm.score(X_test, y_test)}")

Training Data Score: 0.9721879613099971
Testing Data Score: 0.8152753276092031


In [13]:
from sklearn.metrics import mean_squared_error as MSE

In [14]:
MSE_score = MSE(y_test,predictions)
print("Mean Squared Error",MSE_score.mean())

Mean Squared Error 1610873782.8143485


# Hyperparameter Tuning

In [15]:
# Get randomforest params
rfm.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [16]:
# Create the GridSearchCV model
param_grid = {
    'n_estimators': [200, 100, 150],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion': ['mse', 'mae'],
    'max_depth': [None, 5, 10]
}
grid = GridSearchCV(rfm, param_grid, cv=5, verbose=0)

In [17]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(n_estimators=200),
             param_grid={'criterion': ['mse', 'mae'],
                         'max_depth': [None, 5, 10],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 100, 150]})

In [18]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'mae', 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 100}
0.7700413833165237


In [34]:
rfm = RandomForestRegressor(n_estimators=100,criterion= 'mae', max_features= 'auto', max_depth= 10 )
rfm.fit(X_train, y_train)

RandomForestRegressor(criterion='mae', max_depth=10)

In [35]:
predictions = rfm.predict(X_test)
tuned_train_accuracy = round(rfm.score(X_train, y_train)*100,3)
tuned_test_accuracy = round(rfm.score(X_test, y_test)*100,3)
print(f"Training Data Score: {rfm.score(X_train, y_train)}")
print(f"Testing Data Score: {rfm.score(X_test, y_test)}")

Training Data Score: 0.9787874886589915
Testing Data Score: 0.8474072451574186


In [36]:
MSE_score = MSE(y_test,predictions)
print("Mean Squared Error",MSE_score.mean())

Mean Squared Error 1330670478.6207967


In [37]:
#  save the model
filename = '../Models/NJ_rfm_house_price.sav'
joblib.dump(grid, filename)

['../Models/NJ_rfm_house_price.sav']

In [38]:
predictions = rfm.predict(X_test)
print(f"Predicted Labels: {predictions[:5]}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted Labels: [298608.6945  150360.5322  269932.57835 196847.0576  404045.4064 ]
Actual Labels: [224566.75, 137949.92, 310472.92, 165169.0, 394941.0]


# Model evaluation

In [39]:
evaluations = {'': ['Base Train Model', 'Base Test Model', 'Tuned Train Model', 'Tuned Test Model'],
               'RF Accuracy': [f"{base_train_accuracy}%", f"{base_test_accuracy}%", f"{tuned_train_accuracy}%", f"{tuned_test_accuracy}%"]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index('')

evaluations_df.to_csv('../Resources/RFM_eval_house_price.csv')
evaluations_df

Unnamed: 0,RF Accuracy
,
Base Train Model,97.219%
Base Test Model,81.528%
Tuned Train Model,97.879%
Tuned Test Model,84.741%
