In [1]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [2]:
# Load the prepared dataset from the Resources folder and preview the first rows
data_path = '../Resources/final_data.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,county,year,violent_crime,murder,rape,robbery,aggravated_assault,property_crime,burglary,larceny_theft,...,frm_15,points_15,median_hh_income,median_hh_inc_moe,poverty_count,poverty_count_moe,poverty_rate,poverty_rate_moe,county_fips,price
0,Atlantic,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,62678,2822,29057,4251,1.6,11.3,1,196067.42
1,Bergen,2019,2,0,2,0,0,46,2,44,...,3.391731,0.475,107971,3025,52980,7662,0.8,5.7,3,494018.42
2,Burlington,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,88443,3233,24961,4374,1.0,5.7,5,238593.67
3,Camden,2019,2,0,0,0,2,44,4,38,...,3.391731,0.475,73168,2374,53641,7048,1.4,10.7,7,181980.75
4,Cape May,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,66565,4753,8853,1981,2.2,9.8,9,389294.58


In [3]:
# Discard every row that contains at least one missing value
df.dropna(axis=0, how='any', inplace=True)

In [4]:
# Replace the county names with integer codes: learn the set of counties
# first, then map the column onto the learned codes
le = LabelEncoder()
le.fit(df['county'])
df['county'] = le.transform(df['county'])

In [5]:
# Alternative encoding left disabled: one-hot encode the frame with
# pd.get_dummies instead of the label encoding applied to `county`
# df= pd.get_dummies(df)
# df.head()

In [6]:
# Inspect the column labels of the cleaned, label-encoded frame
df.columns

Index(['county', 'year', 'violent_crime', 'murder', 'rape', 'robbery',
       'aggravated_assault', 'property_crime', 'burglary', 'larceny_theft',
       'motor_vehicle_theft', 'arson', 'frm_30', 'points_30', 'frm_15',
       'points_15', 'median_hh_income', 'median_hh_inc_moe', 'poverty_count',
       'poverty_count_moe', 'poverty_rate', 'poverty_rate_moe', 'county_fips',
       'price'],
      dtype='object')

In [7]:
# Target: the house price column
y = df["price"]
# Features: everything except the target and the county_fips column
# (presumably redundant with the encoded `county` - TODO confirm)
X = df.drop(columns=["price", "county_fips"])
print(X.shape, y.shape)

(164, 22) (164,)


In [8]:
# Hold out a test set (library-default proportions) with a fixed seed so the
# same rows land in each group on every run; no feature scaling is applied
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [9]:
# Confirm which feature columns the training split contains
X_train.columns

Index(['county', 'year', 'violent_crime', 'murder', 'rape', 'robbery',
       'aggravated_assault', 'property_crime', 'burglary', 'larceny_theft',
       'motor_vehicle_theft', 'arson', 'frm_30', 'points_30', 'frm_15',
       'points_15', 'median_hh_income', 'median_hh_inc_moe', 'poverty_count',
       'poverty_count_moe', 'poverty_rate', 'poverty_rate_moe'],
      dtype='object')

In [10]:
# Fit the baseline random forest (200 trees) on the training split.
# A fixed random_state makes the bootstrap samples and per-split feature
# draws repeatable, so the scores and feature importances reported later
# do not change on every Restart & Run All.
rfm = RandomForestRegressor(n_estimators=200, random_state=1)
rfm.fit(X_train, y_train)

RandomForestRegressor(n_estimators=200)

In [11]:
# Pair each importance with its column name and rank from most to least important
importance_pairs = zip(rfm.feature_importances_, X.columns)
sorted(importance_pairs, reverse=True)

[(0.3064266894657471, 'median_hh_income'),
 (0.23122684538580748, 'poverty_rate'),
 (0.18072407602715412, 'poverty_count_moe'),
 (0.09201262395498105, 'poverty_count'),
 (0.0821428074756532, 'county'),
 (0.05068823363251795, 'poverty_rate_moe'),
 (0.013887271690195428, 'median_hh_inc_moe'),
 (0.010340646429595606, 'year'),
 (0.006059289446165756, 'points_15'),
 (0.004407240975340744, 'points_30'),
 (0.0037265937672828495, 'violent_crime'),
 (0.002888441190314431, 'larceny_theft'),
 (0.0023929574072755356, 'property_crime'),
 (0.002224497228122884, 'frm_30'),
 (0.002211526014624833, 'motor_vehicle_theft'),
 (0.0021827513209132857, 'frm_15'),
 (0.0017262624946621354, 'burglary'),
 (0.0014261669753228062, 'aggravated_assault'),
 (0.00131663550759126, 'rape'),
 (0.0009730967504049081, 'murder'),
 (0.0007940841840535005, 'robbery'),
 (0.0002212626762732266, 'arson')]

In [12]:
# Evaluate the baseline forest. For a regressor, score() is the R^2
# coefficient of determination, not a classification accuracy.
predictions = rfm.predict(X_test)
train_score = rfm.score(X_train, y_train)
test_score = rfm.score(X_test, y_test)
# Percent-scaled copies (3 decimals) reused by the evaluation table
base_train_accuracy = round(train_score*100,3)
base_test_accuracy = round(test_score*100,3)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.9703931467186633
Testing Data Score: 0.8264450879539609


In [None]:
from sklearn.metrics import mean_squared_error as MSE

In [None]:
# Mean squared error of the test-set predictions.
# mean_squared_error already returns a single scalar for this 1-D target, so
# it is printed directly: the former .mean() call was redundant and raises
# AttributeError when the metric comes back as a plain Python float.
MSE_score = MSE(y_test, predictions)
print("Mean Squared Error", MSE_score)

# Hyperparameter Tuning

In [None]:
# Show the full hyperparameter dictionary of the current forest
# (useful for deciding which settings to include in the search grid)
rfm.get_params()

In [None]:
# Create the GridSearchCV model.
# The option names below are the ones accepted by current scikit-learn:
#   * max_features=1.0 considers every feature at each split, which is what
#     the removed 'auto' alias meant for regressors
#   * 'squared_error' / 'absolute_error' are the renamed 'mse' / 'mae' criteria
# NOTE(review): the renamed criteria require scikit-learn >= 1.0.
param_grid = {
    'n_estimators': [200, 100, 150],
    'max_features': [1.0, 'sqrt', 'log2'],
    'criterion': ['squared_error', 'absolute_error'],
    'max_depth': [None, 5, 10]
}
# Exhaustive search over every combination with 5-fold cross-validation
grid = GridSearchCV(rfm, param_grid, cv=5, verbose=0)

In [None]:
# Run the cross-validated grid search on the training split only;
# the test split is kept aside for the evaluation cells that follow
grid.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination, then its mean cross-validated score
for result in (grid.best_params_, grid.best_score_):
    print(result)

In [None]:
# Re-train the forest with the hyperparameters the grid search selected,
# rather than hard-coding values copied from an earlier run: those go stale
# whenever the search result changes and used option names ('mae', 'auto')
# that newer scikit-learn releases reject.
# random_state keeps the tuned model's scores repeatable between runs.
rfm = RandomForestRegressor(random_state=1, **grid.best_params_)
rfm.fit(X_train, y_train)

In [None]:
# Evaluate the tuned forest. For a regressor, score() is the R^2
# coefficient of determination, not a classification accuracy.
predictions = rfm.predict(X_test)
train_score = rfm.score(X_train, y_train)
test_score = rfm.score(X_test, y_test)
# Percent-scaled copies (3 decimals) reused by the evaluation table
tuned_train_accuracy = round(train_score*100,3)
tuned_test_accuracy = round(test_score*100,3)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

In [None]:
# Mean squared error of the tuned model's test-set predictions.
# mean_squared_error already returns a single scalar for this 1-D target, so
# it is printed directly: the former .mean() call was redundant and raises
# AttributeError when the metric comes back as a plain Python float.
MSE_score = MSE(y_test, predictions)
print("Mean Squared Error", MSE_score)

In [None]:
# Persist the fitted search object to disk with joblib.
# NOTE(review): this serializes `grid` (the GridSearchCV wrapper), not the
# re-trained `rfm` that the surrounding cells evaluate - confirm which object
# consumers of the .sav file are expected to load.
filename = '../Models/NJ_rfm_house_price.sav'
joblib.dump(grid, filename)

In [None]:
# Compare the first five predicted prices with the first five true prices
predictions = rfm.predict(X_test)
first_predicted = predictions[:5]
first_actual = list(y_test.iloc[:5])
print(f"Predicted Labels: {first_predicted}")
print(f"Actual Labels: {first_actual}")

# Model evaluation

In [None]:
# Summary table: one row per model/split, scores formatted as percentages.
# The values are the percent-scaled score() results computed earlier.
row_labels = ['Base Train Model', 'Base Test Model', 'Tuned Train Model', 'Tuned Test Model']
scores = [base_train_accuracy, base_test_accuracy, tuned_train_accuracy, tuned_test_accuracy]
evaluations = {
    '': row_labels,
    'RF Accuracy': [f"{score}%" for score in scores],
}

# The unnamed label column becomes the index of the table
evaluations_df = pd.DataFrame(evaluations).set_index('')

# Write the table to disk, then display it as the cell output
evaluations_df.to_csv('../Resources/RFM_eval_house_price.csv')
evaluations_df