In [1]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [2]:
#  import csv
df = pd.read_csv('../Resources/final_data.csv')
df.head()

Unnamed: 0,county,year,violent_crime,murder,rape,robbery,aggravated_assault,property_crime,burglary,larceny_theft,...,frm_15,points_15,median_hh_income,median_hh_inc_moe,poverty_count,poverty_count_moe,poverty_rate,poverty_rate_moe,county_fips,price
0,Atlantic,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,62678,2822,29057,4251,1.6,11.3,1,196067.42
1,Bergen,2019,2,0,2,0,0,46,2,44,...,3.391731,0.475,107971,3025,52980,7662,0.8,5.7,3,494018.42
2,Burlington,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,88443,3233,24961,4374,1.0,5.7,5,238593.67
3,Camden,2019,2,0,0,0,2,44,4,38,...,3.391731,0.475,73168,2374,53641,7048,1.4,10.7,7,181980.75
4,Cape May,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,66565,4753,8853,1981,2.2,9.8,9,389294.58


In [3]:
#  drop nulls
df.dropna(how='any', inplace = True)

In [29]:
# Set features. This will also be used as x values.
X = df.drop(['county', 'county_fips'], axis=1)
y = df["county"]
print(X.shape, y.shape)

(164, 22) (164,)


In [30]:
# Split data into training and testing groups and scale data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# X_scaler = MinMaxScaler().fit(X_train)
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [31]:
X_train.columns

Index(['year', 'violent_crime', 'murder', 'rape', 'robbery',
       'aggravated_assault', 'property_crime', 'burglary', 'larceny_theft',
       'motor_vehicle_theft', 'arson', 'frm_30', 'points_30', 'frm_15',
       'points_15', 'median_hh_income', 'median_hh_inc_moe', 'poverty_count',
       'poverty_count_moe', 'poverty_rate', 'poverty_rate_moe', 'price'],
      dtype='object')

In [32]:
# Fit the data into model
rfm = RandomForestClassifier(n_estimators=200)
rfm.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200)

In [33]:
sorted(zip(rfm.feature_importances_, X.columns), reverse=True)

[(0.14783315712721984, 'poverty_count_moe'),
 (0.14751062902836565, 'poverty_count'),
 (0.13053290200220777, 'price'),
 (0.12205276033148163, 'poverty_rate_moe'),
 (0.11136272906128586, 'poverty_rate'),
 (0.09419814261859433, 'median_hh_income'),
 (0.052478093195785405, 'median_hh_inc_moe'),
 (0.030224564613754893, 'larceny_theft'),
 (0.028809680752102784, 'property_crime'),
 (0.02204828910888317, 'violent_crime'),
 (0.01800900950397425, 'aggravated_assault'),
 (0.013884609134868163, 'frm_15'),
 (0.012957844063682751, 'burglary'),
 (0.012796668482574769, 'points_15'),
 (0.011915301879782054, 'frm_30'),
 (0.011115173120287989, 'points_30'),
 (0.011020038494646642, 'year'),
 (0.009679533805200946, 'motor_vehicle_theft'),
 (0.00707654734577621, 'robbery'),
 (0.0017948112212933431, 'rape'),
 (0.0014440478645742696, 'arson'),
 (0.001255467243657271, 'murder')]

In [34]:
predictions = rfm.predict(X_test)
base_train_accuracy = round(rfm.score(X_train, y_train)*100,3)
base_test_accuracy = round(rfm.score(X_test, y_test)*100,3)
print(f"Training Data Score: {rfm.score(X_train, y_train)}")
print(f"Testing Data Score: {rfm.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.926829268292683


In [35]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    Atlantic       1.00      1.00      1.00         2
      Bergen       1.00      1.00      1.00         1
  Burlington       1.00      1.00      1.00         2
      Camden       1.00      0.50      0.67         2
    Cape May       1.00      1.00      1.00         1
  Cumberland       1.00      1.00      1.00         4
       Essex       1.00      1.00      1.00         1
  Gloucester       1.00      1.00      1.00         1
      Hudson       1.00      1.00      1.00         3
   Hunterdon       1.00      1.00      1.00         1
      Mercer       1.00      1.00      1.00         2
   Middlesex       1.00      1.00      1.00         4
    Monmouth       0.67      1.00      0.80         2
      Morris       1.00      1.00      1.00         2
       Ocean       0.67      0.67      0.67         3
     Passaic       1.00      1.00      1.00         2
       Salem       1.00      1.00      1.00         1
      Sussex       1.00    

# Hyperparameter Tuning

In [36]:
# Get randomforest params
rfm.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [40]:
# Create the GridSearchCV model
param_grid = {
    'n_estimators': [200, 600, 1200, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}
grid = GridSearchCV(rfm, param_grid, cv=3, verbose=0)

In [41]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(n_estimators=200),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 600, 1200, 100]})

In [42]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 600}
0.9430894308943089


In [45]:
rfm = RandomForestClassifier(n_estimators=600,criterion= 'gini', max_features= 'auto' )
rfm.fit(X_train, y_train)

RandomForestClassifier(n_estimators=600)

In [46]:
predictions = rfm.predict(X_test)
tuned_train_accuracy = round(rfm.score(X_train, y_train)*100,3)
tuned_test_accuracy = round(rfm.score(X_test, y_test)*100,3)
print(f"Training Data Score: {rfm.score(X_train, y_train)}")
print(f"Testing Data Score: {rfm.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9512195121951219


In [47]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    Atlantic       1.00      1.00      1.00         2
      Bergen       1.00      1.00      1.00         1
  Burlington       1.00      1.00      1.00         2
      Camden       1.00      0.50      0.67         2
    Cape May       1.00      1.00      1.00         1
  Cumberland       1.00      1.00      1.00         4
       Essex       1.00      1.00      1.00         1
  Gloucester       1.00      1.00      1.00         1
      Hudson       1.00      1.00      1.00         3
   Hunterdon       1.00      1.00      1.00         1
      Mercer       1.00      1.00      1.00         2
   Middlesex       1.00      1.00      1.00         4
    Monmouth       1.00      1.00      1.00         2
      Morris       1.00      1.00      1.00         2
       Ocean       0.67      0.67      0.67         3
     Passaic       1.00      1.00      1.00         2
       Salem       1.00      1.00      1.00         1
      Sussex       1.00    

In [48]:
#  save the model
filename = '../Models/NJ_rfm.sav'
joblib.dump(grid, filename)

['../Models/NJ_rfm.sav']

In [49]:
predictions = rfm.predict(X_test)
print(f"Predicted Labels: {predictions[:5]}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted Labels: ['Burlington' 'Cumberland' 'Union' 'Camden' 'Morris']
Actual Labels: ['Burlington', 'Cumberland', 'Union', 'Camden', 'Morris']


# Model evaluation

In [50]:
evaluations = {'': ['Base Train Model', 'Base Test Model', 'Tuned Train Model', 'Tuned Test Model'],
               'RF Accuracy': [f"{base_train_accuracy}%", f"{base_test_accuracy}%", f"{tuned_train_accuracy}%", f"{tuned_test_accuracy}%"]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index('')

evaluations_df.to_csv('../Resources/RFM_eval.csv')
evaluations_df

Unnamed: 0,RF Accuracy
,
Base Train Model,100.0%
Base Test Model,92.683%
Tuned Train Model,100.0%
Tuned Test Model,95.122%
