In [26]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

In [27]:
dat=gpd.read_file("./net_work_analysis/tracts_dist.geojson")

In [28]:
BQ=dat[dat.TOT_POP>0]
cols=['MED_INC','prt_whi','prt_asi','prt_bla','prt_veh','pop_den','cnt','fast_food','market','convenience','supermarket','FoodDesertOrNot']
model=BQ[cols]

Use GridSearchCV to improve the parameter of the model

In [29]:
from imblearn.over_sampling import SMOTE
sm=SMOTE(random_state=27,ratio=1.0)
y=model.FoodDesertOrNot
x=model.drop("FoodDesertOrNot",axis=1)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
x_train,y_train=sm.fit_sample(x_train,y_train)

pipe1=make_pipeline(StandardScaler(),RandomForestClassifier(random_state=42))

In [30]:
# Make the grid of parameters to search
# NOTE: you must prepend the name of the pipeline step
model_name = "randomforestclassifier"
param_grid = {
    f"{model_name}__n_estimators": [5, 10, 15, 20, 30, 50, 100],
    f"{model_name}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}

param_grid

{'randomforestclassifier__n_estimators': [5, 10, 15, 20, 30, 50, 100],
 'randomforestclassifier__max_depth': [2, 5, 7, 9, 13, 21, 33, 51]}

In [31]:
grid=GridSearchCV(pipe1,param_grid,cv=10)
grid.fit(x_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                  

In [33]:
grid.best_params_

{'randomforestclassifier__max_depth': 21,
 'randomforestclassifier__n_estimators': 50}

1. RandomForestClassifier with class_weight= 'balanced'

In [38]:
#make pipe lines
from sklearn.pipeline import make_pipeline

y=model.FoodDesertOrNot
x=model.drop("FoodDesertOrNot",axis=1)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

pipe=make_pipeline(StandardScaler(),RandomForestClassifier(n_estimators=100,random_state=42,class_weight='balanced'))
pipe.fit(x_train,y_train)

y_pred=pipe.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.92      0.96       411
           1       0.16      0.67      0.26         9

    accuracy                           0.92       420
   macro avg       0.58      0.79      0.61       420
weighted avg       0.97      0.92      0.94       420



In [39]:
confusion_matrix(y_test, y_pred)

array([[379,   3],
       [ 32,   6]], dtype=int64)

2. Generate synthetic samples and use Random Forest

In [35]:
from imblearn.over_sampling import SMOTE
sm=SMOTE(random_state=27,ratio=1.0)
y=model.FoodDesertOrNot
x=model.drop("FoodDesertOrNot",axis=1)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
x_train,y_train=sm.fit_sample(x_train,y_train)


pipe1=make_pipeline(StandardScaler(),RandomForestClassifier(n_estimators=50,max_depth=21,random_state=42))
pipe1.fit(x_train,y_train)

y_pred=pipe1.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       382
           1       0.50      0.50      0.50        38

    accuracy                           0.91       420
   macro avg       0.73      0.73      0.73       420
weighted avg       0.91      0.91      0.91       420



In [36]:
from sklearn.metrics import confusion_matrix

In [37]:
confusion_matrix(y_test, y_pred)

array([[363,  19],
       [ 19,  19]], dtype=int64)