## UTMCC DataViz Module 20 Team Project. -- Supervised Machine Learning Modeling.  
### Food Deserts in the Austin, Texas Metro Area

In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from scipy import stats
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations
# Import checkpoint dependencies
import os
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [3]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pathlib import Path
from collections import Counter
import datetime as dt
import calendar
import random
from path import Path

%matplotlib inline
from matplotlib import style
style.use('fivethirtyeight')
import matplotlib.pyplot as plt
import statistics
from flask import Flask, jsonify
from mpl_toolkits.mplot3d import Axes3D
# from hvplot import hvPlot
# import hvplot.pandas
# import plotly.express as px

# Python SQL toolkit and Object Relational Mapper
import sqlite3
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from sqlalchemy import extract

import warnings
warnings.filterwarnings('ignore')

### Import and read csv files

In [6]:
# Import and read csv files
food_atlas_df = pd.read_csv("resources/food_access_research_atlas.csv")
# food_var_df = pd.read_csv("resources/food_access_variable_lookup.csv")

In [7]:
food_atlas_df.head()

Unnamed: 0,CensusTract,State,County,Urban,POP2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
0,1001020100,Alabama,Autauga,1,1912,693,0,0,0.0,0,...,221,1622,217,14,0,14,45,44,26,112
1,1001020200,Alabama,Autauga,1,2170,743,0,181,0.08341,0,...,214,888,1217,5,0,5,55,75,87,202
2,1001020300,Alabama,Autauga,1,3373,1256,0,0,0.0,0,...,439,2576,647,17,5,11,117,87,108,120
3,1001020400,Alabama,Autauga,1,4386,1722,0,0,0.0,0,...,904,4086,193,18,4,11,74,85,19,82
4,1001020500,Alabama,Autauga,1,10766,4082,0,181,0.016812,0,...,1126,8666,1437,296,9,48,310,355,198,488


## Preprocessing the data

In [8]:
food_atlas_df.dtypes

CensusTract       int64
State            object
County           object
Urban             int64
POP2010           int64
                  ...  
TractAIAN         int64
TractOMultir      int64
TractHispanic     int64
TractHUNV         int64
TractSNAP         int64
Length: 147, dtype: object

In [9]:
# Create new df keeping only Texas
food_texas_df = food_atlas_df[(food_atlas_df["State"]=="Texas")]
food_texas_df

Unnamed: 0,CensusTract,State,County,Urban,POP2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
61468,48001950100,Texas,Anderson,0,4685,1874,0,49,0.010459,0,...,912,4012,452,22,0,13,186,236,125,218
61469,48001950401,Texas,Anderson,0,5422,77,1,5219,0.962560,0,...,24,1825,2266,21,0,5,1305,1324,5,0
61470,48001950402,Texas,Anderson,0,7535,83,1,7315,0.970803,0,...,134,2591,3248,13,0,10,1673,1737,0,0
61471,48001950500,Texas,Anderson,1,4377,1604,0,86,0.019648,1,...,627,2737,800,19,2,20,799,1389,66,288
61472,48001950600,Texas,Anderson,1,6405,2253,0,96,0.014988,1,...,791,3831,1674,68,4,43,785,1253,194,412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66721,48505950400,Texas,Zapata,0,5610,1741,0,0,0.000000,1,...,555,5264,5,12,0,30,299,5176,58,539
66722,48507950100,Texas,Zavala,0,1232,388,0,0,0.000000,1,...,125,1056,4,0,0,2,170,1104,55,164
66723,48507950200,Texas,Zavala,0,1880,590,0,0,0.000000,1,...,203,1612,21,2,9,13,223,1635,59,155
66724,48507950301,Texas,Zavala,0,2254,628,0,348,0.154392,0,...,225,2096,14,1,0,3,140,2109,160,384


In [10]:
# Create new df keeping only select Counties in the Austin Metro Area
counties = ["Bastrop", "Caldwell", "Hays", "Travis", "Williamson"]
food_austin_df = food_texas_df.loc[food_texas_df["County"].isin(counties)]
food_austin_df

Unnamed: 0,CensusTract,State,County,Urban,POP2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
61529,48021950100,Texas,Bastrop,0,8608,3063,0,0,0.000000,0,...,1028,6507,383,53,4,65,1596,2660,51,351
61530,48021950200,Texas,Bastrop,1,7955,2625,0,186,0.023381,1,...,891,4521,1328,32,14,76,1984,3674,148,444
61531,48021950300,Texas,Bastrop,0,12927,4734,0,71,0.005492,0,...,1468,9971,1213,156,18,88,1481,3012,95,568
61532,48021950400,Texas,Bastrop,1,7984,3127,0,456,0.057114,0,...,1199,6312,800,78,3,62,729,1711,255,261
61533,48021950501,Texas,Bastrop,0,8008,2168,0,1519,0.189685,0,...,566,5209,849,37,12,123,1778,3253,128,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66673,48491021507,Texas,Williamson,1,9196,3293,0,1,0.000109,0,...,444,6315,1227,421,15,44,1174,2710,179,277
66674,48491021508,Texas,Williamson,1,6531,2402,0,0,0.000000,0,...,301,4744,761,303,7,28,688,1627,23,133
66675,48491021601,Texas,Williamson,0,3278,1120,0,0,0.000000,1,...,236,2569,102,19,6,15,567,954,5,26
66676,48491021602,Texas,Williamson,0,2857,959,0,0,0.000000,0,...,332,2464,47,19,0,16,311,906,12,106


In [11]:
# Begin Income column creation (target variable), MFI = MedianFamilyIncome']
food_austinMFI_df = food_austin_df
food_austinMFI_df['MedianFamilyIncome']

61529    70516
61530    67792
61531    75462
61532    62375
61533    65079
         ...  
66673    66820
66674    83241
66675    57389
66676    63125
66677    58902
Name: MedianFamilyIncome, Length: 350, dtype: int64

In [12]:
conditions = [(food_austinMFI_df['MedianFamilyIncome'] <= 15000), 
              (food_austinMFI_df['MedianFamilyIncome'] > 15000)]
# values = ['Impovrished', 'Not Impovrished']
values = [1, 0]
food_austinMFI_df["Income"] = np.select(conditions, values)
food_austinMFI_df.head()

Unnamed: 0,CensusTract,State,County,Urban,POP2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP,Income
61529,48021950100,Texas,Bastrop,0,8608,3063,0,0,0.0,0,...,6507,383,53,4,65,1596,2660,51,351,0
61530,48021950200,Texas,Bastrop,1,7955,2625,0,186,0.023381,1,...,4521,1328,32,14,76,1984,3674,148,444,0
61531,48021950300,Texas,Bastrop,0,12927,4734,0,71,0.005492,0,...,9971,1213,156,18,88,1481,3012,95,568,0
61532,48021950400,Texas,Bastrop,1,7984,3127,0,456,0.057114,0,...,6312,800,78,3,62,729,1711,255,261,0
61533,48021950501,Texas,Bastrop,0,8008,2168,0,1519,0.189685,0,...,5209,849,37,12,123,1778,3253,128,285,0


In [13]:
food_austinMFI_df.dtypes

CensusTract       int64
State            object
County           object
Urban             int64
POP2010           int64
                  ...  
TractOMultir      int64
TractHispanic     int64
TractHUNV         int64
TractSNAP         int64
Income            int32
Length: 148, dtype: object

In [14]:
# export to csv 
food_austinMFI_df.to_csv("food_desert_austinMFI.csv")

In [15]:
# Create new df with select feature columns
food_desert_Austin_df = food_austinMFI_df[["Income","lasnaphalf", "lasnaphalfshare", "lahunvhalf", "lahunvhalfshare", "lasnap1", "lasnap1share", "lahunv1", "lahunv1share", "lasnap10", "lasnap10share", "lahunv10", "lahunv10share", "lasnap20", "lasnap20share",  "lahunv20", "lahunv20share"]]
food_desert_Austin_df

Unnamed: 0,Income,lasnaphalf,lasnaphalfshare,lahunvhalf,lahunvhalfshare,lasnap1,lasnap1share,lahunv1,lahunv1share,lasnap10,lasnap10share,lahunv10,lahunv10share,lasnap20,lasnap20share,lahunv20,lahunv20share
61529,0,325.133846,0.106149,48.548857,0.015850,319.907937,0.104443,47.889963,0.015635,5.159378,0.001684,0.922672,0.000301,0.0,0.0,0.0,0.0
61530,0,368.584869,0.140413,120.671667,0.045970,233.501673,0.088953,81.658810,0.031108,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
61531,0,590.025451,0.124636,100.788957,0.021290,549.545092,0.116085,84.946165,0.017944,48.577263,0.010261,8.752010,0.001849,0.0,0.0,0.0,0.0
61532,0,241.111748,0.077106,191.810180,0.061340,188.984455,0.060436,114.453080,0.036602,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
61533,0,307.709167,0.141932,144.460704,0.066633,307.703411,0.141930,144.458152,0.066632,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66673,0,253.269297,0.076911,147.346423,0.044745,200.157469,0.060783,94.092539,0.028574,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
66674,0,65.461476,0.027253,3.168106,0.001319,37.486129,0.015606,0.346386,0.000144,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
66675,0,24.931507,0.022260,6.618853,0.005910,24.931507,0.022260,6.618853,0.005910,15.243361,0.013610,2.243933,0.002004,0.0,0.0,0.0,0.0
66676,0,104.153688,0.108607,13.420352,0.013994,104.153688,0.108607,13.420352,0.013994,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [16]:
# export to csv 
# food_desert_Austin_df.to_csv("food_desert_Austin.csv")

## Machine Learning Model, using Austin-Metro only data.

In [17]:
# Split the preprocessed dataframe into our features and target arrays
#  Remove Income target from features data
y = food_desert_Austin_df.Income.values
X = food_desert_Austin_df.drop(columns="Income").values
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [18]:
# Create a StandardScaler instances
scaler = StandardScaler()
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)
# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# Create a Random Forest classifier, 64 estimators.
rf_model = RandomForestClassifier(n_estimators=64, random_state=78)
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)
# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.977


In [20]:
# Logistic Regression Model Accuracy
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)
log_classifier.fit(X_train,y_train)
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.977


In [21]:
# # Create new df with select feature columns for all of U.S. (not only for Texas), with 72,864 rows
# # Use this df as ML Training set
# food_desertUS_df = food_atlasMFI_df[["Income","lasnaphalf", "lasnaphalfshare", "lahunvhalf", "lahunvhalfshare", "lasnap1", "lasnap1share", "lahunv1", "lahunv1share", "lasnap10", "lasnap10share", "lahunv10", "lahunv10share", "lasnap20", "lasnap20share",  "lahunv20", "lahunv20share"]]
# food_desertUS_df

In [22]:
# # export to csv
# food_desertUS_df.to_csv("food_desertUS.csv")

In [23]:
# # Split the preprocessed dataframe into our features and target arrays
# #  Remove Income target from features data
# y = food_desertUS_df.Income.values
# X = food_desertUS_df.drop(columns="Income").values
# # Split the preprocessed data into a training and testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [24]:
# # Create StandardScaler instances
# scaler = StandardScaler()
# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)
# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

## Supervised ML Models
### Easy Ensemble AdaBoost Classifier

In [25]:
# Train the EasyEnsembleClassifier.  EasyEnsembleClassifier(n_estimators=100, random_state=1)
adaboost = AdaBoostClassifier(n_estimators=1000, learning_rate=1,random_state=1)
model = adaboost.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [26]:
# Calculate the balanced accuracy score.
balanced_accuracy_score(y_test, y_pred)

0.5

In [27]:
# Calculate the accuracy score - for Reference. 
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.9772727272727273

In [28]:
# Display the confusion matrix.
cm = confusion_matrix(y_test, y_pred)
# cm

In [29]:
# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# print("Confusion Matrix")
# cm_df

In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        86
           1       0.00      0.00      0.00         2

    accuracy                           0.98        88
   macro avg       0.49      0.50      0.49        88
weighted avg       0.96      0.98      0.97        88



In [31]:
# # Print the imbalanced classification report
# print("Imbalanced Classification Report")
# print(classification_report_imbalanced(y_test, y_pred))

In [32]:
print("Easy Ensemble AdaBoost Classifier")
print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}")

print(f"Confusion Matrix: ")
display(cm_df)

print("Imbalanced Classification Report: ")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble AdaBoost Classifier
Balanced Accuracy Score: 0.5
Confusion Matrix: 


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,86,0
Actual 1,2,0


Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00        86
          1       0.00      0.00      1.00      0.00      0.00      0.00         2

avg / total       0.96      0.98      0.02      0.97      0.00      0.00        88



### Balanced Random Forest Classifier

In [33]:
# Resample the training data with the BalancedRandomForestClassifier. BalancedRandomForestClassifier(random_state=1)
rf_model = RandomForestClassifier(n_estimators=500, random_state=1) 

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [34]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test)

In [35]:
# Calculate the balanced accuracy score.
balanced_accuracy_score(y_test, predictions)

0.5

In [36]:
# Display the confusion matrix.
cm = confusion_matrix(y_test, predictions)
# cm

In [37]:
# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,86,0
Actual 1,2,0


In [38]:
print("Balanced Random Forest Classifier")
print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, predictions)}")

print(f"Confusion Matrix: ")
display(cm_df)

print("Imbalanced Classification Report: ")
print(classification_report_imbalanced(y_test, predictions))

Balanced Random Forest Classifier
Balanced Accuracy Score: 0.5
Confusion Matrix: 


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,86,0
Actual 1,2,0


Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00        86
          1       0.00      0.00      1.00      0.00      0.00      0.00         2

avg / total       0.96      0.98      0.02      0.97      0.00      0.00        88



In [41]:
# Calculate feature importance in the Random Forest model.
importances = [rf_model.feature_importances_]
importances
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [40]:
# List the features sorted in descending order by feature importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances = pd.DataFrame(rf_model.feature_importances_, index=X_train.columns, columns=['Importances']).sort_values('Importances', ascending=False)
importances.head(15)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'