## UTMCC DataViz Module 20 Team Project
### Food Deserts in the Austin, Texas Metro Area

In [71]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from scipy import stats
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations
# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

In [82]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pathlib import Path
from collections import Counter
import datetime as dt
import calendar
import random
from path import Path

%matplotlib inline
from matplotlib import style
style.use('fivethirtyeight')
import matplotlib.pyplot as plt
import statistics
from flask import Flask, jsonify
from mpl_toolkits.mplot3d import Axes3D
from hvplot import hvPlot
import hvplot.pandas
import plotly.express as px

# Python SQL toolkit and Object Relational Mapper
import sqlite3
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from sqlalchemy import extract

import warnings
warnings.filterwarnings('ignore')

In [52]:
# Import and read csv files
food_atlas_df = pd.read_csv("resources/food_access_research_atlas.csv")
# food_var_df = pd.read_csv("resources/food_access_variable_lookup.csv")

In [53]:
food_atlas_df.head()

Unnamed: 0,CensusTract,State,County,Urban,POP2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
0,1001020100,Alabama,Autauga,1,1912,693,0,0,0.0,0,...,221,1622,217,14,0,14,45,44,26,112
1,1001020200,Alabama,Autauga,1,2170,743,0,181,0.08341,0,...,214,888,1217,5,0,5,55,75,87,202
2,1001020300,Alabama,Autauga,1,3373,1256,0,0,0.0,0,...,439,2576,647,17,5,11,117,87,108,120
3,1001020400,Alabama,Autauga,1,4386,1722,0,0,0.0,0,...,904,4086,193,18,4,11,74,85,19,82
4,1001020500,Alabama,Autauga,1,10766,4082,0,181,0.016812,0,...,1126,8666,1437,296,9,48,310,355,198,488


In [7]:
# food_var_df.head()

## Preprocessing the data

In [29]:
food_atlas_df.dtypes

CensusTract       int64
State            object
County           object
Urban             int64
POP2010           int64
                  ...  
TractAIAN         int64
TractOMultir      int64
TractHispanic     int64
TractHUNV         int64
TractSNAP         int64
Length: 147, dtype: object

In [9]:
# Create new df keeping only Texas
food_texas_df = food_atlas_df[(food_atlas_df["State"]=="Texas")]
food_texas_df

Unnamed: 0,CensusTract,State,County,Urban,POP2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
61468,48001950100,Texas,Anderson,0,4685,1874,0,49,0.010459,0,...,912,4012,452,22,0,13,186,236,125,218
61469,48001950401,Texas,Anderson,0,5422,77,1,5219,0.962560,0,...,24,1825,2266,21,0,5,1305,1324,5,0
61470,48001950402,Texas,Anderson,0,7535,83,1,7315,0.970803,0,...,134,2591,3248,13,0,10,1673,1737,0,0
61471,48001950500,Texas,Anderson,1,4377,1604,0,86,0.019648,1,...,627,2737,800,19,2,20,799,1389,66,288
61472,48001950600,Texas,Anderson,1,6405,2253,0,96,0.014988,1,...,791,3831,1674,68,4,43,785,1253,194,412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66721,48505950400,Texas,Zapata,0,5610,1741,0,0,0.000000,1,...,555,5264,5,12,0,30,299,5176,58,539
66722,48507950100,Texas,Zavala,0,1232,388,0,0,0.000000,1,...,125,1056,4,0,0,2,170,1104,55,164
66723,48507950200,Texas,Zavala,0,1880,590,0,0,0.000000,1,...,203,1612,21,2,9,13,223,1635,59,155
66724,48507950301,Texas,Zavala,0,2254,628,0,348,0.154392,0,...,225,2096,14,1,0,3,140,2109,160,384


In [10]:
# Create new df keeping only select Counties in the Austin Metro Area
counties = ["Bastrop", "Caldwell", "Hays", "Travis", "Williamson"]
food_austin_df = food_texas_df.loc[food_texas_df["County"].isin(counties)]
food_austin_df

Unnamed: 0,CensusTract,State,County,Urban,POP2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
61529,48021950100,Texas,Bastrop,0,8608,3063,0,0,0.000000,0,...,1028,6507,383,53,4,65,1596,2660,51,351
61530,48021950200,Texas,Bastrop,1,7955,2625,0,186,0.023381,1,...,891,4521,1328,32,14,76,1984,3674,148,444
61531,48021950300,Texas,Bastrop,0,12927,4734,0,71,0.005492,0,...,1468,9971,1213,156,18,88,1481,3012,95,568
61532,48021950400,Texas,Bastrop,1,7984,3127,0,456,0.057114,0,...,1199,6312,800,78,3,62,729,1711,255,261
61533,48021950501,Texas,Bastrop,0,8008,2168,0,1519,0.189685,0,...,566,5209,849,37,12,123,1778,3253,128,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66673,48491021507,Texas,Williamson,1,9196,3293,0,1,0.000109,0,...,444,6315,1227,421,15,44,1174,2710,179,277
66674,48491021508,Texas,Williamson,1,6531,2402,0,0,0.000000,0,...,301,4744,761,303,7,28,688,1627,23,133
66675,48491021601,Texas,Williamson,0,3278,1120,0,0,0.000000,1,...,236,2569,102,19,6,15,567,954,5,26
66676,48491021602,Texas,Williamson,0,2857,959,0,0,0.000000,0,...,332,2464,47,19,0,16,311,906,12,106


In [28]:
# export to csv for food_Austin_censustract to csv
food_austin_df.to_csv("food_desert_austin_censustract.csv")


In [11]:
# Create new df with select feature columns
food_desert_df = food_austin_df[["LILATracts_1And10", "lasnaphalf", "lasnaphalfshare", "lahunvhalf", "lahunvhalfshare", "lasnap1", "lasnap1share", "lahunv1", "lahunv1share", "lasnap10", "lasnap10share", "lahunv10", "lahunv10share", "lasnap20", "lasnap20share",  "lahunv20", "lahunv20share"]]
food_desert_df

Unnamed: 0,LILATracts_1And10,lasnaphalf,lasnaphalfshare,lahunvhalf,lahunvhalfshare,lasnap1,lasnap1share,lahunv1,lahunv1share,lasnap10,lasnap10share,lahunv10,lahunv10share,lasnap20,lasnap20share,lahunv20,lahunv20share
61529,0,325.133846,0.106149,48.548857,0.015850,319.907937,0.104443,47.889963,0.015635,5.159378,0.001684,0.922672,0.000301,0.0,0.0,0.0,0.0
61530,1,368.584869,0.140413,120.671667,0.045970,233.501673,0.088953,81.658810,0.031108,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
61531,0,590.025451,0.124636,100.788957,0.021290,549.545092,0.116085,84.946165,0.017944,48.577263,0.010261,8.752010,0.001849,0.0,0.0,0.0,0.0
61532,0,241.111748,0.077106,191.810180,0.061340,188.984455,0.060436,114.453080,0.036602,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
61533,0,307.709167,0.141932,144.460704,0.066633,307.703411,0.141930,144.458152,0.066632,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66673,0,253.269297,0.076911,147.346423,0.044745,200.157469,0.060783,94.092539,0.028574,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
66674,0,65.461476,0.027253,3.168106,0.001319,37.486129,0.015606,0.346386,0.000144,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
66675,1,24.931507,0.022260,6.618853,0.005910,24.931507,0.022260,6.618853,0.005910,15.243361,0.013610,2.243933,0.002004,0.0,0.0,0.0,0.0
66676,0,104.153688,0.108607,13.420352,0.013994,104.153688,0.108607,13.420352,0.013994,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [12]:
# print(food_desert_df)

## SQLite database

In [13]:
# SQLAlchemy Create Engine, to access and query the SQLite database file
engine = create_engine("sqlite:///food_desert.sqlite", echo=True)
sqlite_connection = engine.connect()

2021-01-07 13:28:46,131 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2021-01-07 13:28:46,135 INFO sqlalchemy.engine.base.Engine ()
2021-01-07 13:28:46,137 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2021-01-07 13:28:46,139 INFO sqlalchemy.engine.base.Engine ()


In [14]:
# to reflect the database into a new model
#Base = automap_base()
# to reflect the db tables 
#Base.prepare(engine, reflect=True)

### Create the database

In [30]:
# Create the database
conn = sqlite3.connect("DB1.sqlite")
cur = conn.cursor()

In [31]:
cur.execute('CREATE TABLE Food_desert_Austin (LILATracts_1And10, lasnaphalf, lasnaphalfshare, lahunvhalf, lahunvhalfshare, lasnap1, lasnap1share, lahunv1, lahunv1share, lasnap10, lasnap10share, lahunv10, lahunv10share, lasnap20, lasnap20share,  lahunv20, lahunv20share)')
conn.commit()

OperationalError: table Food_desert_Austin already exists

In [17]:
# Create our session (link) from Python to the DB
session = Session(engine)

In [18]:
# Data from dataframe to SQLite
food_desert_df.to_sql('Food_desert_Austin', conn, if_exists='replace')

In [19]:
cur.execute('''SELECT * FROM Food_desert_Austin''')

<sqlite3.Cursor at 0x211801d2d50>

In [32]:
for row in cur.fetchall():
    print(row)

In [21]:
# From SQLite to DataFrame
# food_desert2_df = pd.DataFrame(cur.fetchall(), columns=['LILATracts_1And10', 'lasnaphalf', 'lasnaphalfshare', 'lahunvhalf', 'lahunvhalfshare', 'lasnap1', 'lasnap1share', 'lahunv1', 'lahunv1share', 'lasnap10', 'lasnap10share', 'lahunv10', 'lahunv10share', 'lasnap20', 'lasnap20share',  'lahunv20', 'lahunv20share'])
# print(food_desert2_df)

In [33]:
# Creating a new dataframe from the SQLite database table.
food_desert_Austin_df = pd.read_sql_query("SELECT * FROM Food_desert_Austin", conn)

In [34]:
food_desert_Austin_df

Unnamed: 0,index,LILATracts_1And10,lasnaphalf,lasnaphalfshare,lahunvhalf,lahunvhalfshare,lasnap1,lasnap1share,lahunv1,lahunv1share,lasnap10,lasnap10share,lahunv10,lahunv10share,lasnap20,lasnap20share,lahunv20,lahunv20share
0,61529,0,325.133846,0.106149,48.548857,0.015850,319.907937,0.104443,47.889963,0.015635,5.159378,0.001684,0.922672,0.000301,0.0,0.0,0.0,0.0
1,61530,1,368.584869,0.140413,120.671667,0.045970,233.501673,0.088953,81.658810,0.031108,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
2,61531,0,590.025451,0.124636,100.788957,0.021290,549.545092,0.116085,84.946165,0.017944,48.577263,0.010261,8.752010,0.001849,0.0,0.0,0.0,0.0
3,61532,0,241.111748,0.077106,191.810180,0.061340,188.984455,0.060436,114.453080,0.036602,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
4,61533,0,307.709167,0.141932,144.460704,0.066633,307.703411,0.141930,144.458152,0.066632,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,66673,0,253.269297,0.076911,147.346423,0.044745,200.157469,0.060783,94.092539,0.028574,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
346,66674,0,65.461476,0.027253,3.168106,0.001319,37.486129,0.015606,0.346386,0.000144,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
347,66675,1,24.931507,0.022260,6.618853,0.005910,24.931507,0.022260,6.618853,0.005910,15.243361,0.013610,2.243933,0.002004,0.0,0.0,0.0,0.0
348,66676,0,104.153688,0.108607,13.420352,0.013994,104.153688,0.108607,13.420352,0.013994,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [36]:
# Close the database connection
sqlite_connection.close()

In [35]:
# Export food_desert_Austin_df to csv
# food_desert_Austin_df.to_csv(food_desert_Austin.csv)

## Machine Learning Model, using Austin-Metro only data.

In [102]:
# Split the preprocessed dataframe from the SQLite database into our features and target arrays
#  Remove LILATracts_1And10 target from features data
y = food_desert_Austin_df.LILATracts_1And10.values
X = food_desert_Austin_df.drop(columns="LILATracts_1And10").values
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [38]:
# Create a StandardScaler instances
scaler = StandardScaler()
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)
# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [39]:
# Define the model - deep neural net. the number of input features and the hidden nodes for each layer.
# A good rule of thumb for a basic neural network is to have two to three times the amount of neurons in 
# the hidden layer as the number of inputs.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 30
hidden_nodes_layer2 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                540       
_________________________________________________________________
dense_1 (Dense)              (None, 10)                310       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 861
Trainable params: 861
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Compile and train the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [41]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3/3 - 0s - loss: 0.2856 - accuracy: 0.8864
Loss: 0.2856280207633972, Accuracy: 0.8863636255264282


### Create a callback to save the model's weights, and
###   Save and export the results to an HDF5 file, Hierarchical Data Format

In [25]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

# Export our model to HDF5 file
nn.save("food_desert_austin.h5")

## Comparisons to other ML models

In [26]:
# Create a Random Forest classifier, 64 estimators.
rf_model = RandomForestClassifier(n_estimators=64, random_state=78)
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)
# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.875


In [27]:
# Logistic Regression Model Accuracy
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)
log_classifier.fit(X_train,y_train)
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.841


## NEW: 
### Setup ML Training on full U.S. dataset

In [42]:
# Create new df with select feature columns for all of U.S. (not only for Texas), with 72,864 rows
# Use this df as ML Training set
food_desertUS_df = food_atlas_df[["LILATracts_1And10", "lasnaphalf", "lasnaphalfshare", "lahunvhalf", "lahunvhalfshare", "lasnap1", "lasnap1share", "lahunv1", "lahunv1share", "lasnap10", "lasnap10share", "lahunv10", "lahunv10share", "lasnap20", "lasnap20share",  "lahunv20", "lahunv20share"]]
food_desertUS_df

Unnamed: 0,LILATracts_1And10,lasnaphalf,lasnaphalfshare,lahunvhalf,lahunvhalfshare,lasnap1,lasnap1share,lahunv1,lahunv1share,lasnap10,lasnap10share,lahunv10,lahunv10share,lasnap20,lasnap20share,lahunv20,lahunv20share
0,0,101.877398,0.147009,21.556248,0.031106,79.546843,0.114786,9.772855,0.014102,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0,127.367745,0.171424,58.628965,0.078908,41.700964,0.056125,21.638336,0.029123,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0,100.178387,0.079760,49.139711,0.039124,50.263422,0.040019,13.305612,0.010594,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0,67.713198,0.039322,17.525112,0.010177,24.401748,0.014171,8.782160,0.005100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0,339.070792,0.083065,129.596661,0.031748,119.946475,0.029384,44.657012,0.010940,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72859,0,117.960857,0.089568,56.892705,0.043199,93.350095,0.070881,36.650319,0.027829,35.810637,0.027191,13.101000,0.009948,29.946541,0.022738,12.621600,0.009584
72860,0,33.803746,0.029293,42.736232,0.037033,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
72861,0,35.425056,0.034696,28.268011,0.027687,6.895328,0.006754,5.380534,0.005270,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
72862,0,38.522234,0.029139,89.225175,0.067493,31.732410,0.024003,80.079983,0.060575,13.943352,0.010547,37.697144,0.028515,2.861406,0.002164,7.863209,0.005948


In [43]:
# export to csv for 
food_desertUS_df.to_csv("food_desertUS.csv")

In [None]:
# ML Model Setup

In [44]:
# Split the preprocessed dataframe from the database into our features and target arrays
#  Remove LILATracts_1And10 target from features data
y = food_desertUS_df.LILATracts_1And10.values
X = food_desertUS_df.drop(columns="LILATracts_1And10").values
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [45]:
# Create a StandardScaler instances
scaler = StandardScaler()
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)
# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [46]:
# Define the model - deep neural net. the number of input features and the hidden nodes for each layer.
# A good rule of thumb for a basic neural network is to have two to three times the amount of neurons in 
# the hidden layer as the number of inputs.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 30
hidden_nodes_layer2 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 30)                510       
_________________________________________________________________
dense_4 (Dense)              (None, 10)                310       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 831
Trainable params: 831
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Compile and train the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [48]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

570/570 - 0s - loss: 0.1935 - accuracy: 0.9203
Loss: 0.19354204833507538, Accuracy: 0.9202898740768433


### Hierarchical Data Format file, HDF5. Using the Keras Sequential model's save method to export the entire model.

In [49]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

# Export our model to HDF5 file
nn.save("food_desertUS.h5")

## Supervised ML Models
### Easy Ensemble AdaBoost Classifier

In [54]:
# Train the EasyEnsembleClassifier.  EasyEnsembleClassifier(n_estimators=100, random_state=1)
adaboost = AdaBoostClassifier(n_estimators=1000, learning_rate=1,random_state=1)
model = adaboost.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [55]:
# Calculate the balanced accuracy score.
balanced_accuracy_score(y_test, y_pred)

0.7223996302146113

In [56]:
# Calculate the accuracy score - for Reference. 
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.9062911725955204

In [57]:
# Display the confusion matrix.
cm = confusion_matrix(y_test, y_pred)
cm

array([[15409,   496],
       [ 1211,  1100]], dtype=int64)

In [58]:
# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
print("Confusion Matrix")
cm_df

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15409,496
Actual 1,1211,1100


In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     15905
           1       0.69      0.48      0.56      2311

    accuracy                           0.91     18216
   macro avg       0.81      0.72      0.76     18216
weighted avg       0.90      0.91      0.90     18216



In [60]:
# Print the imbalanced classification report
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.97      0.48      0.95      0.68      0.48     15905
          1       0.69      0.48      0.97      0.56      0.68      0.44      2311

avg / total       0.90      0.91      0.54      0.90      0.68      0.48     18216



In [61]:
print("Easy Ensemble AdaBoost Classifier")
print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_pred)}")

print(f"Confusion Matrix: ")
display(cm_df)

print("Imbalanced Classification Report: ")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble AdaBoost Classifier
Balanced Accuracy Score: 0.7223996302146113
Confusion Matrix: 


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15409,496
Actual 1,1211,1100


Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.97      0.48      0.95      0.68      0.48     15905
          1       0.69      0.48      0.97      0.56      0.68      0.44      2311

avg / total       0.90      0.91      0.54      0.90      0.68      0.48     18216



### Balanced Random Forest Classifier

In [62]:
# Resample the training data with the BalancedRandomForestClassifier. BalancedRandomForestClassifier(random_state=1)
rf_model = RandomForestClassifier(n_estimators=500, random_state=1) 

# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [63]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test)

In [65]:
# Calculate the balanced accuracy score.
balanced_accuracy_score(y_test, predictions)

0.7650921450395584

In [66]:
# Display the confusion matrix.
cm = confusion_matrix(y_test, predictions)
cm

array([[15425,   480],
       [ 1016,  1295]], dtype=int64)

In [67]:
# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15425,480
Actual 1,1016,1295


In [68]:
print("Balanced Random Forest Classifier")
print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, predictions)}")

print(f"Confusion Matrix: ")
display(cm_df)

print("Imbalanced Classification Report: ")
print(classification_report_imbalanced(y_test, predictions))

Balanced Random Forest Classifier
Balanced Accuracy Score: 0.7650921450395584
Confusion Matrix: 


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15425,480
Actual 1,1016,1295


Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.97      0.56      0.95      0.74      0.57     15905
          1       0.73      0.56      0.97      0.63      0.74      0.52      2311

avg / total       0.91      0.92      0.61      0.91      0.74      0.56     18216



In [69]:
# Calculate feature importance in the Random Forest model.
importances = [rf_model.feature_importances_]
importances

[array([0.10992946, 0.17651169, 0.07687368, 0.10415869, 0.14577933,
        0.12945351, 0.08158382, 0.08668686, 0.02768087, 0.02474666,
        0.01392344, 0.01330562, 0.00262754, 0.00259767, 0.00208823,
        0.00205294])]

In [None]:
# List the features sorted in descending order by feature importance.
#sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
#importances = pd.DataFrame(rf_model.feature_importances_, index=X_train.columns, columns=['Importances']).sort_values('Importances', ascending=False)
#importances.head(15)

## Unsupervised ML Model
### Clustering

In [83]:
# Standardize the data with StandardScaler().
food_desertUS_scaled = StandardScaler().fit_transform(food_desertUS_df)
food_desertUS_scaled[0:5]

array([[-0.38120627, -0.22936622,  0.6216827 , -0.5688559 , -0.23832387,
         0.07343783,  0.97233821, -0.39222943, -0.14274953, -0.16540049,
        -0.16001909, -0.106915  , -0.10417744, -0.04867456, -0.05319576,
        -0.03048427, -0.03609774],
       [-0.38120627, -0.05054692,  0.88111632, -0.10733565,  0.56103532,
        -0.27103154,  0.14287336, -0.15060876,  0.27389754, -0.16540049,
        -0.16001909, -0.106915  , -0.10417744, -0.04867456, -0.05319576,
        -0.03048427, -0.03609774],
       [-0.38120627, -0.24128508, -0.09292675, -0.22546789, -0.10424193,
        -0.19309691, -0.08487134, -0.32029075, -0.24007193, -0.16540049,
        -0.16001909, -0.106915  , -0.10417744, -0.04867456, -0.05319576,
        -0.03048427, -0.03609774],
       [-0.38120627, -0.46903413, -0.52262568, -0.61903972, -0.58829122,
        -0.42848726, -0.45036164, -0.41240327, -0.39245635, -0.16540049,
        -0.16001909, -0.106915  , -0.10417744, -0.04867456, -0.05319576,
        -0.03048427

In [84]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
food_desertUS_pca = pca.fit_transform(food_desertUS_scaled)
food_desertUS_pca

array([[-0.19276156, -0.13698727, -0.55670197],
       [ 0.03380518, -0.41483848,  0.43571155],
       [-0.59941845,  0.28855718,  0.06431015],
       ...,
       [-1.17980865,  0.96835862,  0.13169114],
       [ 2.26821653,  1.81762241,  0.72511033],
       [-0.34451121,  0.02886395,  0.33930695]])

In [86]:
# Create a DataFrame with the three principal components.
food_desertUS_pca_df = pd.DataFrame(data=food_desertUS_pca, columns=["PC_1", "PC_2", "PC_3"], index=food_desertUS_df.index)
print(food_desertUS_pca_df.shape)
food_desertUS_pca_df.head(10)

(72864, 3)


Unnamed: 0,PC_1,PC_2,PC_3
0,-0.192762,-0.136987,-0.556702
1,0.033805,-0.414838,0.435712
2,-0.599418,0.288557,0.06431
3,-1.125297,0.901804,-0.150874
4,0.200256,-0.758917,0.261311
5,0.304109,-0.549474,0.10069
6,1.691509,-2.104328,0.314437
7,1.331953,-1.502026,0.627993
8,1.265558,-1.55584,-1.327826
9,4.467574,0.228183,-5.302108


In [87]:
# Fetch the explained variance
pca.explained_variance_ratio_

array([0.39161856, 0.24905182, 0.08560833])

### Clustering Crytocurrencies Using K-Means, Finding the Best Value for k Using the Elbow Curve

In [89]:
# Create an elbow curve to find the best value for K.
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(food_desertUS_pca_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [90]:
# Running K-Means with k = 6
# Initialize the K-Means model.
model = KMeans(n_clusters=6, random_state=0)

# Fit the model
model.fit(food_desertUS_pca_df)

# Predict clusters
predictions = model.predict(food_desertUS_pca_df)
print(predictions.shape)
predictions

(72864,)


array([3, 0, 3, ..., 3, 0, 3])

### Visualizing Results, 3D-Scatter with Clusters

In [None]:
# Create a new DataFrame including predicted clusters and features.


In [101]:
# Creating a 3D-Scatter with the PCA data and the clusters, LILATracts_1And10
# Plotting the clusters with three features
#fig = px.scatter_3d(
#    food_desertUS_pca_df, 
#    x="PC_1", 
#    y="PC_2", 
#    z="PC_3", 
#    color="PC_1", 
#    hover_name="PC_1", 
#    hover_data=["PC_2"], 
#    symbol="PC_3", 
#    width=800
#)
#fig.update_layout(legend=dict(x=0,y=1))
#fig.show()

## Using the Training results from the U.S. data on the Austin-Metro only data
### including the Keras ModelCheckpoint method

In [134]:
# Import checkpoint dependencies
# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

#### NN Machine Learning Model

In [135]:
# Split the preprocessed dataframe from the SQLite database into our features and target arrays
#  Remove LILATracts_1And10 target from features data
y = food_desert_Austin_df.LILATracts_1And10.values
X = food_desert_Austin_df.drop(columns="LILATracts_1And10").values
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [136]:
# Create a StandardScaler instances
scaler = StandardScaler()
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)
# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [137]:
# Define the model - deep neural net. the number of input features and the hidden nodes for each layer.
# A good rule of thumb for a basic neural network is to have two to three times the amount of neurons in 
# the hidden layer as the number of inputs.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 30
hidden_nodes_layer2 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 30)                540       
_________________________________________________________________
dense_31 (Dense)             (None, 10)                310       
_________________________________________________________________
dense_32 (Dense)             (None, 1)                 11        
Total params: 861
Trainable params: 861
Non-trainable params: 0
_________________________________________________________________


In [138]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=1000)

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=50,callbacks=[cp_callback])

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
3/3 - 0s - loss: 0.2652 - accuracy: 0.8864
Loss: 0.2652338743209839, Accuracy: 0.8863636255264282


In [139]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3/3 - 0s - loss: 0.2652 - accuracy: 0.8864
Loss: 0.2652338743209839, Accuracy: 0.8863636255264282


In [140]:
# Export our model to HDF5 file
nn_new.save("food_desertUSnew.h5")

In [141]:
# Trying the saved h5 file to recreate and test for performance. 
# Import the model to a new object
nn_imported = tf.keras.models.load_model('food_desertUSnew.h5')

In [142]:
# Testing the use of the Keras Sequential model's load_weights method to restore the model weights.

# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  30
hidden_nodes_layer2 = 10

nn_new = tf.keras.models.Sequential()

# First hidden layer
nn_new.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn_new.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn_new.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the model
nn_new.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Restore the model weights
nn_new.load_weights("checkpoints/weights.86.hdf5")

# Evaluate the model using the test data
model_loss, model_accuracy = nn_new.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

OSError: Unable to open file (unable to open file: name = 'checkpoints/weights.86.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [143]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_new.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3/3 - 0s - loss: 0.7199 - accuracy: 0.4318
Loss: 0.7198730707168579, Accuracy: 0.4318181872367859
