## UTMCC DataViz Module 20 Team Project --  
## Neural Network Machine Deep Learning Model  
### Food Deserts in the Austin, Texas Metro Area

In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from scipy import stats
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations
# Import checkpoint dependencies
import os
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame

from pathlib import Path
from collections import Counter
import datetime as dt
import calendar
import random
from path import Path

import io
import sys
import psycopg2
import csv
import codecs
import boto3
import itertools

%matplotlib inline
from matplotlib import style
style.use('fivethirtyeight')
import matplotlib.pyplot as plt
import statistics
# from flask import Flask, jsonify
# from mpl_toolkits.mplot3d import Axes3D
# from hvplot import hvPlot
# import hvplot.pandas
# import plotly.express as px

# Python SQL toolkit and Object Relational Mapper
import sqlite3
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, select, delete, Table
from sqlalchemy import extract

import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'psycopg2'

### Import and read CSV files from an AWS S3 bucket using Boto3 and pandas

In [None]:
# Download csv files from AWS S3 and create a pandas dataframe.
# Credentials are resolved by boto3's default credential chain (for example the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables or ~/.aws/credentials)
# so that no access keys are stored in the notebook.

client = boto3.client('s3', region_name='us-east-2')

obj = client.get_object(Bucket="dataviz20-bucket", Key="food_access_research_atlas.csv")

# The object body is a byte stream; wrap it in BytesIO so pandas can parse it as a CSV file.
food_atlas_df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8')
food_atlas_df.head()

In [None]:
# Credentials come from boto3's default credential chain (environment variables,
# shared credentials file, ...) rather than literals in the notebook.
client = boto3.client('s3', region_name='us-east-2')

obj = client.get_object(Bucket="dataviz20-bucket", Key="food_desert_austin_censustract.csv")

# The object body is a byte stream; wrap it in BytesIO so pandas can parse it as a CSV file.
fooddesert_austin_censustract_df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8')
# fooddesert_austin_censustract_df

In [None]:
# Credentials come from boto3's default credential chain (environment variables,
# shared credentials file, ...) rather than literals in the notebook.
client = boto3.client('s3', region_name='us-east-2')

obj = client.get_object(Bucket="dataviz20-bucket", Key="census_tract_shapefiles_all.csv")

# The object body is a byte stream; wrap it in BytesIO so pandas can parse it as a CSV file.
census_tract_shapefiles_all_df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8')
# census_tract_shapefiles_all_df

In [None]:
# Rename column GEOID to CensusTract
census_tract_shapefiles_all_df = census_tract_shapefiles_all_df.rename(columns={"GEOID":"CensusTract"})
# census_tract_shapefiles_all_df

In [None]:
# Import and read csv files
# food_atlas_df = pd.read_csv("resources/food_access_research_atlas.csv")

In [None]:
# food_atlas_df.head()

## Preprocessing the data

In [None]:
food_atlas_df.dtypes

In [None]:
# Checking for missing values using isnull()
food_atlas_df.isnull()

In [None]:
# Drop every row that contains at least one null/missing value.
# how='any' is required here: how='all' would only drop rows in which *every* column is
# null, so len(food_atlas_df) - len(rows_len_nan_check) would not count the rows that
# have at least one NA value.
rows_len_nan_check = food_atlas_df.dropna(how='any')
rows_len_nan_check

In [None]:
# Compare sizes of the dataframes to indicate how many rows had a minimum of one null value. 
print("Old data frame length:", len(food_atlas_df)) 
print("New data frame length:", len(rows_len_nan_check))  
print("Number of rows with at least 1 NA value: ", (len(food_atlas_df)-len(rows_len_nan_check))) 

In [None]:
# Create new df keeping only Texas
food_texas_df = food_atlas_df[(food_atlas_df["State"]=="Texas")]
food_texas_df

In [None]:
# Create new df keeping only select Counties in the Austin Metro Area
counties = ["Bastrop", "Caldwell", "Hays", "Travis", "Williamson"]
food_austin_df = food_texas_df.loc[food_texas_df["County"].isin(counties)]
food_austin_df

In [None]:
# Create a file for visualization, food desert tracts for Austin Metro Area  food_austin_df.loc
# aus_desert_tracts = ["CensusTract", "LILATracts_1And10"]
LILATracts_1And10_aus_df = food_austin_df[["CensusTract", "LILATracts_1And10"]]
LILATracts_1And10_aus_df

In [None]:
# export to csv 
# LILATracts_1And10_aus_df.to_csv("LILATracts_1And10_aus.csv")

In [None]:
# Begin Income column creation (target variable), MFI = MedianFamilyIncome.
# food_austin_df is a filtered slice of food_texas_df; take an independent copy so that
# adding the Income column later neither writes into that slice nor triggers pandas'
# chained-assignment (SettingWithCopy) behavior.
food_austinMFI_df = food_austin_df.copy()
food_austinMFI_df['MedianFamilyIncome']

In [None]:
# Poverty Guidelines U.S. 2015 family of 4 is $24250
# Income label: 0 when the tract's median family income is at or below the guideline,
# 1 when it is above. Rows matching neither condition (e.g. a missing income) receive
# np.select's default value, which is 0.
austin_median_income = food_austinMFI_df['MedianFamilyIncome']
conditions = [austin_median_income <= 24250, austin_median_income > 24250]
values = [0, 1]
food_austinMFI_df["Income"] = np.select(conditions, values)
food_austinMFI_df.head()

In [None]:
food_austinMFI_df.dtypes

In [None]:
# export to csv 
# food_austinMFI_df.to_csv("food_desert_austinMFI.csv")

In [None]:
# Create new df with select feature columns representing "share" values
food_desert_Austin_df = food_austinMFI_df[["Income", "lasnaphalfshare", "lahunvhalfshare", "lasnap1share", "lahunv1share", "lasnap10share", "lahunv10share", "lasnap20share", "lahunv20share"]]
food_desert_Austin_df

In [None]:
# export to csv 
# food_desert_Austin_df.to_csv("food_desert_Austin.csv")

### Data preprocessing for: ML Training on full U.S. Census dataset

In [None]:
# Data Preparation for full U.S. dataset
# Begin Income column creation (target variable), MFI = MedianFamilyIncome.
# Copy the raw frame so that adding the Income column does not mutate food_atlas_df,
# which earlier cells have already displayed and filtered.
food_atlasMFI_df = food_atlas_df.copy()
food_atlasMFI_df['MedianFamilyIncome']

In [None]:
# Poverty Guidelines U.S. 2015 family of 4 is $24,250
# Income label: 0 when the tract's median family income is at or below the guideline,
# 1 when it is above. Rows matching neither condition (e.g. a missing income) receive
# np.select's default value, which is 0.
us_median_income = food_atlasMFI_df['MedianFamilyIncome']
conditions = [us_median_income <= 24250, us_median_income > 24250]
values = [0, 1]
food_atlasMFI_df["Income"] = np.select(conditions, values)
food_atlasMFI_df

In [None]:
food_atlasMFI_df.dtypes

In [None]:
# export to csv 
# food_atlasMFI_df.to_csv("food_atlasMFI.csv")

In [None]:
# Create new df with select feature columns for all of U.S. (not only for Texas), with 72,864 rows
# Use this df as ML Training set, this input data has no categorical data types, it can be provided to the neural network model in its raw form 
food_desertUS_df = food_atlasMFI_df[["Income", "lasnaphalfshare", "lahunvhalfshare", "lasnap1share", "lahunv1share", "lasnap10share", "lahunv10share", "lasnap20share", "lahunv20share"]]
food_desertUS_df

In [None]:
# export to csv
# food_desertUS_df.to_csv("food_desertUS.csv")

In [None]:
# Create a OneHotEncoder instance for the Income column. Income is already a numeric
# 0/1 label rather than a categorical string, but it is one-hot encoded anyway so the
# target is explicitly encoded for the ML model.
# NOTE(review): `sparse=False` asks for a dense array; newer scikit-learn releases name
# this parameter `sparse_output` -- confirm against the installed version.
enc = OneHotEncoder(sparse=False)

# Fit the encoder on the Income values (reshaped to a single-column 2-D array, as the
# encoder expects) and wrap the encoded result in a DataFrame with integer column labels.
encode_df = pd.DataFrame(enc.fit_transform(food_desertUS_df.Income.values.reshape(-1,1)))
encode_df.head(5)

In [None]:
encode_df.dtypes

In [None]:
# export to csv
# encode_df.to_csv("encode.csv")

## Machine Learning Models
### Training: on full U.S. Census dataset

In [None]:
# Split the preprocessed dataframe into our features and target arrays.
#  Remove the Income target from the features data. Establish the target output, y, as the encoded Income column for "1".
#   The two columns of encode_df are redundant with each other because they are dichotomous, so only one of the columns is needed.
y = encode_df[1]
# y = food_desertUS_df["Income"]
X = food_desertUS_df.drop(columns="Income").values
# Split the preprocessed data into a training and testing dataset.
# stratify=y keeps the class proportions of y the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [None]:
# Create StandardScaler instance. Using the StandardScaler module to standardize our numerical variables, we reduce the overall
#  likelihood that outliers, variables of different units, or skewed distributions will have a negative impact on the model's performance.
scaler = StandardScaler()
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)
# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Define the model - deep neural net. the number of input features and the hidden nodes for each layer.
# A rule of thumb for a basic neural network is to have two to three times the amount of neurons in 
# the hidden layer as the number of inputs.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 30
hidden_nodes_layer2 = 9

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [None]:
# Compile and train the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Create a callback that saves the model's weights during training, using the Keras ModelCheckpoint callback.
# NOTE: an integer save_freq counts training batches, so a checkpoint is written every 1000 batches --
# not every 5 epochs and not every 1000 samples.
# NOTE(review): checkpoint_path is only formatted with the epoch number, so checkpoints written
# within the same epoch presumably overwrite each other -- confirm if every checkpoint is needed.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=1000)

# Train the model on the standardized training features
fit_model = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[cp_callback])

In [None]:
# Evaluate the model using the test data, on the full U.S. Training set.
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy*100} %")

In [None]:
# Hierarchical Data Format file, HDF5. Using the Keras Sequential model's save method to export the model.
# Export our model to HDF5 file
nn.save("trained_food_desertUS.h5")

### Creating Confusion Matrix for Training Data - full U.S. Census Tracts

In [None]:
# Create predictions to compare in the CM.
# The network was fitted and evaluated on the standardized arrays, so predictions must be
# made on those same scaled features; feeding the raw X_train / X_test would give the
# model inputs on a different scale from the one it learned.
predictions_train = nn.predict(X_train_scaled)
predictions_test = nn.predict(X_test_scaled)
# Check shape of test array
predictions_test.shape

In [None]:
predictions_test

In [None]:
# Round predicions into binaries for comparisons to y data
pred_test_round = predictions_test.round()
pred_test_round

In [None]:
# Check shape of array
y_test.shape

In [None]:
# Reshape the target values into an (n_samples, 1) column vector so they line up with the
# prediction array. -1 lets NumPy infer the row count instead of hard-coding the size of
# the current test split.
y_test_reshape = y_test.values.reshape(-1, 1)
y_test_reshape.shape

In [None]:
# Recheck shape of array
y_test_reshape

In [None]:
# Count of correctly classified test samples, used to verify the CM findings.
# With normalize=False, accuracy_score returns the number of correct predictions
# rather than a fraction.
acc_score= accuracy_score(y_test_reshape, pred_test_round, normalize = False)
acc_score

In [None]:
print(confusion_matrix (y_test_reshape, pred_test_round))

In [None]:
cm = confusion_matrix (y_test_reshape, pred_test_round)

In [None]:
def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix; rows are plotted as true labels and columns as
        predicted labels.
    classes : sequence of str
        Tick labels for both axes, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        When True, every row is divided by its row sum before printing and plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    # Normalize BEFORE drawing so the heat-map colors, the light/dark text threshold and
    # the printed annotations all describe the same matrix.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so the annotation stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Normalized ratios are shown with two decimals; raw counts are shown unchanged.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
cm_plot_labels = ['Below Poverty Level','Above Poverty Level']
plot_confusion_matrix(cm=cm, classes=cm_plot_labels, title='Confusion Matrix')

## Testing: Applying the weights trained on the full U.S. dataset to the Austin-Metro-only data


#### NN Machine Learning Model

In [None]:
# Encode the Austin Income column with the OneHotEncoder that was already fitted on the
# U.S. data. Using transform (not fit_transform) keeps the column layout of that fit, so
# column 1 always means Income == 1 even if one class is missing from the Austin subset;
# re-fitting on the subset could produce a single column and make column 1 disappear.
encodeAus_df = pd.DataFrame(enc.transform(food_desert_Austin_df.Income.values.reshape(-1,1)))
encodeAus_df.head(5)

In [None]:
# Split the preprocessed dataframe into our features and target arrays
#  Remove Income target from features data
y = encodeAus_df[1]
X = food_desert_Austin_df.drop(columns="Income").values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [None]:
# Scale the Austin features with X_scaler, the StandardScaler fitted on the U.S. training
# data. The model evaluated on this data was trained on features standardized with those
# statistics, so fitting a fresh scaler on the Austin subset would present the model with
# inputs on a different scale from the one it learned.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Define the model - a deep neural net - from the number of input features and the hidden nodes for each layer.
# NOTE(review): the next cell reassigns nn_new from load_model('trained_food_desertUS.h5'),
# so the 20/10-node network built here is discarded before it is ever evaluated; the loaded
# model was saved from the network defined with 30 and 9 hidden nodes.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 20
hidden_nodes_layer2 = 10

nn_new = tf.keras.models.Sequential()

# First hidden layer (also declares the input dimension)
nn_new.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn_new.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single sigmoid unit for the binary Income target
nn_new.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_new.summary()

In [None]:
# Load the complete model (architecture and weights) that was saved after training on the
# full U.S. dataset. This replaces whatever nn_new referred to before.
nn_new = tf.keras.models.load_model('trained_food_desertUS.h5')

# Compile the loaded model with the same loss, optimizer and metric used for training
nn_new.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# No additional training is done on the Austin data; the fit call is left disabled.
# fit_model = nn_new.fit(X_train_scaled,y_train,epochs=50)

# Evaluate the model using the test data, on the Austin-Metro only Test set.
model_loss, model_accuracy = nn_new.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy*100} %")

#### Hierarchical Data Format file, HDF5. Using the Keras Sequential model's save method to export the model.

In [None]:
# Export the new model to HDF5 file
nn_new.save("trained_food_desert_Austin.h5")

In [None]:
# Trying the saved h5 file to recreate, check, and test for performance. 
# Import the model to a new object
nn_new_imported = tf.keras.models.load_model('trained_food_desert_Austin.h5')

In [None]:
# Evaluate the re-imported model on the test data. Using nn_new_imported (not nn_new)
# is what actually checks that the saved HDF5 file reproduces the model's performance.
model_loss, model_accuracy = nn_new_imported.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy*100} %")

### Configure the connection and engine for AWS RDS with SQLAlchemy, and
###   write to the PostgreSQL database: Module20_food_deserts

In [None]:
# Open a direct psycopg2 connection (and cursor) to the AWS RDS PostgreSQL instance.
# The password is read from the PGPASSWORD environment variable so that it is never
# stored in the notebook; a missing variable fails immediately with a KeyError.
connection = psycopg2.connect(
    host = 'dataviz.c5qcqhh5xq62.us-east-2.rds.amazonaws.com',
    port = 5432,
    user = 'postgres',
    password = os.environ['PGPASSWORD'],
    database= 'Module20_food_deserts'
    )
cursor=connection.cursor()

In [None]:
# Create the SQLAlchemy engine for PostgreSQL.
# The password is read from the PGPASSWORD environment variable instead of being embedded
# in the connection string, and is URL-encoded so that special characters cannot break
# the database URL.
# NOTE(review): any password that was previously committed in this cell should be rotated.
from urllib.parse import quote_plus

db_password = quote_plus(os.environ['PGPASSWORD'])
engine = create_engine(f'postgresql://postgres:{db_password}@dataviz.c5qcqhh5xq62.us-east-2.rds.amazonaws.com/Module20_food_deserts')
con = engine.connect()

In [None]:
LILATracts_1And10_aus_df.to_sql('LILATracts_1And10_aus', engine, if_exists='replace')

In [None]:
food_austinMFI_df.to_sql('food_austinMFI', engine, if_exists='replace')

In [None]:
food_desert_Austin_df.to_sql('food_desert_Austin', engine, if_exists='replace')

In [None]:
food_atlasMFI_df.to_sql('food_atlasMFI', engine, if_exists='replace')

In [None]:
food_desertUS_df.to_sql('food_desertUS', engine, if_exists='replace')

In [None]:
# LILATracts_1And10_aus_df
# food_austinMFI_df
# food_desert_Austin_df
# food_atlasMFI_df
# food_desertUS_df

In [None]:
# Join two of the DataFrames
fooddesert_austin_censusshapes_df = fooddesert_austin_censustract_df.merge(census_tract_shapefiles_all_df, on="CensusTract", how="inner")
#fooddesert_austin_censusshapes_df.head(5)

In [None]:
# SQL LEFT JOIN of the Austin income table with the Austin food-desert tract flags.
# The tables and columns written by DataFrame.to_sql keep their mixed-case names, and
# PostgreSQL folds unquoted identifiers to lower case, so every mixed-case identifier
# must be double-quoted or the query cannot find the relation/column.
sql = """
SELECT
    fa."CensusTract",
    fa."LowIncomeTracts",
    fa."PovertyRate",
    fa."MedianFamilyIncome",
    fa."Income",
    li."LILATracts_1And10"
FROM "food_austinMFI" AS fa
LEFT JOIN "LILATracts_1And10_aus" AS li
    ON fa."CensusTract" = li."CensusTract"
"""

In [None]:
# cursor.execute(sql)
# myresult = cursor.fetchall()
# for x in myresult:
#    print(x)

### Creating Confusion Matrix for Test Data - Austin Metro Area Food Deserts

In [None]:
# Create predictions to compare in the CM.
# Predict on the same standardized arrays that are passed to nn_new.evaluate; the raw
# X_train / X_test are on a different scale from the features the model was trained on.
predictions_train = nn_new.predict(X_train_scaled)
predictions_test = nn_new.predict(X_test_scaled)
# Check shape of test array
predictions_test.shape

In [None]:
#View state of prediction data
predictions_test

In [None]:
#Round predictions into binaries for comparisons to y data
pred_test_round = predictions_test.round()

In [None]:
#Check shape of array
y_test.shape

In [None]:
# Reshape the target values into an (n_samples, 1) column vector so they line up with the
# prediction array. -1 lets NumPy infer the row count instead of hard-coding the size of
# the current test split.
y_test_reshape = y_test.values.reshape(-1, 1)
y_test_reshape.shape

In [None]:
#Recheck shape of array
y_test_reshape

In [None]:
# Count of correctly classified test samples, used to verify the CM findings.
# With normalize=False, accuracy_score returns the number of correct predictions
# rather than a fraction.
acc_score= accuracy_score(y_test_reshape, pred_test_round, normalize = False)
acc_score

In [None]:
print(confusion_matrix (y_test_reshape, pred_test_round))

In [None]:
cm = confusion_matrix (y_test_reshape, pred_test_round)

In [None]:
# NOTE(review): a function with this name is also defined earlier in the notebook; this
# later definition shadows it from this cell onward.
def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix; rows are plotted as true labels and columns as
        predicted labels.
    classes : sequence of str
        Tick labels for both axes, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        When True, every row is divided by its row sum before printing and plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    # Normalize BEFORE drawing so the heat-map colors, the light/dark text threshold and
    # the printed annotations all describe the same matrix.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so the annotation stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Normalized ratios are shown with two decimals; raw counts are shown unchanged.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
cm_plot_labels = ['Below Poverty Level','Above Poverty Level']

In [None]:
plot_confusion_matrix(cm=cm, classes=cm_plot_labels, title='Confusion Matrix')