In [39]:
# Import libraries
import numpy as np
import pandas as pd
import sklearn as sk
import time
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras import initializers
from matplotlib import pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers
import keras_tuner as kt
from kerastuner.tuners import RandomSearch
from tensorflow import keras
from kerastuner.engine.hyperparameters import HyperParameters
import os
import imageio as iio
import shutil
import tensorflow as tf

In [17]:
df = pd.read_csv('Data.csv', encoding= 'unicode_escape')
# satellite data 
df_night_lights = pd.read_csv('nighttime_lights_data.csv', encoding= 'unicode_escape')

In [18]:
# Remove 3 raw data columns: Less than HS Diploma, HS Diploma, Associate's Degree
df = df.drop(['Less.than.a.high.school.diploma..2007.11','High.school.diploma.only..2007.11', 'Some.college.or.associate.s.degree..2007.11'], axis=1)
df.drop(columns = ['Unnamed: 0'], inplace=True)
# Remove 3 more columns: Civilian_labor_force_2013	Employed_2013	Unemployed_2013
df = df.drop(['Civilian_labor_force_2013',	"Employed_2013",	"Unemployed_2013"], axis=1)
# Binary variable: High Probability of Bachelors Degree (1) vs Low Probability of Bachelors Degree (0)
df.loc[df['Percent.of.adults.with.a.bachelor.s.degree.or.higher.2007.11'] <= 17.2, 'Binary Prob'] = 1
df.loc[df['Percent.of.adults.with.a.bachelor.s.degree.or.higher.2007.11'] > 17.2, 'Binary Prob'] = 0
# Remove high correlation columns from df: Percent.of.adults.with.less.than.a.high.school.diploma..2007.11	Percent.of.adults.with.a.high.school.diploma.only..2007.11	Percent.of.adults.completing.some.college.or.associate.s.degree..2007.1
df = df.drop(['Percent.of.adults.with.less.than.a.high.school.diploma..2007.11',
              'Percent.of.adults.with.a.high.school.diploma.only..2007.11',
              'Percent.of.adults.completing.some.college.or.associate.s.degree..2007.11', 
              'Bachelor.s.degree.or.higher..2007.11'], axis=1)

In [19]:
df['SNAP'] = df['SNAP'].str.replace(',', '').astype(float)
# to numeric exept for key
df[[x for x in df.columns if x !='State_County_Name']] = df[[x for x in df.columns if x !='State_County_Name']].apply(pd.to_numeric)
df = df.replace(np.nan, 0)
# Dropping County/State column
df2 = df.copy()

In [27]:
# merge with night time lights
data_model = pd.merge(df2, df_night_lights, on ='State_County_Name', how = 'left') 
data_model.dropna(inplace=True)
data_model.to_csv('data_model.csv')

# CNN

In [6]:
# parameters
img_height = 180
img_width = 180

In [122]:
# separate images into two folders, according to label
# excecute only once
ims_list = os.listdir("images\county_images")
N = len(ims_list)
n = 0
for im in ims_list:
    im_num = int(im[:-4])
    unique_data = data_model.countyns.unique()
    if im_num in unique_data:
        label = data_model[data_model.countyns == int(im[:-4])]['Binary Prob'].values[0]
        src = "images\county_images\{}".format(im)
        dst = "images\county_images_labeled\{}\{}".format(str(label)[0],im)
        shutil.copy2(src, dst)
    n+=1
    if n%50 ==0:
        print(n/N)

0.01546551190844417
0.03093102381688834
0.04639653572533251
0.06186204763377668
0.07732755954222084
0.09279307145066502
0.10825858335910919
0.12372409526755336
0.13918960717599752
0.15465511908444168
0.17012063099288585
0.18558614290133005
0.20105165480977422
0.21651716671821838
0.23198267862666255
0.24744819053510672
0.2629137024435509
0.27837921435199503
0.2938447262604392
0.30931023816888337
0.32477575007732756
0.3402412619857717
0.3557067738942159
0.3711722858026601
0.38663779771110424
0.40210330961954843
0.4175688215279926
0.43303433343643677
0.4484998453448809
0.4639653572533251
0.47943086916176925
0.49489638107021344
0.5103618929786576
0.5258274048871018
0.5412929167955459
0.5567584287039901
0.5722239406124343
0.5876894525208785
0.6031549644293226
0.6186204763377667
0.634085988246211
0.6495515001546551
0.6650170120630993
0.6804825239715434
0.6959480358799877
0.7114135477884318
0.7268790596968759
0.7423445716053202
0.7578100835137643
0.7732755954222085
0.7887411073306526
0.804206

In [37]:
#Load data using keras utils
def set_model_data(batch_size): 

    train_ds = tf.keras.utils.image_dataset_from_directory(
        "images\county_images_labeled",
        validation_split=0.2,
        subset="training",
        seed=1337,
        image_size=(img_height, img_width),
        batch_size=batch_size,
    )

    test_ds = tf.keras.utils.image_dataset_from_directory(
        "images\county_images_labeled",
        validation_split=0.2,
        subset="validation",
        seed=1337,
        image_size=(img_height, img_width),
        batch_size=batch_size,
    )
    return train_ds, test_ds

## MANUAL grid search for batch size, epochs, learning_rate 

In [8]:
num_classes = 2
#Build model 
def build_model(epochs, learning_rate,train_ds, test_ds):
    model = Sequential([
      layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)), #Standardize the data
      layers.Conv2D(32, 3, padding='same', activation='relu'),
      layers.MaxPooling2D(),
      layers.Conv2D(32, 3, padding='same', activation='relu'),
      layers.MaxPooling2D(),
      layers.Conv2D(64, 3, padding='same', activation='relu'),
      layers.MaxPooling2D(),
      layers.Flatten(),
      layers.Dense(128, activation='relu'),
      layers.Dense(num_classes)
    ])

    #Momentum Stochastic Gradient Descent 
    sgd = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9, nesterov=True)

    model.compile(optimizer=sgd,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])


    history = model.fit(
      train_ds,
      validation_data=test_ds,
      epochs=epochs)
    
    perf = model.evaluate(test_ds)
    print(perf)
    
    return perf
    

In [9]:
list_hyper = []
for batch_size in [50,100,200]:
    train_ds, test_ds = set_model_data(batch_size)
    for epochs in [5,10,15]:
        for learning_rate in [0.1, 0.001, 0.0001]: 
            perf = build_model(epochs, learning_rate,train_ds, test_ds)
            new_perf = [batch_size,epochs,learning_rate] +perf
            print(new_perf)
            list_hyper += [new_perf]

Found 3199 files belonging to 2 classes.
Using 2560 files for training.
Found 3199 files belonging to 2 classes.
Using 639 files for validation.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[0.6411923766136169, 0.627543032169342]
[200, 15, 0.001, 0.6411923766136169, 0.627543032169342]
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[0.678347110748291, 0.6118935942649841]
[200, 15, 0.0001, 0.678347110748291, 0.6118935942649841]


In [None]:
# results 
# [50, 5, 0.1, 0.6932075619697571, 0.4866979718208313],
# [50, 5, 0.001, 0.7048906087875366, 0.4976525902748108],
# [50, 5, 0.0001, 0.6763412952423096, 0.6071987748146057],
# [50, 10, 0.1, np.nan, 0.4866979718208313],
# [50, 10, 0.001, 0.6395555138587952, 0.6306729316711426],
# [50, 10, 0.0001, 0.6591957211494446, 0.6431924700737],
# [50, 15, 0.1, 0.6929979920387268, 0.5133020281791687],
# [50, 15, 0.001, 0.644537627696991, 0.6150234937667847],
# [50, 15, 0.0001, 0.6503545641899109, 0.6400625705718994],
# [100, 5, 0.1, 0.6928322911262512, 0.5133020281791687],
# [100, 5, 0.001, 0.6462869644165039, 0.6259781122207642],
# [100, 5, 0.0001, 0.6756465435028076, 0.6165884137153625],
# [100, 10, 0.1, 1.0789508819580078, 0.4866979718208313],
# [100, 10, 0.001, 0.6362870931625366, 0.6338028311729431],
# [100, 10, 0.0001, 0.6537830233573914, 0.6384976506233215],
# [100, 15, 0.1, 0.7040420770645142, 0.5133020281791687],
# [100, 15, 0.001, 0.6354638338088989, 0.6259781122207642],
# [100, 15, 0.0001, 0.6609286069869995, 0.6306729316711426],
# [200, 5, 0.1, np.nan, 0.4866979718208313],
# [200, 5, 0.001, 0.6624321937561035, 0.6338028311729431],
# [200, 5, 0.0001, 0.6879937648773193, 0.5492957830429077],
# [200, 10, 0.1, 0.6939690113067627, 0.4866979718208313],
# [200, 10, 0.001, 0.6502730250358582, 0.6181533932685852],
# [200, 10, 0.0001, 0.6720231175422668, 0.6150234937667847],
# [200, 15, 0.1, 0.6928034424781799, 0.5133020281791687],
# [200, 15, 0.0001, 0.678347110748291, 0.6118935942649841],
# [200, 15, 0.0001, 0.678347110748291, 0.6118935942649841]

## MANUAL grid search for activation function and initializer

In [None]:
initializer1 = initializers.RandomNormal(mean=0,stddev=1/np.sqrt(213),seed=0)
initializer2 = tf.keras.initializers.Constant()
initializer3 = tf.keras.initializers.Orthogonal()

activation = ['sigmoid','tanh','relu']

num_classes = 2
learning_rate = 0.01
epochs = 5

def train_model(activation,initializer):

    model = Sequential([
      layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)), #Standardize the data
      layers.Conv2D(32, 3, padding='same', activation=activation,kernel_initializer=initializer),
      layers.MaxPooling2D(),
      layers.Conv2D(32, 3, padding='same', activation=activation,kernel_initializer=initializer),
      layers.MaxPooling2D(),
      layers.Conv2D(64, 3, padding='same', activation=activation,kernel_initializer=initializer),
      layers.MaxPooling2D(),
      layers.Flatten(),
      layers.Dense(128, activation='relu',kernel_initializer=initializer),
      layers.Dense(num_classes)
    ])

    #Momentum Stochastic Gradient Descent 
    sgd = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9, nesterov=True)

    model.compile(optimizer=sgd,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])


    history = model.fit(
      train_ds,
      validation_data=test_ds,
      epochs=epochs)

    perf = model.evaluate(test_ds)
    print(perf)
    
    
    return perf

In [None]:
list_model_tun = []
for activation in ['sigmoid','tanh','relu']:
    perf = train_model(activation,initializer2)
    new_perf = [activation] +perf
    print(new_perf)
    list_model_tun += [new_perf]
    

In [None]:
list_model_tun = []
for init in [initializer1, initializer2,initializer3]:
    perf = train_model('sigmoid',init)
    new_perf = [activation] +perf
    print(new_perf)
    list_model_tun += [new_perf]
    
# Results   
#['RandomNormal', 0.6928526163101196, 0.5133020281791687]
#['Constant', 0.6931124925613403, 0.5133020281791687]
#['Orthogonal', 0.6930662989616394, 0.5133020281791687]

## MANUAL tuning for regularizer and dropout

In [None]:
from keras import regularizers
def train_model(activation,kernel_regularizer, dropout_rate):

    model = Sequential([
      layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)), #Standardize the data
      layers.Conv2D(32, 3, padding='same', activation=activation,kernel_regularizer= reg),
      layers.MaxPooling2D(),
      layers.Conv2D(32, 3, padding='same', activation=activation,kernel_regularizer= reg),
      layers.MaxPooling2D(),
      layers.Conv2D(64, 3, padding='same', activation=activation,kernel_regularizer= reg),
      layers.MaxPooling2D(),
      layers.Flatten(),
      layers.Dense(128, activation='relu',kernel_regularizer= reg),
      layers.Dense(num_classes)
    ])
    
    model.add(Dropout(dropout_rate, input_shape=(img_height, img_width, 3)))


    #Momentum Stochastic Gradient Descent 
    sgd = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9, nesterov=True)

    model.compile(optimizer=sgd,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])


    history = model.fit(
      train_ds,
      validation_data=test_ds,
      epochs=10)

    perf = model.evaluate(test_ds)
    print(perf)
    return perf

In [None]:
kernel_regularizer1=regularizers.l1(0.01)
kernel_regularizer2=regularizers.l2(0.01)

k = 1
for reg in [kernel_regularizer1,kernel_regularizer2]:
    perf = train_model('sigmoid',reg)
    new_perf = [k] +perf
    print(new_perf)
    list_model_tun += [new_perf]
    k+=1

In [None]:
for drop in [0.2,0.4,0.6]:
    perf = train_model('sigmoid',kernel_regularizer1,drop)
    new_perf = [k] +perf
    print(new_perf)
    list_model_tun += [new_perf]
    
# Results
#[0.2, 21.56456756591797, 0.4866979718208313],
# [0.4, 12.804211616516113, 0.4866979718208313],
# [0.6, 16.8350772857666, 0.5133020281791687]