In [1]:
!pip install keras-tuner



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from keras.callbacks import ModelCheckpoint

# Step 1: Preprocess the data

In [3]:
df = pd.read_csv('Resources/charity_data.csv')
df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [4]:
df2=df.drop(['EIN', 'NAME'], axis = 1)
df2

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...
34294,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [5]:
df2.nunique()

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [6]:
df2.groupby(df2['APPLICATION_TYPE']).count().sort_values('AFFILIATION', ascending=False)

Unnamed: 0_level_0,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
APPLICATION_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
T3,27037,27037,27037,27037,27037,27037,27037,27037,27037
T4,1542,1542,1542,1542,1542,1542,1542,1542,1542
T6,1216,1216,1216,1216,1216,1216,1216,1216,1216
T5,1173,1173,1173,1173,1173,1173,1173,1173,1173
T19,1065,1065,1065,1065,1065,1065,1065,1065,1065
T8,737,737,737,737,737,737,737,737,737
T7,725,725,725,725,725,725,725,725,725
T10,528,528,528,528,528,528,528,528,528
T9,156,156,156,156,156,156,156,156,156
T13,66,66,66,66,66,66,66,66,66


In [7]:
# Changed to take top 10 categories of application type
top10=df['APPLICATION_TYPE'].value_counts()[:10].index
df2.loc[~df2['APPLICATION_TYPE'].isin(top10), 'APPLICATION_TYPE'] = 'Other'
df2.nunique()

APPLICATION_TYPE            11
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [8]:
df2.groupby(df2['APPLICATION_TYPE']).count().sort_values('AFFILIATION', ascending=False)

Unnamed: 0_level_0,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
APPLICATION_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
T3,27037,27037,27037,27037,27037,27037,27037,27037,27037
T4,1542,1542,1542,1542,1542,1542,1542,1542,1542
T6,1216,1216,1216,1216,1216,1216,1216,1216,1216
T5,1173,1173,1173,1173,1173,1173,1173,1173,1173
T19,1065,1065,1065,1065,1065,1065,1065,1065,1065
T8,737,737,737,737,737,737,737,737,737
T7,725,725,725,725,725,725,725,725,725
T10,528,528,528,528,528,528,528,528,528
T9,156,156,156,156,156,156,156,156,156
T13,66,66,66,66,66,66,66,66,66


In [9]:
df2.groupby(df2['CLASSIFICATION']).count().sort_values('AFFILIATION', ascending=False).head(20)

Unnamed: 0_level_0,APPLICATION_TYPE,AFFILIATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
CLASSIFICATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C1000,17326,17326,17326,17326,17326,17326,17326,17326,17326
C2000,6074,6074,6074,6074,6074,6074,6074,6074,6074
C1200,4837,4837,4837,4837,4837,4837,4837,4837,4837
C3000,1918,1918,1918,1918,1918,1918,1918,1918,1918
C2100,1883,1883,1883,1883,1883,1883,1883,1883,1883
C7000,777,777,777,777,777,777,777,777,777
C1700,287,287,287,287,287,287,287,287,287
C4000,194,194,194,194,194,194,194,194,194
C5000,116,116,116,116,116,116,116,116,116
C1270,114,114,114,114,114,114,114,114,114


In [10]:
# Changed to take top 10 and then 20 classification types
top10=df['CLASSIFICATION'].value_counts()[:10].index
df2.loc[~df2['CLASSIFICATION'].isin(top10), 'CLASSIFICATION'] = 'Other'
df2.nunique()

APPLICATION_TYPE            11
AFFILIATION                  6
CLASSIFICATION              11
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [11]:
df2.groupby(df2['CLASSIFICATION']).count().sort_values('AFFILIATION', ascending=False).head(20)

Unnamed: 0_level_0,APPLICATION_TYPE,AFFILIATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
CLASSIFICATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C1000,17326,17326,17326,17326,17326,17326,17326,17326,17326
C2000,6074,6074,6074,6074,6074,6074,6074,6074,6074
C1200,4837,4837,4837,4837,4837,4837,4837,4837,4837
C3000,1918,1918,1918,1918,1918,1918,1918,1918,1918
C2100,1883,1883,1883,1883,1883,1883,1883,1883,1883
C7000,777,777,777,777,777,777,777,777,777
Other,773,773,773,773,773,773,773,773,773
C1700,287,287,287,287,287,287,287,287,287
C4000,194,194,194,194,194,194,194,194,194
C5000,116,116,116,116,116,116,116,116,116


In [12]:
df3 = pd.get_dummies(df2)

In [13]:
df3.head(100)

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T13,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,108590,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,5000,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,1,6692,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
4,1,142590,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,5000,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
96,1,5000,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
97,1,5000,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
98,1,5000,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
# Remove successful target from features data
y = df3.IS_SUCCESSFUL.values
X = df3.drop(columns = "IS_SUCCESSFUL").values

In [15]:
X[0:1]

array([[   1, 5000,    0,    1,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    1,    0,    0,    0,    1,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    1,    1,    0,    0,    0,    1,    0,    0,    0,    0,
           0,    0,    0,    0,    1,    0]], dtype=int64)

In [16]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21)

# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Optimizing the model

In [17]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    

    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=50))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 7)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [18]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective='val_accuracy',
    max_epochs=20,
    hyperband_iterations=2)

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json


In [19]:
# Create the callback that saves the model's weights every 5 epochs
checkpoint = ModelCheckpoint("AlphabetSoupCharity_Optimization.hdf5", encoding='utf-8', monitor='loss', verbose=1,
save_best_only=True, mode='auto', save_freq=5)
    
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test), callbacks = [checkpoint])

INFO:tensorflow:Oracle triggered exit


In [20]:
# Evaluate the top 3 models against the test dataset
top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.0000e+00 - accuracy: 0.7356
Loss: 0.0, Accuracy: 0.7356268167495728
268/268 - 0s - loss: 0.0000e+00 - accuracy: 0.7357
Loss: 0.0, Accuracy: 0.7357434630393982
268/268 - 0s - loss: 0.0000e+00 - accuracy: 0.7339
Loss: 0.0, Accuracy: 0.7338775396347046


In [21]:
# Get second best model hyperparameters
second_hyper = tuner.get_best_hyperparameters(2)[1]
second_hyper.values

{'activation': 'sigmoid',
 'first_units': 9,
 'num_layers': 5,
 'units_0': 9,
 'units_1': 5,
 'units_2': 3,
 'units_3': 5,
 'units_4': 9,
 'units_5': 5,
 'tuner/epochs': 7,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 1,
 'tuner/round': 0}

In [22]:
# Compare the performance to the second-best model
second_model = tuner.get_best_models(2)[1]
model_loss, model_accuracy = second_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.0000e+00 - accuracy: 0.7357
Loss: 0.0, Accuracy: 0.7357434630393982
