In [1]:
# Dependencies and data.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

df = pd.read_csv("Resources/charity_data.csv")
df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


### Step 1 - Do Not Drop Name Column

We noticed that there are multiple applications from the same companies, so we will try to bin these instead of dropping them.

In [2]:
# Drop EIN column.
df = df.drop(['EIN'], 1)
# Multiple same-name values, we should investigate bucketing them.

# Bucket APPLICATION_TYPE variable.
app_counts = df.APPLICATION_TYPE.value_counts()
replace_application = list(app_counts[app_counts < 500].index)
for app in replace_application:
    df.APPLICATION_TYPE = df.APPLICATION_TYPE.replace(app,"Other")

# Bucket CLASSIFICATION variable.
class_counts = df.CLASSIFICATION.value_counts()
replace_class = list(class_counts[class_counts < 1000].index)
for cls in replace_class:
    df.CLASSIFICATION = df.CLASSIFICATION.replace(cls,"Other")

We choose to bin any names that appear less than 2 times (so any non-repeating name), and put them in an "OTHER" category.

In [3]:
# Bucket NAME variable.
name_counts = df.NAME.value_counts()
replace_name = list(name_counts[name_counts < 2].index)
for name in replace_name:
    df.NAME = df.NAME.replace(name, "OTHER")
# Check success at binning.
df.NAME.value_counts()

OTHER                                               18776
PARENT BOOSTER USA INC                               1260
TOPS CLUB INC                                         765
UNITED STATES BOWLING CONGRESS INC                    700
WASHINGTON STATE UNIVERSITY                           492
                                                    ...  
PARENT BOOSTERS USA INC                                 2
AMERICAN ART THERAPY ASSOCIATION                        2
HEARING LOSS ASSOCIATION OF AMERICA                     2
AMERICAN COLLEGE OF PHYSICIANS                          2
NATIONAL SOCIETY UNITED STATES DAUGHTERS OF 1812        2
Name: NAME, Length: 793, dtype: int64

### Step 2 - Bin Income Amount Variable
We turns this variable into a yes/no binary classifier, either they have income (>0) or they don't.

In [4]:
# Bucket INCOME_AMT variable.
inc_counts = df.INCOME_AMT.value_counts()
replace_income = list(inc_counts[inc_counts > 0].index)
for income in replace_income:
    df.INCOME_AMT = df.INCOME_AMT.replace(income, 1)
# Check success at binning.
df.INCOME_AMT.value_counts()

1    34299
Name: INCOME_AMT, dtype: int64

In [5]:
# Encode the categorical variables, and merge with main DF.
cats = df.dtypes[df.dtypes == 'object'].index.tolist()

enc = OneHotEncoder(sparse=False)
encode_df = pd.DataFrame(enc.fit_transform(df[cats]))
encode_df.columns = enc.get_feature_names(cats)

df = df.merge(encode_df, left_index=True, right_index=True).drop(cats, 1)

In [6]:
# Split target variables and dataset into training-testing.
y = df.IS_SUCCESSFUL
X = df.drop('IS_SUCCESSFUL', 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [7]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Import checkpoint dependencies.
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames.
os.makedirs("checkpoints_optimized/",exist_ok=True)
checkpoint_path = "checkpoints_optimized/weights.{epoch:02d}.hdf5"

# Create the callback Checkpoint function.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=4020
)
# Each epoch is 804 batches => 5*804=4020 batches to checkpoint for 5 epochs.

### Step 3 - Different Layers, Neurons, and Activations
We add a third hidden layer, and increase the number of nodes (neurons) each layer has. We make use of the tanh activation function, and increase the number of epochs from 50 to 100.

In [9]:
# Define model inputs.
number_input_features = len(X_train_scaled[0])
nodes_l1 = 100
nodes_l2 = 50
nodes_l3 = 20

nn = tf.keras.models.Sequential()

# Add hidden layers (3) and output layer.
nn.add(tf.keras.layers.Dense(units=nodes_l1, input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=nodes_l2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=nodes_l3, activation='tanh'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[cp_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: saving model to checkpoints_optimized\weights.05.hdf5
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: saving model to checkpoints_optimized\weights.10.hdf5
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 00015: saving model to checkpoints_optimized\weights.15.hdf5
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 00020: saving model to checkpoints_optimized\weights.20.hdf5
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 00025: saving model to checkpoints_optimized\weights.25.hdf5
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 00030: saving model to checkpoints_optimized\weights.30.hdf5
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 00035: saving model to checkpoints_optimized\weights.35.hdf5
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 00040: saving 

Epoch 64/100
Epoch 65/100
Epoch 00065: saving model to checkpoints_optimized\weights.65.hdf5
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 00070: saving model to checkpoints_optimized\weights.70.hdf5
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 00075: saving model to checkpoints_optimized\weights.75.hdf5
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 00080: saving model to checkpoints_optimized\weights.80.hdf5
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 00085: saving model to checkpoints_optimized\weights.85.hdf5
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 00090: saving model to checkpoints_optimized\weights.90.hdf5
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 00095: saving model to checkpoints_optimized\weights.95.hdf5
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Epoch 00100: saving model to checkpoints_optimize

<tensorflow.python.keras.callbacks.History at 0x28e93349f48>

### Model Outcome
The model ends with an accuracy of 81%, higher than the required 75%.

In [10]:
# Save the whole model. - Export the model to an HDF5 model.
nn.save('AlphabetSoupCharity_Optimzation.h5')

In [11]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.4478 - accuracy: 0.7945
Loss: 0.44784802198410034, Accuracy: 0.7945189476013184
