# Alphabet Soup Charity Analysis - Optimization¶
## Compare the performance of the NN model to the Logit, SVM and RF models
### Preprocess the data based on earlier performance

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
#import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("charity_data.csv")
application_df = application_df.drop(['EIN', 'NAME'], axis=1)


# Create bins for rare occurrences in columns
# Application Type
apptype_counts = application_df.APPLICATION_TYPE.value_counts()
replace_application = list(apptype_counts[apptype_counts < 500].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(app,"Other")

# Classification
classify_counts = application_df.CLASSIFICATION.value_counts()
replace_class = list(classify_counts[classify_counts < 1880].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(cls,"Other")


# Generate our categorical variable lists
application_cat = application_df.dtypes[application_df.dtypes == "object"].index.tolist()


# Use OHE to prepare data for analysis
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(application_cat)
encode_df.head()

# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(encode_df,left_index=True, right_index=True)
application_df = application_df.drop(application_cat,1)


# Split our preprocessed data into our features and target arrays
#y = application_df["IS_SUCCESSFUL"].values
#X = application_df.drop(["IS_SUCCESSFUL"],1).values
X = application_df.drop("IS_SUCCESSFUL", axis=1)
y = application_df["IS_SUCCESSFUL"]

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



### Logistic Regression Model

In [2]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train_scaled,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.466


### Support-Vector Machine Model

In [3]:
# Create the SVM model
svm = SVC(kernel='linear')

# Train the model
svm.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.722


### Random Forest Model

In [4]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.714


### Comparison of Model Accuracy

| Model    | Accuracy |
|----------|----------|
| NN       |  0.73    |
| Logit    |  0.46    |
| SVM      |  0.72    |
| RF       |  0.71    |

### Evidence of similar performance for all except Logit model. </br> Consider feature reduction and other changes to optimize NN model.
Due to difficulty in determining feature importance using neural networks, review is based on random forest.

### Feature Importance

In [5]:
# Calculate feature importance in the original neural network model
feature_importances = rf_model.feature_importances_

In [6]:
features = sorted(zip(X.columns, feature_importances), key = lambda x: x[1])
features

[('USE_CASE_Other', 5.998670713090662e-05),
 ('AFFILIATION_Other', 7.922401334982439e-05),
 ('STATUS', 0.00019187529907672935),
 ('AFFILIATION_Regional', 0.00027046776542260614),
 ('ORGANIZATION_Corporation', 0.000440347530726473),
 ('SPECIAL_CONSIDERATIONS_N', 0.00047905932685652556),
 ('SPECIAL_CONSIDERATIONS_Y', 0.000565831397444202),
 ('AFFILIATION_National', 0.0005950601451037992),
 ('AFFILIATION_Family/Parent', 0.0012450414050965413),
 ('USE_CASE_Heathcare', 0.00132488682209223),
 ('INCOME_AMT_50M+', 0.0018870300072749112),
 ('APPLICATION_TYPE_Other', 0.00200787060912953),
 ('INCOME_AMT_5M-10M', 0.0023972628671981373),
 ('INCOME_AMT_10M-50M', 0.0027663595567476358),
 ('ORGANIZATION_Co-operative', 0.003076972694199374),
 ('INCOME_AMT_10000-24999', 0.003530476061946203),
 ('USE_CASE_CommunityServ', 0.0038208617481205087),
 ('APPLICATION_TYPE_T8', 0.004373322691349571),
 ('APPLICATION_TYPE_T7', 0.004866307915710914),
 ('INCOME_AMT_1-9999', 0.006102090930913049),
 ('INCOME_AMT_100000

# Optimization Attempt #1: Adjust Features
### Create dataset for optimization
##### In addition to dropping EIN and NAME, the following modifications were made:
* Dropped features include STATUS and SPECIAL CONSIDERATION, based on having <0.02 contribution to feature importance as defined by RF model.
* Bins were created for AFFILIATION and INCOME_AMT based on rare occurrences in columns (analysis steps not shown).

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("charity_data.csv")
application_df = application_df.drop(['EIN', 'NAME', 'STATUS', 'SPECIAL_CONSIDERATIONS'], axis=1)


# Create bins for rare occurrences in columns
# Application Type
apptype_counts = application_df.APPLICATION_TYPE.value_counts()
replace_application = list(apptype_counts[apptype_counts < 500].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(app,"Other")

# Classification
classify_counts = application_df.CLASSIFICATION.value_counts()
replace_class = list(classify_counts[classify_counts < 1880].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(cls,"Other")

# Affiliation (review not shown)
afilly_counts = application_df.AFFILIATION.value_counts()
replace_affiliation = list(afilly_counts[afilly_counts <15000].index)
for filly in replace_affiliation:
    application_df.AFFILIATION = application_df.AFFILIATION.replace(filly,"Other")

# Income Amount
income_counts = application_df.INCOME_AMT.value_counts()
replace_income = list(income_counts[income_counts >0].index)
for income in income_counts:
    application_df.INCOME_AMT = application_df.INCOME_AMT.replace(income,"Other")


# Generate our categorical variable lists
application_cat = application_df.dtypes[application_df.dtypes == "object"].index.tolist()


# Use OHE to prepare data for analysis
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(application_cat)
encode_df.head()

# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(encode_df,left_index=True, right_index=True)
application_df = application_df.drop(application_cat,1)


# Split our preprocessed data into our features and target arrays
y = application_df["IS_SUCCESSFUL"].values
X = application_df.drop(["IS_SUCCESSFUL"],1).values
#X = application_df.drop("IS_SUCCESSFUL", axis=1)
#y = application_df["IS_SUCCESSFUL"]

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



### Run the same NN model now that features have been modified

In [2]:
# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

# Create a callback that saves the model's weights every epoch
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    period=5)



In [3]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn_features = tf.keras.models.Sequential()
# First hidden layer
nn_features.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
# Second hidden layer
nn_features.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
# Output layer
nn_features.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
# Check the structure of the model
nn_features.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 304       
                                                                 
 dense_1 (Dense)             (None, 5)                 45        
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 355
Trainable params: 355
Non-trainable params: 0
_________________________________________________________________


In [4]:
# Compile the model
nn_features.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_features.fit(X_train_scaled,y_train,epochs=100,callbacks=[cp_callback])

# Evaluate the model using the test data
model_loss, model_accuracy = nn_features.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 5: saving model to checkpoints\weights.05.hdf5
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 10: saving model to checkpoints\weights.10.hdf5
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 15: saving model to checkpoints\weights.15.hdf5
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 20: saving model to checkpoints\weights.20.hdf5
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 25: saving model to checkpoints\weights.25.hdf5
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 30: saving model to checkpoints\weights.30.hdf5
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 35: saving model to checkpoints\weights.35.hdf5
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 40: saving model to checkpoints\weights.40.hdf5
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 

### Accuracy of 73%; adjusting features did not optimize model performance.

In [5]:
# Export our model to HDF5 file
nn_features.save("AlphabetSoupCharity_featureAdj.h5")

## Optimization Attempt 2: Add More Hidden Layers

In [6]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5
hidden_nodes_layer3 = 10

nn_layers = tf.keras.models.Sequential()
nn_layers.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
nn_layers.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
nn_layers.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))
nn_layers.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
nn_layers.summary()

# Compile the Sequential model together and customize metrics
nn_layers.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_layers.fit(X_train_scaled,y_train,epochs=100,callbacks=[cp_callback])

# Evaluate the model using the test data
model_loss, model_accuracy = nn_layers.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 8)                 304       
                                                                 
 dense_4 (Dense)             (None, 5)                 45        
                                                                 
 dense_5 (Dense)             (None, 10)                60        
                                                                 
 dense_6 (Dense)             (None, 1)                 11        
                                                                 
Total params: 420
Trainable params: 420
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 5: saving model to checkpoints\weights.05.hdf5
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 10: savi

### Accuracy of 72%; adding a hidden layer did not optimize model performance.

In [7]:
# Export our model to HDF5 file
nn_layers.save("AlphabetSoupCharity_layerAdj.h5")

## Optimization Attempt 3: Increasing the Number of Epochs

In [8]:
## Adding a hidden layer did not improve model performance.

### Optimization Attempt 3: Increase the number of epochs

number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

# Define the basic neural network model
nn_epochs = tf.keras.models.Sequential()
nn_epochs.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
nn_epochs.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
nn_epochs.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
nn_epochs.summary()

# Compile the Sequential model together and customize metrics
nn_epochs.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_epochs.fit(X_train_scaled, y_train, epochs=150)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_epochs.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 8)                 304       
                                                                 
 dense_8 (Dense)             (None, 5)                 45        
                                                                 
 dense_9 (Dense)             (None, 1)                 6         
                                                                 
Total params: 355
Trainable params: 355
Non-trainable params: 0
_________________________________________________________________
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoc

### Accuracy of 73%; increasing the number of epochs did not optimize model peformance.

In [9]:
# Export the model to HDF5 file
nn_epochs.save("AlphabetSoupCharity_epochsAdj.h5")

### CONCLUSION: When comparing NN, RF and SVM performance on this dataset, RF is the preferred model as similar accuracy is achieved in the shortest amount of time. Additional steps should be taken to see if accuracy can be further improved using the RF model.