## Preprocessing

In [286]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()


Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [287]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(columns=["EIN", "NAME","STATUS","SPECIAL_CONSIDERATIONS","USE_CASE",])

## Optimization added, removed outliers for ask_amt

In [288]:
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = application_df['ASK_AMT'].quantile(0.25)
Q3 = application_df['ASK_AMT'].quantile(0.75)
IQR = Q3 - Q1

# Determine the bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = application_df[(application_df['ASK_AMT'] < lower_bound) | (application_df['ASK_AMT'] > upper_bound)]

# Print the outliers
print("Outliers in ASK_AMT:")
print(outliers[['ASK_AMT']])

# Remove outliers from the DataFrame
application_df_cleaned = application_df[(application_df['ASK_AMT'] >= lower_bound) & (application_df['ASK_AMT'] <= upper_bound)]

application_df = application_df_cleaned

# Check the shape of the new DataFrame to see how many rows were removed
print(f"Original DataFrame shape: {application_df.shape}")
print(f"Cleaned DataFrame shape: {application_df_cleaned.shape}")


Outliers in ASK_AMT:
        ASK_AMT
1        108590
4        142590
6         31452
7       7508025
8         94389
...         ...
34280    401661
34281     56261
34284   3384856
34289     20197
34298  36500179

[8206 rows x 1 columns]
Original DataFrame shape: (26093, 7)
Cleaned DataFrame shape: (26093, 7)


In [289]:
print(application_df.describe())

# Display a sample of the DataFrame to visualize the data
print(application_df.sample(10))

            ASK_AMT  IS_SUCCESSFUL
count  26093.000000   26093.000000
mean    5082.225348       0.526425
std      593.637940       0.499311
min     5000.000000       0.000000
25%     5000.000000       0.000000
50%     5000.000000       1.000000
75%     5000.000000       1.000000
max    11854.000000       1.000000
      APPLICATION_TYPE       AFFILIATION CLASSIFICATION ORGANIZATION  \
1832                T3  CompanySponsored          C1000        Trust   
23006               T5  CompanySponsored          C3000  Association   
16788               T3  CompanySponsored          C1200        Trust   
3700                T3  CompanySponsored          C1000  Association   
13052               T3       Independent          C1000        Trust   
22388               T3  CompanySponsored          C2000        Trust   
20808               T3       Independent          C4000        Trust   
19533               T3  CompanySponsored          C1200  Association   
9199                T3       Independ

In [290]:
# Determine the number of unique values in each column.
application_df.nunique()

APPLICATION_TYPE     12
AFFILIATION           6
CLASSIFICATION       65
ORGANIZATION          4
INCOME_AMT            9
ASK_AMT             656
IS_SUCCESSFUL         2
dtype: int64

In [291]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
application_type_counts = application_df.APPLICATION_TYPE.value_counts()
application_type_counts

APPLICATION_TYPE
T3     20086
T4      1331
T19      999
T5       894
T6       882
T8       669
T7       633
T10      508
T13       54
T9        18
T12       13
T2         6
Name: count, dtype: int64

In [292]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
application_types_to_replace = list(application_type_counts[application_type_counts < 100].index)

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

APPLICATION_TYPE
T3     20086
T4      1331
T19      999
T5       894
T6       882
T8       669
T7       633
T10      508
T13       54
T9        18
T12       13
T2         6
Name: count, dtype: int64

In [293]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
classification_counts = application_df.CLASSIFICATION.value_counts()
classification_counts

CLASSIFICATION
C1000    12557
C2000     4693
C1200     3997
C2100     1623
C3000     1537
         ...  
C1370        1
C1248        1
C6100        1
C3200        1
C2150        1
Name: count, Length: 65, dtype: int64

In [294]:
# You may find it helpful to look at CLASSIFICATION value counts >1
classification_counts[classification_counts > 1]

CLASSIFICATION
C1000    12557
C2000     4693
C1200     3997
C2100     1623
C3000     1537
C7000      546
C1700      223
C4000      129
C5000      106
C1270       89
C2700       75
C7100       62
C2800       61
C1280       46
C1300       42
C1230       35
C2300       28
C1240       28
C1400       27
C7200       20
C6000       14
C8000       13
C1250       13
C7120       11
C1278       10
C1235        9
C1237        9
C8200        9
C1238        9
C1500        7
C1720        6
C1257        5
C7210        5
C1600        4
C2400        4
C4100        4
C1260        3
C1800        3
C1246        2
C1267        2
C1256        2
C0           2
Name: count, dtype: int64

In [295]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
classifications_to_replace = list(classification_counts[classification_counts < 1000].index)

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

CLASSIFICATION
C1000    12557
C2000     4693
C1200     3997
Other     1686
C2100     1623
C3000     1537
Name: count, dtype: int64

In [296]:
# Convert categorical data to numeric with `pd.get_dummies`
application_df = pd.get_dummies(application_df)

In [297]:
# Split our preprocessed data into our features and target arrays
y = application_df["IS_SUCCESSFUL"].values
X = application_df.drop(columns=["IS_SUCCESSFUL"]).values


# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [298]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model OTPIMZIED

In [299]:
# Optimize the Model
number_input_features = len(X_train[0])

# Import additional necessary libraries
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout, BatchNormalization
import tensorflow as tf

# Define the model - deep neural net
nn_optimized = tf.keras.models.Sequential()

# First hidden layer with ReLU activation
hidden_nodes_layer1 = 512  # Increased number of neurons
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Second hidden layer
hidden_nodes_layer2 = 256  # Increased number of neurons
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Third hidden layer
hidden_nodes_layer3 = 128  # New layer added
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Fourth hidden layer
hidden_nodes_layer4 = 64  # New layer added
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Fifth hidden layer (optional)
hidden_nodes_layer5 = 32  # New layer added
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Output layer
nn_optimized.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))  # For binary classification

# Compile the model with a lower learning rate
nn_optimized.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=["accuracy"])

# Implement early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with validation split
fit_model_optimized = nn_optimized.fit(X_train_scaled, y_train, epochs=200, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the optimized model using the test data
model_loss_optimized, model_accuracy_optimized = nn_optimized.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Optimized Loss: {model_loss_optimized}, Optimized Accuracy: {model_accuracy_optimized}")

# Export our model to HDF5 file
nn.save("AlphabetSoupCharityOptimized.h5")


Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.5132 - loss: 0.8400 - val_accuracy: 0.6888 - val_loss: 0.6467
Epoch 2/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 26ms/step - accuracy: 0.5723 - loss: 0.7484 - val_accuracy: 0.6967 - val_loss: 0.6126
Epoch 3/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m-10s[0m -20603us/step - accuracy: 0.6074 - loss: 0.7077 - val_accuracy: 0.7126 - val_loss: 0.5935
Epoch 4/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6396 - loss: 0.6727 - val_accuracy: 0.7197 - val_loss: 0.5789
Epoch 5/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6561 - loss: 0.6555 - val_accuracy: 0.7200 - val_loss: 0.5727
Epoch 6/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6712 - loss: 0.6441 - val_accuracy: 0.7269 - val_loss: 0.5683
Epoch 7/200
[1m490/490



## Optimization With more Layers and swish

In [None]:
# Import necessary libraries
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout, BatchNormalization
import tensorflow as tf

# Define the model - deep neural net with 8 hidden layers
nn_optimized = tf.keras.models.Sequential()

# First hidden layer
hidden_nodes_layer1 = 512
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='swish'))
nn_optimized.add(Dropout(0.4))
nn_optimized.add(BatchNormalization())

# Second hidden layer
hidden_nodes_layer2 = 256
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='swish'))
nn_optimized.add(Dropout(0.4))
nn_optimized.add(BatchNormalization())

# Third hidden layer
hidden_nodes_layer3 = 128
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation='swish'))
nn_optimized.add(Dropout(0.4))
nn_optimized.add(BatchNormalization())

# Fourth hidden layer
hidden_nodes_layer4 = 128
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation='swish'))
nn_optimized.add(Dropout(0.4))
nn_optimized.add(BatchNormalization())

# Fifth hidden layer
hidden_nodes_layer5 = 64
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation='swish'))
nn_optimized.add(Dropout(0.4))
nn_optimized.add(BatchNormalization())

# Sixth hidden layer
hidden_nodes_layer6 = 64
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer6, activation='swish'))
nn_optimized.add(Dropout(0.4))
nn_optimized.add(BatchNormalization())

# Seventh hidden layer
hidden_nodes_layer7 = 32
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer7, activation='swish'))
nn_optimized.add(Dropout(0.4))
nn_optimized.add(BatchNormalization())

# Eighth hidden layer
hidden_nodes_layer8 = 16
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer8, activation='swish'))
nn_optimized.add(Dropout(0.4))
nn_optimized.add(BatchNormalization())

# Output layer
nn_optimized.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the model with a lower learning rate
nn_optimized.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=["accuracy"])

# Implement early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with validation split
fit_model_optimized = nn_optimized.fit(X_train_scaled, y_train, epochs=200, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the optimized model using the test data
model_loss_optimized, model_accuracy_optimized = nn_optimized.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Optimized Loss: {model_loss_optimized}, Optimized Accuracy: {model_accuracy_optimized}")

nn.save("AlphabetSoupCharityOptimized2.h5")
