## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()


Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(columns=["EIN", "NAME","STATUS","SPECIAL_CONSIDERATIONS","USE_CASE",])

## Optimization added, removed outliers for ask_amt

In [3]:
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = application_df['ASK_AMT'].quantile(0.25)
Q3 = application_df['ASK_AMT'].quantile(0.75)
IQR = Q3 - Q1

# Determine the bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = application_df[(application_df['ASK_AMT'] < lower_bound) | (application_df['ASK_AMT'] > upper_bound)]

# Print the outliers
print("Outliers in ASK_AMT:")
print(outliers[['ASK_AMT']])

# Remove outliers from the DataFrame
application_df_cleaned = application_df[(application_df['ASK_AMT'] >= lower_bound) & (application_df['ASK_AMT'] <= upper_bound)]

application_df = application_df_cleaned

# Check the shape of the new DataFrame to see how many rows were removed
print(f"Original DataFrame shape: {application_df.shape}")
print(f"Cleaned DataFrame shape: {application_df_cleaned.shape}")


Outliers in ASK_AMT:
        ASK_AMT
1        108590
4        142590
6         31452
7       7508025
8         94389
...         ...
34280    401661
34281     56261
34284   3384856
34289     20197
34298  36500179

[8206 rows x 1 columns]
Original DataFrame shape: (26093, 7)
Cleaned DataFrame shape: (26093, 7)


In [4]:
print(application_df.describe())

# Display a sample of the DataFrame to visualize the data
print(application_df.sample(10))

            ASK_AMT  IS_SUCCESSFUL
count  26093.000000   26093.000000
mean    5082.225348       0.526425
std      593.637940       0.499311
min     5000.000000       0.000000
25%     5000.000000       0.000000
50%     5000.000000       1.000000
75%     5000.000000       1.000000
max    11854.000000       1.000000
      APPLICATION_TYPE       AFFILIATION CLASSIFICATION ORGANIZATION  \
8798                T3  CompanySponsored          C1000  Association   
18519               T3       Independent          C2000        Trust   
23839               T3  CompanySponsored          C2700  Association   
11638               T3  CompanySponsored          C2000  Association   
30258               T3       Independent          C1000        Trust   
28667               T3  CompanySponsored          C1000        Trust   
18958               T3  CompanySponsored          C2100        Trust   
15017               T3  CompanySponsored          C2100        Trust   
19755               T3       Independ

In [5]:
# Determine the number of unique values in each column.
application_df.nunique()

Unnamed: 0,0
APPLICATION_TYPE,12
AFFILIATION,6
CLASSIFICATION,65
ORGANIZATION,4
INCOME_AMT,9
ASK_AMT,656
IS_SUCCESSFUL,2


In [6]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
application_type_counts = application_df.APPLICATION_TYPE.value_counts()
application_type_counts

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,20086
T4,1331
T19,999
T5,894
T6,882
T8,669
T7,633
T10,508
T13,54
T9,18


In [7]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
application_types_to_replace = list(application_type_counts[application_type_counts < 100].index)

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,20086
T4,1331
T19,999
T5,894
T6,882
T8,669
T7,633
T10,508
Other,91


In [8]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
classification_counts = application_df.CLASSIFICATION.value_counts()
classification_counts

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,12557
C2000,4693
C1200,3997
C2100,1623
C3000,1537
...,...
C1732,1
C1728,1
C4120,1
C1245,1


In [9]:
# You may find it helpful to look at CLASSIFICATION value counts >1
classification_counts[classification_counts > 1]

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,12557
C2000,4693
C1200,3997
C2100,1623
C3000,1537
C7000,546
C1700,223
C4000,129
C5000,106
C1270,89


In [10]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
classifications_to_replace = list(classification_counts[classification_counts < 1000].index)

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,12557
C2000,4693
C1200,3997
Other,1686
C2100,1623
C3000,1537


In [11]:
# Convert categorical data to numeric with `pd.get_dummies`
application_df = pd.get_dummies(application_df)

In [12]:
# Split our preprocessed data into our features and target arrays
y = application_df["IS_SUCCESSFUL"].values
X = application_df.drop(columns=["IS_SUCCESSFUL"]).values


# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [13]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model OTPIMZIED

In [14]:
# Optimize the Model
number_input_features = len(X_train[0])

# Import additional necessary libraries
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout, BatchNormalization
import tensorflow as tf

# Define the model - deep neural net
nn_optimized = tf.keras.models.Sequential()

# First hidden layer with ReLU activation
hidden_nodes_layer1 = 512  # Increased number of neurons
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Second hidden layer
hidden_nodes_layer2 = 256  # Increased number of neurons
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Third hidden layer
hidden_nodes_layer3 = 128  # New layer added
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Fourth hidden layer
hidden_nodes_layer4 = 64  # New layer added
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Fifth hidden layer (optional)
hidden_nodes_layer5 = 32  # New layer added
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation='relu'))
nn_optimized.add(Dropout(0.4))  # Increased dropout for regularization
nn_optimized.add(BatchNormalization())  # Adding batch normalization

# Output layer
nn_optimized.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))  # For binary classification

# Compile the model with a lower learning rate
nn_optimized.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=["accuracy"])

# Implement early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with validation split
fit_model_optimized = nn_optimized.fit(X_train_scaled, y_train, epochs=200, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the optimized model using the test data
model_loss_optimized, model_accuracy_optimized = nn_optimized.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Optimized Loss: {model_loss_optimized}, Optimized Accuracy: {model_accuracy_optimized}")



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.5160 - loss: 0.8122 - val_accuracy: 0.6837 - val_loss: 0.6310
Epoch 2/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.5802 - loss: 0.7361 - val_accuracy: 0.7008 - val_loss: 0.5980
Epoch 3/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.6346 - loss: 0.6725 - val_accuracy: 0.7228 - val_loss: 0.5799
Epoch 4/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.6511 - loss: 0.6644 - val_accuracy: 0.7261 - val_loss: 0.5769
Epoch 5/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.6751 - loss: 0.6409 - val_accuracy: 0.7251 - val_loss: 0.5696
Epoch 6/200
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.6871 - loss: 0.6267 - val_accuracy: 0.7261 - val_loss: 0.5648
Epoch 7/200
[1

In [15]:
# Export our model to HDF5 file
nn.save("AlphabetSoupCharity.h5")

NameError: name 'nn' is not defined



## Optimization