## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tabulate import tabulate

# Import pandas and read the charity_data.csv from the provided cloud URL.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(columns=['EIN', 'NAME'])

In [18]:
# Check the number of unique values in each column
unique_counts = application_df.nunique()

# Convert unique counts to a DataFrame for tabular display
unique_counts_df = pd.DataFrame({"Column": unique_counts.index, "Unique Values": unique_counts.values})

# Display the table for unique value counts
print("\n🔹 Unique Value Counts per Column:")
print(tabulate(unique_counts_df, headers="keys", tablefmt="fancy_grid"))


🔹 Unique Value Counts per Column:
╒════╤══════════════════════════════╤═════════════════╕
│    │ Column                       │   Unique Values │
╞════╪══════════════════════════════╪═════════════════╡
│  0 │ STATUS                       │               2 │
├────┼──────────────────────────────┼─────────────────┤
│  1 │ ASK_AMT                      │            8747 │
├────┼──────────────────────────────┼─────────────────┤
│  2 │ IS_SUCCESSFUL                │               2 │
├────┼──────────────────────────────┼─────────────────┤
│  3 │ APPLICATION_TYPE_Other       │               2 │
├────┼──────────────────────────────┼─────────────────┤
│  4 │ APPLICATION_TYPE_T10         │               2 │
├────┼──────────────────────────────┼─────────────────┤
│  5 │ APPLICATION_TYPE_T19         │               2 │
├────┼──────────────────────────────┼─────────────────┤
│  6 │ APPLICATION_TYPE_T3          │               2 │
├────┼──────────────────────────────┼─────────────────┤
│  7 │ APPLIC

In [20]:
# Identify columns with more than 10 unique values
high_cardinality_cols = unique_counts[unique_counts > 10].index

In [21]:
# Store and display value counts for high-cardinality columns
for col in high_cardinality_cols:
    col_counts = application_df[col].value_counts().reset_index()
    col_counts.columns = ["Category", "Count"]

    print(f"\n🔹 Value Counts for Column: {col}")
    print(tabulate(col_counts, headers="keys", tablefmt="fancy_grid"))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
│ 6247 │  21008           │       1 │
├──────┼──────────────────┼─────────┤
│ 6248 │  18484           │       1 │
├──────┼──────────────────┼─────────┤
│ 6249 │ 138640           │       1 │
├──────┼──────────────────┼─────────┤
│ 6250 │  37580           │       1 │
├──────┼──────────────────┼─────────┤
│ 6251 │ 119378           │       1 │
├──────┼──────────────────┼─────────┤
│ 6252 │ 177570           │       1 │
├──────┼──────────────────┼─────────┤
│ 6253 │  36995           │       1 │
├──────┼──────────────────┼─────────┤
│ 6254 │ 278158           │       1 │
├──────┼──────────────────┼─────────┤
│ 6255 │ 682012           │       1 │
├──────┼──────────────────┼─────────┤
│ 6256 │  20360           │       1 │
├──────┼──────────────────┼─────────┤
│ 6257 │      1.75737e+07 │       1 │
├──────┼──────────────────┼─────────┤
│ 6258 │  83302           │       1 │
├──────┼──────────────────┼─────────┤
│ 6259 │ 263230        

In [4]:
# Determine the number of unique values in each column.
print("\nNumber of unique values in each column:")
print(tabulate(application_df.nunique().reset_index(), headers=["Column", "Unique Values"], tablefmt="fancy_grid"))


Number of unique values in each column:
╒════╤════════════════════════╤═════════════════╕
│    │ Column                 │   Unique Values │
╞════╪════════════════════════╪═════════════════╡
│  0 │ APPLICATION_TYPE       │              17 │
├────┼────────────────────────┼─────────────────┤
│  1 │ AFFILIATION            │               6 │
├────┼────────────────────────┼─────────────────┤
│  2 │ CLASSIFICATION         │              71 │
├────┼────────────────────────┼─────────────────┤
│  3 │ USE_CASE               │               5 │
├────┼────────────────────────┼─────────────────┤
│  4 │ ORGANIZATION           │               4 │
├────┼────────────────────────┼─────────────────┤
│  5 │ STATUS                 │               2 │
├────┼────────────────────────┼─────────────────┤
│  6 │ INCOME_AMT             │               9 │
├────┼────────────────────────┼─────────────────┤
│  7 │ SPECIAL_CONSIDERATIONS │               2 │
├────┼────────────────────────┼─────────────────┤
│  8 │ AS

In [5]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()
print("\nApplication Type Counts:")
print(tabulate(application_type_counts.reset_index(), headers=["Application Type", "Count"], tablefmt="fancy_grid"))


Application Type Counts:
╒════╤════════════════════╤═════════╕
│    │ Application Type   │   Count │
╞════╪════════════════════╪═════════╡
│  0 │ T3                 │   27037 │
├────┼────────────────────┼─────────┤
│  1 │ T4                 │    1542 │
├────┼────────────────────┼─────────┤
│  2 │ T6                 │    1216 │
├────┼────────────────────┼─────────┤
│  3 │ T5                 │    1173 │
├────┼────────────────────┼─────────┤
│  4 │ T19                │    1065 │
├────┼────────────────────┼─────────┤
│  5 │ T8                 │     737 │
├────┼────────────────────┼─────────┤
│  6 │ T7                 │     725 │
├────┼────────────────────┼─────────┤
│  7 │ T10                │     528 │
├────┼────────────────────┼─────────┤
│  8 │ T9                 │     156 │
├────┼────────────────────┼─────────┤
│  9 │ T13                │      66 │
├────┼────────────────────┼─────────┤
│ 10 │ T12                │      27 │
├────┼────────────────────┼─────────┤
│ 11 │ T2               

In [6]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
application_types_to_replace = list(application_type_counts[application_type_counts < 500].index)
# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [7]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
classification_counts = application_df['CLASSIFICATION'].value_counts()
print("\nClassification Counts:")
print(tabulate(classification_counts.reset_index(), headers=["Classification", "Count"], tablefmt="fancy_grid"))


Classification Counts:
╒════╤══════════════════╤═════════╕
│    │ Classification   │   Count │
╞════╪══════════════════╪═════════╡
│  0 │ C1000            │   17326 │
├────┼──────────────────┼─────────┤
│  1 │ C2000            │    6074 │
├────┼──────────────────┼─────────┤
│  2 │ C1200            │    4837 │
├────┼──────────────────┼─────────┤
│  3 │ C3000            │    1918 │
├────┼──────────────────┼─────────┤
│  4 │ C2100            │    1883 │
├────┼──────────────────┼─────────┤
│  5 │ C7000            │     777 │
├────┼──────────────────┼─────────┤
│  6 │ C1700            │     287 │
├────┼──────────────────┼─────────┤
│  7 │ C4000            │     194 │
├────┼──────────────────┼─────────┤
│  8 │ C5000            │     116 │
├────┼──────────────────┼─────────┤
│  9 │ C1270            │     114 │
├────┼──────────────────┼─────────┤
│ 10 │ C2700            │     104 │
├────┼──────────────────┼─────────┤
│ 11 │ C2800            │      95 │
├────┼──────────────────┼─────────┤
│ 12

In [8]:
# You may find it helpful to look at CLASSIFICATION value counts >1
classification_counts_filtered = classification_counts[classification_counts > 1]
display(classification_counts_filtered)

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
C7000,777
C1700,287
C4000,194
C5000,116
C1270,114


In [9]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
classifications_to_replace = list(classification_counts[classification_counts < 1000].index)
# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2261
C3000,1918
C2100,1883


In [10]:
# Convert categorical data to numeric with `pd.get_dummies`
application_df = pd.get_dummies(application_df)

In [11]:
# Split our preprocessed data into our features and target arrays
X = application_df.drop(columns=['IS_SUCCESSFUL']).values
y = application_df['IS_SUCCESSFUL'].values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [13]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=80, activation='relu', input_dim=X_train_scaled.shape[1]))
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=30, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))


# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
# Train the model
history = nn.fit(X_train_scaled, y_train, epochs=100, batch_size=32, verbose=1)

Epoch 1/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.7055 - loss: 0.5935
Epoch 2/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7278 - loss: 0.5554
Epoch 3/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7263 - loss: 0.5536
Epoch 4/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7305 - loss: 0.5515
Epoch 5/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7337 - loss: 0.5475
Epoch 6/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7303 - loss: 0.5492
Epoch 7/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7280 - loss: 0.5520
Epoch 8/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7338 - loss: 0.5446
Epoch 9/100
[1m858/858[0m [3

In [16]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 1s - 2ms/step - accuracy: 0.7278 - loss: 0.5689
Loss: 0.5688794255256653, Accuracy: 0.7278425693511963


In [17]:
# Export our model to HDF5 file
nn.save('charity_optimization_model.keras')
print(nn.save)

<bound method Model.save of <Sequential name=sequential, built=True>>
