In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf

# Load charity_data.csv from the course's static host and preview the first rows.
CHARITY_DATA_URL = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(CHARITY_DATA_URL)
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [81]:
# Remove the columns that are not needed for modeling (EIN, NAME) together with
# SPECIAL_CONSIDERATIONS, which is treated as a potentially uninformative feature
columns_to_drop = ['EIN', 'NAME', 'SPECIAL_CONSIDERATIONS']
application_df = application_df.drop(columns=columns_to_drop)

application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,142590,1


In [82]:
# Simplify binning for 'APPLICATION_TYPE' and 'CLASSIFICATION' columns:
# every category that occurs fewer times than its cutoff is relabelled 'Other'.
APPLICATION_TYPE_MIN_COUNT = 500  # application types with fewer rows become 'Other'
CLASSIFICATION_MIN_COUNT = 100    # classifications with fewer rows become 'Other'

# Vectorized replacement (isin + mask) instead of a per-row lambda lookup; unlike
# the lookup, it also leaves missing values untouched rather than raising KeyError.
application_counts = application_df['APPLICATION_TYPE'].value_counts()
rare_application_types = application_counts[application_counts < APPLICATION_TYPE_MIN_COUNT].index
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].mask(
    application_df['APPLICATION_TYPE'].isin(rare_application_types), 'Other'
)

classification_counts = application_df['CLASSIFICATION'].value_counts()
rare_classifications = classification_counts[classification_counts < CLASSIFICATION_MIN_COUNT].index
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].mask(
    application_df['CLASSIFICATION'].isin(rare_classifications), 'Other'
)


In [83]:
# Expand the remaining categorical columns into indicator (dummy) columns
application_df = pd.get_dummies(data=application_df)

In [85]:
# Split and scale the data
# X and y are built here so this cell runs on a fresh kernel; no other visible
# cell defines them. Assumes IS_SUCCESSFUL is the prediction target and every
# other column is a feature.
y = application_df['IS_SUCCESSFUL'].to_numpy()
X = application_df.drop(columns=['IS_SUCCESSFUL']).to_numpy(dtype=float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then reuse those statistics for the
# test split so no information from the test rows influences the scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [87]:

# Balance the classes in the scaled training split via random oversampling
ros = RandomOverSampler(random_state=42)
resampled = ros.fit_resample(X_train_scaled, y_train)
X_train_resampled, y_train_resampled = resampled

# Define a simpler model with two hidden layers
# Seed Python, NumPy and TensorFlow so weight initialisation (and therefore the
# reported metrics) can be reproduced on a re-run
tf.keras.utils.set_random_seed(42)

# The input width is declared with an explicit Input object: Keras warns when
# `input_dim` is passed to a layer and recommends Input as the first entry of a
# Sequential model instead.
n_features = X_train_resampled.shape[1]
simpler_model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(units=80, activation='relu'),
    tf.keras.layers.Dense(units=40, activation='relu'),
    # Single sigmoid unit: outputs a probability for the binary target
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [88]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
simpler_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [90]:
# Fit the network on the oversampled training data, reserving 20% for validation
# NOTE(review): validation_split carves its slice out of the arrays exactly as
# passed in; because these arrays were oversampled first, the validation slice
# may contain duplicates of training rows, making val_accuracy optimistic --
# confirm, and consider holding out validation data before oversampling.
history = simpler_model.fit(
    X_train_resampled,
    y_train_resampled,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
)


Epoch 1/100
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7010 - loss: 0.5945 - val_accuracy: 0.7143 - val_loss: 0.5753
Epoch 2/100
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7301 - loss: 0.5548 - val_accuracy: 0.6986 - val_loss: 0.5783
Epoch 3/100
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7308 - loss: 0.5511 - val_accuracy: 0.7123 - val_loss: 0.5652
Epoch 4/100
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7299 - loss: 0.5491 - val_accuracy: 0.7148 - val_loss: 0.5642
Epoch 5/100
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7369 - loss: 0.5448 - val_accuracy: 0.7087 - val_loss: 0.5720
Epoch 6/100
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7317 - loss: 0.5439 - val_accuracy: 0.7183 - val_loss: 0.5518
Epoch 7/100
[1m730/73

In [93]:
# Score the trained network on the scaled test split and report both metrics
test_metrics = simpler_model.evaluate(X_test_scaled, y_test)
loss, accuracy = test_metrics
print(f"Refined Model Loss: {loss}")
print(f"Refined Model Accuracy: {accuracy}")

[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7248 - loss: 0.5666
Refined Model Loss: 0.5663918256759644
Refined Model Accuracy: 0.7259474992752075


In [94]:
# Save the refined model
# `simpler_model` is the network that was compiled, trained and evaluated in the
# cells above. The name `refined_model` used here before is not defined in any
# visible cell, so it either raised NameError or saved a stale model left over
# in the kernel.
simpler_model.save("AlphabetSoupCharity_Refined_Optimization.h5")



Despite various optimization attempts, the model's accuracy stayed consistently around 72–73%. The attempts included:

- Modifying the model architecture (adding/removing layers and neurons)
- Using dropout and early stopping
- Adjusting activation functions and learning rates
- Oversampling the data to address class imbalance
