## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
#  YOUR CODE GOES HERE
columns_to_drop = ["EIN", "NAME"]

# Drop the specified columns from the DataFrame
application_df = application_df.drop(columns=columns_to_drop, axis=1)
application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [3]:
# Finding attribute columns
application_categories = application_df.dtypes[application_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column of object type
columns = application_df[application_categories].nunique()

# Iterate through the columns and print the unique value counts for each column
# we iterate through every column in the dataframe, some of them of object type where we found the unique count
for column in application_df.columns:
    if column in columns.index:
        data_type = application_df[column].dtype
        #print(f"{column}           (Data Type: {data_type}) - {columns[column]} unique value(s)")
        print(f"{column.ljust(40)} (Data Type: {data_type}) - {columns[column]} unique value(s)")
    else:
        print(f"{column.ljust(20)} numeric- {application_df[column].nunique()} unique value(s)")



APPLICATION_TYPE                         (Data Type: object) - 17 unique value(s)
AFFILIATION                              (Data Type: object) - 6 unique value(s)
CLASSIFICATION                           (Data Type: object) - 71 unique value(s)
USE_CASE                                 (Data Type: object) - 5 unique value(s)
ORGANIZATION                             (Data Type: object) - 4 unique value(s)
STATUS               numeric- 2 unique value(s)
INCOME_AMT                               (Data Type: object) - 9 unique value(s)
SPECIAL_CONSIDERATIONS                   (Data Type: object) - 2 unique value(s)
ASK_AMT              numeric- 8747 unique value(s)
IS_SUCCESSFUL        numeric- 2 unique value(s)


In [4]:
# Look at APPLICATION_TYPE value counts for binning

#  YOUR CODE GOES HERE
grouped_df = application_df.groupby("APPLICATION_TYPE").size().reset_index(name="COUNT")
sorted_df = grouped_df.sort_values(by="COUNT", ascending=False)
sorted_df = sorted_df.reset_index(drop=True)
print(sorted_df)


   APPLICATION_TYPE  COUNT
0                T3  27037
1                T4   1542
2                T6   1216
3                T5   1173
4               T19   1065
5                T8    737
6                T7    725
7               T10    528
8                T9    156
9               T13     66
10              T12     27
11               T2     16
12              T14      3
13              T25      3
14              T29      2
15              T15      2
16              T17      1


In [5]:
# Choose a cutoff value and create a list of application types to be replaced
# (Types we will bin together are (T9, T13, T12, T2, T14, T25, T29, T15, T17)
# use the variable name `application_types_to_replace`
#  YOUR CODE GOES HERE
application_types_to_replace = ["T9", "T13", "T12", "T2", "T14", "T25", "T29", "T15", "T17"]

# Replace the specified values in the "APPLICATION_TYPE" column with "Other"
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(application_types_to_replace, "Other")

# Check the value counts after replacing
print(application_df['APPLICATION_TYPE'].value_counts())

T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64


In [6]:
# Look at CLASSIFICATION value counts for binning
#  YOUR CODE GOES HERE
grouped_df2 = application_df.groupby("CLASSIFICATION").size().reset_index(name="COUNT")
sorted_df2 = grouped_df2.sort_values(by="COUNT", ascending=False)
sorted_df2 = sorted_df2.reset_index(drop=True)
print(sorted_df2)

   CLASSIFICATION  COUNT
0           C1000  17326
1           C2000   6074
2           C1200   4837
3           C3000   1918
4           C2100   1883
..            ...    ...
66          C2190      1
67          C2380      1
68          C2500      1
69          C2561      1
70          C8210      1

[71 rows x 2 columns]


In [7]:
# You may find it helpful to look at CLASSIFICATION value counts >1
#  YOUR CODE GOES HERE
filtered_df = sorted_df2[sorted_df2["COUNT"] > 1]
filtered_df.reset_index(drop=True, inplace=True)

print(filtered_df)


   CLASSIFICATION  COUNT
0           C1000  17326
1           C2000   6074
2           C1200   4837
3           C3000   1918
4           C2100   1883
5           C7000    777
6           C1700    287
7           C4000    194
8           C5000    116
9           C1270    114
10          C2700    104
11          C2800     95
12          C7100     75
13          C1300     58
14          C1280     50
15          C1230     36
16          C1400     34
17          C2300     32
18          C7200     32
19          C1240     30
20          C8000     20
21          C7120     18
22          C1500     16
23          C1800     15
24          C6000     15
25          C1250     14
26          C8200     11
27          C1278     10
28          C1238     10
29          C1235      9
30          C1237      9
31          C7210      7
32          C1720      6
33          C4100      6
34          C2400      6
35          C1600      5
36          C1257      5
37          C2710      3
38          C1260      3


In [8]:
# Choose a cutoff value and create a list of classifications to be replaced
# 1883 will be my cutoff
# use the variable name `classifications_to_replace`
#  YOUR CODE GOES HERE
classifications_to_replace = ["C7000","C1700","C4000","C5000","C1270","C2700","C2800",
                              "C7100","C1300","C1280","C1230","C1400","C2300","C7200",
                              "C1240","C8000","C7120","C1500","C1800","C6000","C1250",
                              "C8200","C1278","C1238","C1235","C1237","C7210","C1720",
                              "C4100","C2400","C1600","C1257","C2710","C1260","C0",
                              "C1267","C1246","C1256","C3200","C1234",
                              "C1370","C1236","C2190","C4200","C5200","C1248","C2600",
                              "C1590","C6100","C1820","C1900","C2380","C2570","C1580",
                              "C3700","C2500","C1570","C1283","C2170","C1732","C1728",
                              "C4120","C8210","C4500","C1245","C2561","C2150"]
# Replace in dataframe
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(classifications_to_replace,"Other")
    
# Check to make sure binning was successful
application_df['CLASSIFICATION'].value_counts()

#print(application_df['CLASSIFICATION'].value_counts())


C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [9]:
# Convert categorical data to numeric with `pd.get_dummies`
#  YOUR CODE GOES HERE
numerical_cols = application_df.select_dtypes(include=['int', 'float']).columns
categorical_cols = application_df.select_dtypes(include=['object']).columns
# Use get_dummies() to one-hot encode only the categorical columns
one_hot_encoded = pd.get_dummies(application_df[categorical_cols])
application_df = pd.concat([application_df[numerical_cols], one_hot_encoded], axis=1)
column_types = application_df.dtypes

print(column_types)

STATUS                          int64
ASK_AMT                         int64
IS_SUCCESSFUL                   int64
APPLICATION_TYPE_Other          uint8
APPLICATION_TYPE_T10            uint8
APPLICATION_TYPE_T19            uint8
APPLICATION_TYPE_T3             uint8
APPLICATION_TYPE_T4             uint8
APPLICATION_TYPE_T5             uint8
APPLICATION_TYPE_T6             uint8
APPLICATION_TYPE_T7             uint8
APPLICATION_TYPE_T8             uint8
AFFILIATION_CompanySponsored    uint8
AFFILIATION_Family/Parent       uint8
AFFILIATION_Independent         uint8
AFFILIATION_National            uint8
AFFILIATION_Other               uint8
AFFILIATION_Regional            uint8
CLASSIFICATION_C1000            uint8
CLASSIFICATION_C1200            uint8
CLASSIFICATION_C2000            uint8
CLASSIFICATION_C2100            uint8
CLASSIFICATION_C3000            uint8
CLASSIFICATION_Other            uint8
USE_CASE_CommunityServ          uint8
USE_CASE_Heathcare              uint8
USE_CASE_Oth

## Compile, Train and Evaluate the Model

In [10]:
def final_callback(message):
    print(f"Final callback: {message}")

In [11]:
def loss_accuracy_callback(epoch, loss, accuracy):
    print(f"Epoch {epoch}: Loss={loss:.4f}, Accuracy={accuracy:.4f}")

In [12]:
def main_process(callback, X_train_scaled, y_train, nn, report_interval=5):
    # Train the model and store the training history
    fit_model = nn.fit(X_train_scaled, y_train, epochs=100, verbose=0)
    training_history = fit_model.history
    
    print("Training has started.")
    epochs = 100
    
    # Report loss and accuracy at the specified intervals
    for epoch in range(1, epochs + 1):
        if epoch % report_interval == 0 or epoch == epochs:
            loss = training_history['loss'][epoch - 1]
            accuracy = training_history['accuracy'][epoch - 1]

            callback(epoch, loss, accuracy)

    result = "Task completed."
    final_callback(result)  
    print("Main process finished.")



## Optimisation
### In our original model, we created data frame application_df, which still exists.
- 1. I will create a pca method on this data frame which has had bucketing


In [13]:
# 1  pca method on application_df which has had bucketing performed for the original model.
# checking that data is still in data frame
application_df.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,108590,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,5000,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1,6692,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,1,142590,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [14]:
# Create a PCA model instance and set `n_components=3`.
from sklearn.decomposition import PCA
pca = PCA(n_components=3)

In [15]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
bucketted_pca = pca.fit_transform(application_df)
# View the first five rows of the DataFrame. 
bucketted_pca[:5]

array([[-2.76419868e+06,  5.90894936e-01,  1.68838184e+00],
       [-2.66060868e+06, -5.83620857e-01,  5.19588895e-02],
       [-2.76419868e+06,  1.94092231e+00,  7.40532738e-01],
       [-2.76250668e+06, -6.37766352e-02, -6.30339762e-01],
       [-2.62660868e+06, -9.31493176e-01,  8.09351731e-01]])

In [16]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
pca.explained_variance_ratio_

array([1.00000000e+00, 1.08525466e-16, 5.83194850e-17])

In [17]:
print(application_df.index)

RangeIndex(start=0, stop=34299, step=1)



**Question:** What is the total explained variance of the three principal components?

**Answer:** 1.0    = approx 100%

In [18]:
# Create a new DataFrame with the PCA data.
df_bucketted_pca = pd.DataFrame(
    bucketted_pca,
    columns=["PC1","PC2","PC3"]
)
# Creating a DataFrame with the PCA data
df_bucketted_pca.head()


Unnamed: 0,PC1,PC2,PC3
0,-2764199.0,0.590895,1.688382
1,-2660609.0,-0.583621,0.051959
2,-2764199.0,1.940922,0.740533
3,-2762507.0,-0.063777,-0.63034
4,-2626609.0,-0.931493,0.809352


In [19]:
# Split our preprocessed data into our features and target arrays
y2 = application_df["IS_SUCCESSFUL"].values
X2 = df_bucketted_pca.values 
# Split the preprocessed data into a training and testing dataset
#  YOUR CODE GOES HERE
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state = 78)

In [20]:
import numpy as np

# Assuming you have already defined y and X

# Number of records in y
num_records_y2 = y2.shape[0]

# Number of records in X
num_records_X2 = X2.shape[0]

print("Number of records in y2:", num_records_y2)
print("Number of records in X2:", num_records_X2)


Number of records in y2: 34299
Number of records in X2: 34299


In [21]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X2_scaler = scaler.fit(X2_train)

# Scale the data
X2_train_scaled = X2_scaler.transform(X2_train)
X2_test_scaled = X2_scaler.transform(X2_test)

## Compile, Train and Evaluate my PCA bucketted Model

In [22]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  YOUR CODE GOES HERE
import tensorflow as tf
number_input_features = len(X2_train[0])
hidden_nodes_layer1 = 6
hidden_nodes_layer2 = 8
nn = tf.keras.models.Sequential()

# First hidden layer
#  YOUR CODE GOES HERE
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1,
input_dim=number_input_features, activation = "relu")
)
# I am setting the regularization
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="tanh"))
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


#nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
# Check the structure of the model

nn.summary()







Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 6)                 24        
                                                                 
 dense_1 (Dense)             (None, 6)                 42        
                                                                 
 dense_2 (Dense)             (None, 8)                 56        
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 131
Trainable params: 131
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Compile the model
#  YOUR CODE GOES HERE
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [24]:
main_process(loss_accuracy_callback, X2_train_scaled, y2_train, nn)

Training has started.
Epoch 5: Loss=0.3813, Accuracy=0.8240
Epoch 10: Loss=0.3096, Accuracy=0.8562
Epoch 15: Loss=0.2993, Accuracy=0.8623
Epoch 20: Loss=0.2914, Accuracy=0.8657
Epoch 25: Loss=0.2873, Accuracy=0.8663
Epoch 30: Loss=0.2852, Accuracy=0.8664
Epoch 35: Loss=0.2074, Accuracy=0.9115
Epoch 40: Loss=0.2000, Accuracy=0.9116
Epoch 45: Loss=0.1830, Accuracy=0.9309
Epoch 50: Loss=0.1784, Accuracy=0.9312
Epoch 55: Loss=0.1749, Accuracy=0.9313
Epoch 60: Loss=0.1722, Accuracy=0.9309
Epoch 65: Loss=0.1690, Accuracy=0.9318
Epoch 70: Loss=0.1674, Accuracy=0.9322
Epoch 75: Loss=0.1658, Accuracy=0.9329
Epoch 80: Loss=0.1642, Accuracy=0.9328
Epoch 85: Loss=0.1631, Accuracy=0.9344
Epoch 90: Loss=0.1622, Accuracy=0.9352
Epoch 95: Loss=0.1614, Accuracy=0.9341
Epoch 100: Loss=0.1610, Accuracy=0.9351
Final callback: Task completed.
Main process finished.


In [25]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X2_test_scaled,y2_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.1648 - accuracy: 0.9286 - 426ms/epoch - 2ms/step
Loss: 0.16480527818202972, Accuracy: 0.9286297559738159


In [26]:
# Export our model to HDF5 file
nn.save("AlphabetSoupCharity_Optimisation.h5")