In [1]:
# !pip install keras-tuner

In [2]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import kerastuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import  Adam
import matplotlib.pyplot as plt


  import kerastuner as kt


In [3]:
df = pd.read_csv("Resources/charity_data.csv")
df.tail()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1
34298,996086871,WATERHOUSE CHARITABLE TR,T3,Independent,C1000,Preservation,Co-operative,1,1M-5M,N,36500179,0


In [4]:
#drop non-essential columns
funding_df = df.drop(columns=["EIN", "NAME"], index=1)

In [5]:
# look for NA values
print(f"{funding_df.isna().any()} \n")
# look for duplicates
print(f"Duplicated rows: {df.duplicated().any()}")

APPLICATION_TYPE          False
AFFILIATION               False
CLASSIFICATION            False
USE_CASE                  False
ORGANIZATION              False
STATUS                    False
INCOME_AMT                False
SPECIAL_CONSIDERATIONS    False
ASK_AMT                   False
IS_SUCCESSFUL             False
dtype: bool 

Duplicated rows: False


In [6]:
funding_df.nunique()

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8746
IS_SUCCESSFUL                2
dtype: int64

In [7]:
# identify rare application types for binning 
funding_df['APPLICATION_TYPE'].value_counts()

T3     27036
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64

In [8]:
funding_df["APPLICATION_TYPE"].value_counts() < 10


T3     False
T4     False
T6     False
T5     False
T19    False
T8     False
T7     False
T10    False
T9     False
T13    False
T12    False
T2     False
T25     True
T14     True
T29     True
T15     True
T17     True
Name: APPLICATION_TYPE, dtype: bool

In [9]:
def bin_outliers(df,col,ls):
    for x in ls:
        df[col] = df[col].replace(x, "other")
    return df

In [10]:
outliers = funding_df['APPLICATION_TYPE'].value_counts().loc[lambda x: x<200].index

bin_outliers(funding_df,'APPLICATION_TYPE',outliers)\
    .head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
5,T3,Independent,C1200,Preservation,Trust,1,0,N,5000,1


In [11]:
funding_df['APPLICATION_TYPE'].value_counts()

T3       27036
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
other      276
Name: APPLICATION_TYPE, dtype: int64

In [12]:
funding_df['CLASSIFICATION'].value_counts().head(10)

C1000    17326
C2000     6073
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
Name: CLASSIFICATION, dtype: int64

In [13]:
cls_outliers = funding_df['CLASSIFICATION'].value_counts().loc[lambda x: x<1000].index
cls_outliers

Index(['C7000', 'C1700', 'C4000', 'C5000', 'C1270', 'C2700', 'C2800', 'C7100',
       'C1300', 'C1280', 'C1230', 'C1400', 'C2300', 'C7200', 'C1240', 'C8000',
       'C7120', 'C1500', 'C6000', 'C1800', 'C1250', 'C8200', 'C1238', 'C1278',
       'C1237', 'C1235', 'C7210', 'C2400', 'C1720', 'C4100', 'C1257', 'C1600',
       'C1260', 'C2710', 'C0', 'C3200', 'C1234', 'C1246', 'C1267', 'C1256',
       'C2190', 'C4200', 'C2600', 'C5200', 'C1370', 'C1248', 'C6100', 'C1820',
       'C1900', 'C1236', 'C3700', 'C2570', 'C1580', 'C1245', 'C2500', 'C1570',
       'C1283', 'C2380', 'C1732', 'C1728', 'C2170', 'C4120', 'C8210', 'C2561',
       'C4500', 'C2150'],
      dtype='object')

In [14]:
bin_outliers(funding_df, 'CLASSIFICATION', cls_outliers)\
    .head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
5,T3,Independent,C1200,Preservation,Trust,1,0,N,5000,1


In [15]:
funding_df['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6073
C1200     4837
other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [16]:
funding_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34298 entries, 0 to 34298
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   APPLICATION_TYPE        34298 non-null  object
 1   AFFILIATION             34298 non-null  object
 2   CLASSIFICATION          34298 non-null  object
 3   USE_CASE                34298 non-null  object
 4   ORGANIZATION            34298 non-null  object
 5   STATUS                  34298 non-null  int64 
 6   INCOME_AMT              34298 non-null  object
 7   SPECIAL_CONSIDERATIONS  34298 non-null  object
 8   ASK_AMT                 34298 non-null  int64 
 9   IS_SUCCESSFUL           34298 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 2.9+ MB


In [17]:
ask_outliers = funding_df['ASK_AMT'].loc[lambda x: x>1.0e+07]

In [18]:
ask_outliers.count()

412

In [19]:
amt_high = funding_df[funding_df.ASK_AMT.isin(ask_outliers)]
amt_low = funding_df[~funding_df.ASK_AMT.isin(ask_outliers)]
amt_high.groupby('IS_SUCCESSFUL').count()
# amt_low.groupby('IS_SUCCESSFUL').count()

Unnamed: 0_level_0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT
IS_SUCCESSFUL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,242,242,242,242,242,242,242,242,242
1,170,170,170,170,170,170,170,170,170


In [20]:
funding_df['ASK_AMT'].value_counts()

5000        25398
63981           3
6725            3
15583           3
10478           3
            ...  
3899729         1
5371754         1
30060           1
43091152        1
36500179        1
Name: ASK_AMT, Length: 8746, dtype: int64

In [21]:
funding_df['ASK_AMT'].describe()

count    3.429800e+04
mean     2.769276e+06
std      8.713172e+07
min      5.000000e+03
25%      5.000000e+03
50%      5.000000e+03
75%      7.738500e+03
max      8.597806e+09
Name: ASK_AMT, dtype: float64

In [22]:
funding_encoded = pd.get_dummies(funding_df, 
                                 columns=['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION','USE_CASE', 'ORGANIZATION', 'INCOME_AMT', 'SPECIAL_CONSIDERATIONS'],
                                 drop_first=True)

In [23]:
funding_processed = funding_encoded.copy()

In [24]:
funding_processed

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,...,ORGANIZATION_Trust,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,5000,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,6692,1,0,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
4,1,142590,1,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
5,1,5000,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,1,5000,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34295,1,5000,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34296,1,5000,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34297,1,5000,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
funding_processed.describe()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,...,ORGANIZATION_Trust,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_Y
count,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0,...,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0,34298.0
mean,0.999854,2769276.0,0.532393,0.031051,0.788268,0.044959,0.0342,0.035454,0.021138,0.021488,...,0.685608,0.021197,0.015832,0.098373,0.006997,0.027844,0.109248,0.004053,0.005394,0.000787
std,0.012073,87131720.0,0.498957,0.173459,0.408542,0.207217,0.181746,0.184927,0.143847,0.145007,...,0.46428,0.144041,0.124826,0.297823,0.083359,0.164529,0.311955,0.063533,0.073246,0.028047
min,0.0,5000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,5000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,5000.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,7738.5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,8597806000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
funding_processed.shape 

(34298, 37)

In [27]:
funding_processed.to_csv('Resources/funding_processed.csv',index=False)

In [50]:
#split preprocessed data into features and target arrays
X = funding_processed.drop(columns=['IS_SUCCESSFUL','STATUS'])
#target array
y = funding_processed['IS_SUCCESSFUL'].values

In [35]:
def split_data(data, X, y, seed=5):
    """splits feature and target data into test and train sets and scales feature data

    Args:
        model (class): model class name
        X (array): array containing feature column values
        y (1D array): array containing target variable
        seed (int): random seed value; default=5
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled


In [36]:
X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled = split_data(df,X,y)

In [37]:
X_train.shape[1]

36

In [38]:
# define initial model 
n_input = len(X_train_scaled[1])

nn_model = Sequential()

# hidden layers 1 and 2
nn_model.add(Dense(units=17, input_dim=n_input, activation="relu"))
nn_model.add(Dense(units= 5, activation="relu"))

# output layer
nn_model.add(Dense(units=1, activation="sigmoid"))

# view model structure
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 17)                629       
                                                                 
 dense_1 (Dense)             (None, 5)                 90        
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 725
Trainable params: 725
Non-trainable params: 0
_________________________________________________________________


In [43]:
# compile model
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [52]:
# cp = callback that saves the model's weight every 5 epochs
file_path = 'base_model/callback_weight'
cp = tf.keras.callbacks.ModelCheckpoint(filepath=file_path, save_weights_only=True, save_freq=5)

# train the model and save the weights
nn_model.fit(X_train_scaled, y_train, epochs=100, callbacks=cp)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x165efd3c2b0>

In [53]:
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5654 - accuracy: 0.7279 - 235ms/epoch - 875us/step
Loss: 0.5654239058494568, Accuracy: 0.7279300093650818
