<a href="https://colab.research.google.com/github/SMKSmith/deep-learning-challenge/blob/main/AlphabetSoupCharity_Optimisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Preprocessing**

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 
# The CSV is fetched over HTTP each time this cell runs, so network access is required.
CHARITY_DATA_URL = "https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(CHARITY_DATA_URL)

# Preview the first rows.
application_df.head()

In [None]:
# Drop only the 'EIN' column. 'NAME' is deliberately kept: later cells bin its
# rare values and one-hot encode it as a feature.
app_df = application_df.drop(columns= ['EIN'])
app_df

In [None]:
# Count the distinct values in each column (missing values are not counted by
# default) to see which columns have many categories.
app_df.nunique()

In [None]:
# Frequency of each APPLICATION_TYPE value, most common first; used to decide
# which application types get binned together.
app_df["APPLICATION_TYPE"].value_counts()

In [None]:
# Application types to fold into a single catch-all bucket.
# (The starter template suggests the name `application_types_to_replace`;
# this notebook calls the list `app_to_replace`.)
app_to_replace = ['T9','T13','T12','T2','T25', 'T14', 'T29', 'T15', 'T17']

# Map every listed type to "Other_1" with one list-based replace.
app_df['APPLICATION_TYPE'] = app_df['APPLICATION_TYPE'].replace(app_to_replace, "Other_1")

# Confirm the listed types no longer appear under their own labels.
app_df['APPLICATION_TYPE'].value_counts()

In [None]:
# Tally how often each CLASSIFICATION value occurs; `app_count` is reused by
# the binning cell that follows.
app_count = app_df["CLASSIFICATION"].value_counts()

# Show only the classifications that occur more than 9 times.
app_count.loc[app_count.gt(9)]

In [None]:
# Classifications seen fewer than 100 times are folded into one bucket.
# `app_count` holds the CLASSIFICATION value counts computed in the previous cell.
# (The starter template suggests the name `classifications_to_replace`;
# this notebook calls it `class_replace`.)
class_replace = app_count[app_count<100].index

# Replace in dataframe.
# The replacement has to build on app_df's own column. The earlier loop re-read
# the column from application_df on every iteration, which discarded all previous
# replacements and left only the last rare classification mapped to "Other_2".
# NOTE: this reduces the number of distinct CLASSIFICATION values, and therefore
# the number of indicator columns produced later; any hard-coded feature count
# downstream has to match the new width.
app_df['CLASSIFICATION'] = app_df['CLASSIFICATION'].replace(list(class_replace), "Other_2")

# Check to make sure binning was successful
app_df['CLASSIFICATION'].value_counts()

In [None]:
# Frequency of each NAME value, most common first.
app_df["NAME"].value_counts()

In [None]:
# Count how often each NAME occurs; `app_df_NAME` is reused by the NAME
# binning cell that follows.
app_df_NAME = app_df["NAME"].value_counts()

# Show only the names that occur more than 400 times.
app_df_NAME[app_df_NAME > 400]

In [None]:
# Names seen fewer than 100 times are folded into one bucket.
# `app_df_NAME` holds the NAME value counts computed in the previous cell.
NAME_replace = app_df_NAME[app_df_NAME < 100].index

# Replace in dataframe.
# A single vectorised membership test replaces the earlier loop, which issued
# one full-column `.replace` call per rare name. The resulting column is the
# same, but it is produced in one pass.
app_df['NAME'] = app_df['NAME'].mask(app_df['NAME'].isin(NAME_replace), "Other_3")

# Show binning values
app_df['NAME'].value_counts()

In [None]:
# Convert data to numeric with `pd.get_dummies`: one indicator frame per
# categorical column.
dum_NAME = pd.get_dummies(app_df['NAME'])
dum_APPLICATION_TYPE = pd.get_dummies(app_df['APPLICATION_TYPE'])
dum_CLASSIFICATION = pd.get_dummies(app_df['CLASSIFICATION'])
dum_AFFILIATION = pd.get_dummies(app_df['AFFILIATION'])
dum_USE_CASE = pd.get_dummies(app_df['USE_CASE'])
dum_ORGANIZATION = pd.get_dummies(app_df['ORGANIZATION'])
dum_INCOME_AMT = pd.get_dummies(app_df['INCOME_AMT'])
dum_SPECIAL_CONSIDERATIONS = pd.get_dummies(app_df['SPECIAL_CONSIDERATIONS'])

# Append the indicator columns to the original frame.
# The frames above are named `dum_*`. The concat previously referenced
# `dummies_*` names that are never defined in this notebook, which raises
# NameError on a fresh kernel.
# NOTE(review): dum_SPECIAL_CONSIDERATIONS is computed but not appended; the same
# seven frames as before are concatenated. Confirm whether it should be a feature.
concat1 = pd.concat(
    [
        app_df,
        dum_NAME,
        dum_APPLICATION_TYPE,
        dum_CLASSIFICATION,
        dum_AFFILIATION,
        dum_USE_CASE,
        dum_ORGANIZATION,
        dum_INCOME_AMT,
    ],
    axis="columns",
)
concat1

In [None]:
# Remove the original categorical columns now that the indicator columns have
# been appended; every other column is kept.
concat_df = concat1.drop(
    ['NAME','APPLICATION_TYPE', 'CLASSIFICATION','AFFILIATION', 'USE_CASE', 'ORGANIZATION', 'INCOME_AMT',"SPECIAL_CONSIDERATIONS"],
    axis="columns",
)
concat_df

In [None]:
# Features are every column except IS_SUCCESSFUL; the target is IS_SUCCESSFUL itself.
X = concat_df.drop('IS_SUCCESSFUL', axis="columns")
y = concat_df.loc[:, 'IS_SUCCESSFUL']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, columns) of the training features; the column count is the width of
# the input the models below are trained on.
X_train.shape

In [None]:
# (rows, columns) of the held-out test features.
X_test.shape

In [None]:
# Fit the scaler on the training split only, so the scaling parameters are
# never influenced by the test rows.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# **Compile, Train and Evaluate the Model**

In [None]:
# Define the model - a deep neural net: the number of input features and the
# hidden nodes for each layer.

nn = tf.keras.models.Sequential()

# The input width is read from the scaled training matrix instead of being
# hard-coded, so the first layer always matches what preprocessing produced.
number_input_features = X_train_scaled.shape[1]

# First hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="relu", input_dim=number_input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=30, activation="tanh"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="sigmoid"))

# Fourth hidden layer
nn.add(tf.keras.layers.Dense(units=10, activation="relu"))

# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model: binary cross-entropy loss, the Adam optimizer, and
# accuracy reported as the metric during training and evaluation.
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model for 10 epochs on the scaled training data; the value
# returned by fit() is kept in `model_fit`.
model_fit = nn.fit(X_train_scaled, y_train, epochs=10)

In [None]:
# Evaluate the model using the test data.
# evaluate() returns the loss followed by the metrics named at compile time
# (here: accuracy), which is why the result unpacks into two values.
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Export our model to HDF5 file.
# The file name needs an HDF5 extension. The previous '.ipynb' target was not an
# HDF5 path and matched this notebook's own file name.
nn.save('AlphabetSoupCharity_Optimisation.h5')

# **Alternative Model: the Decision Tree Model**

In [None]:
 from sklearn import tree
 # Creating the decision tree classifier instance
# A fixed seed makes the fitted tree, and therefore the scores reported below,
# repeatable between runs; the random forest later in the notebook is seeded
# with the same value.
model = tree.DecisionTreeClassifier(random_state=78)

In [None]:
 # Fit the model
# fit() returns the estimator itself, so `model` is rebound to the same,
# now fitted, object.
model = model.fit(X_train_scaled, y_train)

In [None]:
 # Make predictions using testing data
# Predicted labels for the held-out test rows. Note that `predictions` is
# rebound later by the random forest section.
predictions = model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Overall accuracy: the fraction of test rows whose label was predicted correctly.
accu_score = accuracy_score(y_test, predictions)

# Confusion matrix wrapped in a labelled frame: rows are the true classes,
# columns the predicted ones.
cmatrix = confusion_matrix(y_test, predictions)
cmatrix_df = pd.DataFrame(
    data=cmatrix,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [None]:
# Displaying results for the predictions made just above: the labelled
# confusion matrix, the overall accuracy, and the per-class classification report.
# `display` renders the frame as a rich table rather than plain text.
print("Confusion Matrix")
display(cmatrix_df)
print(f"Accuracy Score : {accu_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

# **Alternative Model: the Random Forest Model**

In [None]:
 # Create a random forest classifier
from sklearn.ensemble import RandomForestClassifier
# An ensemble of 500 trees; the fixed random_state makes the fitted forest
# repeatable between runs.
rforest_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
 # Fitting the model
# fit() returns the estimator itself, so the name is rebound to the same,
# now fitted, object.
rforest_model = rforest_model.fit(X_train_scaled, y_train)

In [None]:
 # Make predictions using the testing data
# Rebinds `predictions`, replacing the decision tree's predictions from
# earlier in the notebook.
predictions = rforest_model.predict(X_test_scaled)

In [None]:
# Calculate the accuracy score (fraction of test rows classified correctly).
accu_score = accuracy_score(y_test, predictions)

# Calculate the confusion matrix and label its axes: rows are the true
# classes, columns the predicted ones.
cmatrix = confusion_matrix(y_test, predictions)
cmatrix_df = pd.DataFrame(
    data=cmatrix,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)

In [None]:
 # Displaying results
# Same three-part summary as for the decision tree, now computed from the
# random forest's predictions: confusion matrix, accuracy, per-class report.
print("Confusion Matrix")
display(cmatrix_df)
print(f"Accuracy Score : {accu_score}")
print("Classification Report")
print(classification_report(y_test, predictions))