## Preprocessing

In [None]:
# Import our dependencies
import pickle

import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the charity dataset straight from its hosted CSV.
import pandas as pd

CHARITY_DATA_URL = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(CHARITY_DATA_URL)

# Preview the first rows as a quick sanity check on the load.
application_df.head()

In [None]:
# 'EIN' and 'NAME' are ID columns that are not beneficial to the model, so remove both.
application_df = application_df.drop(columns=['EIN', 'NAME'])
application_df.head()

In [None]:
# Count the distinct values held by every column.
unique_values = application_df.nunique()

# Show the per-column distinct-value counts.
print(unique_values)

In [None]:
# Tally how often each APPLICATION_TYPE occurs; these counts drive the binning step.
application_type_values = application_df["APPLICATION_TYPE"].value_counts()

# Show the frequency of every application type.
print(application_type_values)

In [None]:
# Application types seen fewer than 500 times are pooled into a single bucket.
# use the variable name `application_types_to_replace`
application_types_to_replace = application_type_values[application_type_values < 500].index.tolist()

# Relabel every rare type as "Other" in one vectorised pass instead of one replace per value.
is_rare_type = application_df['APPLICATION_TYPE'].isin(application_types_to_replace)
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].mask(is_rare_type, "Other")

# Confirm that the rare types collapsed into "Other".
application_df['APPLICATION_TYPE'].value_counts()

In [None]:
# Tally how often each CLASSIFICATION occurs; these counts drive the binning step.
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Show the frequency of every classification.
print(classification_counts)

In [None]:
# You may find it helpful to look at CLASSIFICATION value counts >1
# Compute the counts once, then keep only classifications that occur more than once.
classification_counts_greater_than_1 = (
    application_df['CLASSIFICATION']
    .value_counts()
    .loc[lambda counts: counts > 1]
)
print(classification_counts_greater_than_1)

In [None]:
# Classifications seen fewer than 1000 times are pooled into a single bucket.
# use the variable name `classifications_to_replace`
classifications_to_replace = classification_counts[classification_counts < 1000].index.tolist()

# Relabel every rare classification as "Other" in one vectorised pass.
is_rare_classification = application_df['CLASSIFICATION'].isin(classifications_to_replace)
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].mask(is_rare_classification, "Other")

# Confirm that the rare classifications collapsed into "Other".
application_df['CLASSIFICATION'].value_counts()

In [None]:
# One-hot encode the categorical columns with `pd.get_dummies`.
# Columns to expand into indicator columns:
categorical_cols = [
    'APPLICATION_TYPE',
    'AFFILIATION',
    'CLASSIFICATION',
    'USE_CASE',
    'ORGANIZATION',
    'INCOME_AMT',
    'SPECIAL_CONSIDERATIONS',
]

# Replace each listed column with its dummy columns; all other columns pass through unchanged.
application_df = pd.get_dummies(application_df, columns=categorical_cols)
application_df.head()

In [None]:
# Separate the preprocessed frame into a target vector and a feature matrix.

# Target: the IS_SUCCESSFUL column.
target = application_df['IS_SUCCESSFUL'].values

# Features: every remaining column.
features = application_df.drop(columns='IS_SUCCESSFUL').values

# Hold out a test set; the fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=42,
)

In [None]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Learn the scaling parameters from the training split only
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaler to both splits
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

## Compile, Train and Evaluate the Model

In [None]:
# Instantiate the SVM model
svm_model = SVC(kernel='rbf')  # 'rbf' kernel is a common choice; you can try others too

# Train the model on the standardised training features
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test features transformed by the same scaler.
# The model was fitted on scaled inputs, so feeding it the raw X_test would
# put the test points on a different scale and make the accuracy meaningless.
predictions = svm_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf']}

# Instantiate the model
svm_model = SVC()

# Use GridSearchCV to find the best parameters.
# The search runs on the standardised features: the RBF kernel is distance
# based, so the gamma values in the grid are only comparable across features
# once every feature is on the same scale.
grid = GridSearchCV(svm_model, param_grid, refit=True, verbose=3)
grid.fit(X_train_scaled, y_train)

# Get the best parameters and predict with the optimized model.
# The test features must go through the same scaling the model was fitted on.
best_params = grid.best_params_
optimized_model = grid.best_estimator_
predictions = optimized_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions)

print(f"Model Accuracy after optimization: {accuracy}")
print(f"Best Parameters: {best_params}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.690 total time=  36.8s


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Visualize hyperparameter performance: mean CV score for every (C, gamma) pair
results = pd.DataFrame(grid.cv_results_)
results = results.pivot_table(index='param_C', columns='param_gamma', values='mean_test_score')

# `plt` is not imported anywhere in the visible cells, so the figure is sized
# and titled through the Axes object that seaborn returns rather than through
# pyplot. The notebook's inline backend renders the figure when the cell ends.
heatmap_ax = sns.heatmap(results, annot=True, cmap='viridis')
heatmap_ax.figure.set_size_inches(10, 6)
heatmap_ax.set_title('Grid Search Mean Test Scores')

# Narrow down the parameter range
param_grid_narrow = {'C': [5, 10, 15],
                     'gamma': [1, 0.1, 0.01, 0.001],
                     'kernel': ['rbf']}

# Re-run GridSearchCV with the narrowed parameter range.
# Fit on the standardised features so the RBF kernel sees every feature on
# the same scale.
grid_narrow = GridSearchCV(svm_model, param_grid_narrow, refit=True, verbose=3)
grid_narrow.fit(X_train_scaled, y_train)

# Get the best parameters and predict with the optimized model.
# The test features must go through the same scaling the model was fitted on.
best_params_narrow = grid_narrow.best_params_
optimized_model_narrow = grid_narrow.best_estimator_
predictions_narrow = optimized_model_narrow.predict(X_test_scaled)
accuracy_narrow = accuracy_score(y_test, predictions_narrow)

# Print the results
print(f"Model Accuracy after further optimization: {accuracy_narrow}")
print(f"Best Parameters after further optimization: {best_params_narrow}")

# Confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, predictions_narrow)
classification_rep = classification_report(y_test, predictions_narrow)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

In [None]:
# Evaluate the tuned model using the test data.
# `accuracy` is a plain float and SVC has no Keras-style `evaluate()`, so the
# metrics are derived from the test-set predictions the tuned model already
# produced. The reported loss is the zero-one loss (misclassification rate).
model_accuracy = accuracy_score(y_test, predictions_narrow)
model_loss = 1 - model_accuracy
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Export the tuned model to disk.
# scikit-learn estimators have no `.save()` method (that is the Keras API), and
# `svm_model` is only the unfitted template handed to GridSearchCV, so the
# fitted best estimator is serialised with pickle instead. The file extension
# is `.pkl` because the payload is a pickle, not HDF5.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
with open("AlphabetSoupCharity_Optimized.pkl", "wb") as model_file:
    pickle.dump(optimized_model_narrow, model_file)