In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the charity dataset from the Resources folder (path is relative to this notebook).
file_path = Path("../Resources/charity_data.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
# Bare last expression so the notebook renders the loaded frame as the cell output.
df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [3]:
# Drop the columns that are not used as model features:
#   - 'EIN' and 'NAME' are the non-beneficial ID columns.
#   - 'ORGANIZATION' and 'SPECIAL_CONSIDERATIONS' are removed from the feature set as well.
# Reassigning the result (instead of `inplace=True`) together with `errors="ignore"`
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
df = df.drop(
    columns=["EIN", "NAME", "ORGANIZATION", "SPECIAL_CONSIDERATIONS"],
    errors="ignore",
)
df.head(5)

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,STATUS,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,1,0,5000,1
1,T3,Independent,C2000,Preservation,1,1-9999,108590,1
2,T5,CompanySponsored,C3000,ProductDev,1,0,5000,0
3,T3,CompanySponsored,C2000,Preservation,1,10000-24999,6692,1
4,T3,Independent,C1000,Heathcare,1,100000-499999,142590,1


In [4]:
# One-hot encode the categorical columns with `pd.get_dummies`;
# columns that are already numeric are carried over unchanged.
df_numeric = pd.get_dummies(data=df)
# Preview the first five encoded rows.
df_numeric.head(n=5)

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T10,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,...,USE_CASE_ProductDev,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
0,1,5000,1,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1,1,108590,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,5000,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
3,1,6692,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,142590,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [5]:
# Separate the label from the features, then split into X_train, X_test, y_train, y_test.
target_column = "IS_SUCCESSFUL"
y = df_numeric[target_column]
X = df_numeric.drop(columns=[target_column])

# A fixed random_state keeps the train/test partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
# Standardize the features.
# The scaler is fitted on the training split only and then reused, unchanged,
# to transform the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Logistic Regression model and print the model score

In [7]:
# Create a Logistic Regression model using scikit-learn's default hyperparameters.
classifier = LogisticRegression()
# Bare last expression: the notebook shows the estimator's repr as the cell output.
classifier

LogisticRegression()

In [8]:
# Fit (train) the logistic regression model on the standardized training features
# and their corresponding training labels.
classifier.fit(X_train_scaled, y_train)

LogisticRegression()

In [9]:
# Evaluate the logistic regression model on both splits.
# `score` is called once per split and the results are reported below.
lr_train_score = classifier.score(X_train_scaled, y_train)
lr_test_score = classifier.score(X_test_scaled, y_test)

print("Logistic Regression model")
print(f"Training Data Score: {lr_train_score}")
print(f"Testing Data Score: {lr_test_score}")

Logistic Regression model
Training Data Score: 0.7237210387187063
Testing Data Score: 0.7220991253644314


# Train a Random Forest Classifier model and print the model score

In [10]:
# Create a Random Forest Classifier model.
# Random forest training is stochastic, so without a fixed `random_state` the
# training/testing scores change on every run. Seeding the estimator makes the
# notebook reproducible under Restart & Run All; 42 matches the seed already
# passed to `train_test_split`.
clf = RandomForestClassifier(random_state=42)
# Bare last expression: the notebook shows the estimator's repr as the cell output.
clf

RandomForestClassifier()

In [11]:
# Fit (train) the random forest on the same standardized training features
# and training labels that the logistic regression model was fitted on.
clf.fit(X_train_scaled, y_train)

RandomForestClassifier()

In [12]:
# Evaluate the random forest on both splits.
# `score` is called once per split and the results are reported below.
rf_train_score = clf.score(X_train_scaled, y_train)
rf_test_score = clf.score(X_test_scaled, y_test)

print("Random Forest Classifier Model")
print(f"Training Data Score: {rf_train_score}")
print(f"Testing Data Score: {rf_test_score}")

Random Forest Classifier Model
Training Data Score: 0.8118488570984295
Testing Data Score: 0.7128862973760933
