In [1]:
# Import dependencies
import pandas as pd
import os
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

# Import SQLAlchemy
import sqlalchemy
from sqlalchemy import create_engine
from config_sm import db_password
from sqlalchemy.ext.automap import automap_base


# Machine Learning imports.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Database connection to PgAdmin & SQL, convert to dataframe

db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/City_Surf_Project"
engine = create_engine(db_string).connect()

students_df= pd.read_sql_table('machine_learning_data', con=engine)

In [3]:
students_df['distinct_program_count'].value_counts()

1     131
2      42
3       6
4       2
16      1
10      1
7       1
5       1
Name: distinct_program_count, dtype: int64

In [4]:
# Write a function that converts distinc_program_count to 1 program or more than 1 program 
# repeat or not repeat
students_df["distinct_program_count"] = students_df["distinct_program_count"].replace({1:"one-time", 2: "return", 3:"return", 4:"return",
                                               5:"return", 7: "return", 16:"return", 10:"return"})
students_df['distinct_program_count'].value_counts()
    

one-time    131
return       54
Name: distinct_program_count, dtype: int64

In [24]:
students_df.rename(columns={"distinct_program_count": "likely_to_return?"})

Unnamed: 0,likely to return?,home_zip_code,grade_level_fy18_19,grade_level_fy19_20,csp_approx_enrollment_age,gender_Declined/Not Stated,gender_Female,gender_Male,gender_Transgender,city_Daly City,...,csp_enrollment_date_2019-04-18 00:00:00,csp_enrollment_date_2019-04-19 00:00:00,csp_enrollment_date_2019-05-10 00:00:00,csp_enrollment_date_2019-05-14 00:00:00,csp_enrollment_date_2019-05-30 00:00:00,csp_enrollment_date_2019-05-31 00:00:00,csp_enrollment_date_2019-06-03 00:00:00,csp_enrollment_date_2019-07-29 00:00:00,csp_enrollment_date_2019-08-05 00:00:00,csp_enrollment_date_2019-08-06 00:00:00
0,return,94103,9,10,14.6,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,one-time,94116,10,11,15.5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,one-time,94110,10,11,17.2,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,one-time,94134,10,11,18.3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,return,94112,10,11,15.9,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,one-time,94112,9,10,14.9,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,return,94110,9,10,13.9,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,return,94112,9,10,14.6,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,one-time,94132,9,10,14.6,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,return,94122,9,10,14.6,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Convert Date of Birth and agency enrollment date to Datetime
students_df['csp_enrollment_date'] = pd.to_datetime(students_df['csp_enrollment_date'])
students_df.dtypes

participant_id                        int64
distinct_program_count               object
date_of_birth                        object
city                                 object
home_zip_code                         int64
race_ethnicity                       object
gender                               object
school_attending                     object
school_entry_date                    object
grade_level_fy18_19                   int64
grade_level_fy19_20                   int64
csp_enrollment_date          datetime64[ns]
csp_approx_enrollment_age           float64
dtype: object

In [6]:
# drop unneccesary columns 
students_df = students_df.drop(columns = ['participant_id', 'date_of_birth', "school_entry_date", "grade_level_fy18_19", "grade_level_fy19_20", ])

In [7]:
# # Create a function to change Gender to Numeric Values
# le = LabelEncoder()
# students_df['gender'] = le.fit_transform(students_df['gender'])


In [8]:
#Check to see which Encodings match the genders
# Gender Coding: 2 = Male, 1 = Female, 0 = Declined to State, 3 = Transgender
# students_df["gender"].value_counts()

In [9]:
# # Chose "gender, "city","home_zip_code","race_ethnicity","gender","school_attending",
# "school_entry_date","csp_enrollment_date","csp_approx_enrollment_age"
# # of classes attended because those are the inputs we want to be able to put in to see whether a student
# # would come to multiple City Surf Programs - this is a strong indicator of whether they enjoy
# the program

# # This would help enable the organization to see if they are missing certain populations, or have a tendency 
# # to specialize really well in a specific demographic or grade and help CSP improve outreach and give a breakdown of 
# # population served. It can show what populations CSP is missing, or which populations may like city surf better than others.

# Encode columns that are strings
encode_columns=["gender",
                "city",
                "race_ethnicity",
                "school_attending",
                "csp_enrollment_date"]

# Convert values to numeric
# source: https://stackoverflow.com/questions/30384995/randomforestclassfier-fit-valueerror-could-not-convert-string-to-float
students_df = pd.get_dummies(students_df,columns= encode_columns)
students_df.head()

Unnamed: 0,distinct_program_count,home_zip_code,grade_level_fy18_19,grade_level_fy19_20,csp_approx_enrollment_age,gender_Declined/Not Stated,gender_Female,gender_Male,gender_Transgender,city_Daly City,...,csp_enrollment_date_2019-04-18 00:00:00,csp_enrollment_date_2019-04-19 00:00:00,csp_enrollment_date_2019-05-10 00:00:00,csp_enrollment_date_2019-05-14 00:00:00,csp_enrollment_date_2019-05-30 00:00:00,csp_enrollment_date_2019-05-31 00:00:00,csp_enrollment_date_2019-06-03 00:00:00,csp_enrollment_date_2019-07-29 00:00:00,csp_enrollment_date_2019-08-05 00:00:00,csp_enrollment_date_2019-08-06 00:00:00
0,return,94103,9,10,14.6,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,one-time,94116,10,11,15.5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,one-time,94110,10,11,17.2,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,one-time,94134,10,11,18.3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,return,94112,10,11,15.9,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
students_df.shape

(185, 69)

In [11]:
# Create our target
y = students_df['likely_to_return?']

# Create our features
X = students_df.drop(columns="likely_to_return?" )

In [12]:

# Create X_train, X_test, y_train, y_test
# Split the X and y into X_train, X_test, y_train, y_test
# Use the train_test_split function to create training and testing subsets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(138, 68)
(138,)
(47, 68)
(47,)


# Random Forest Classifier 


In [13]:
# want to see which variable has the most influence
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)



In [14]:
# Fitting the model
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [15]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array(['one-time', 'return', 'one-time', 'one-time', 'one-time', 'return',
       'one-time', 'one-time', 'return', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'one-time', 'one-time',
       'one-time', 'return', 'one-time', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'return', 'one-time',
       'one-time', 'return', 'one-time', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'return', 'one-time',
       'one-time'], dtype=object)

In [16]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,32,1
Actual 1,8,6


In [17]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [18]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,32,1
Actual 1,8,6


Accuracy Score : 0.8085106382978723
Classification Report
              precision    recall  f1-score   support

    one-time       0.80      0.97      0.88        33
      return       0.86      0.43      0.57        14

    accuracy                           0.81        47
   macro avg       0.83      0.70      0.72        47
weighted avg       0.82      0.81      0.79        47



In [19]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([1.28119951e-01, 5.20294299e-02, 5.00118117e-02, 1.41014482e-01,
       4.28177612e-03, 2.88096268e-02, 2.71567544e-02, 7.94372856e-04,
       3.38064365e-03, 4.05693642e-04, 5.45438121e-03, 1.24748566e-02,
       4.51231552e-02, 4.26584990e-02, 1.20728542e-02, 2.22517380e-02,
       5.02914379e-03, 4.90495526e-03, 8.61105949e-03, 4.69154744e-04,
       3.28268321e-04, 0.00000000e+00, 1.23395027e-04, 1.19039219e-02,
       2.02515757e-04, 1.19201258e-05, 4.31754732e-03, 0.00000000e+00,
       1.60735646e-04, 3.63147074e-03, 1.18234660e-03, 3.98760586e-03,
       2.14950308e-02, 8.27515295e-03, 6.80417927e-04, 1.91567566e-02,
       8.51870938e-04, 4.93045688e-04, 3.56452671e-04, 6.63147531e-04,
       4.86361694e-03, 1.92129053e-02, 9.25326898e-03, 1.77393421e-02,
       1.59085125e-02, 9.14318827e-03, 8.65810000e-03, 7.58532153e-03,
       9.49803721e-03, 3.08832348e-02, 0.00000000e+00, 8.69179534e-02,
       2.42940519e-02, 2.89460897e-03, 1.51777918e-02, 5.98563498e-03,
      

In [20]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1410144816800065, 'csp_approx_enrollment_age'),
 (0.12811995144603483, 'home_zip_code'),
 (0.08691795342016824, 'csp_enrollment_date_2019-02-11 00:00:00'),
 (0.052029429907611814, 'grade_level_fy18_19'),
 (0.05001181173852448, 'grade_level_fy19_20'),
 (0.04512315517451447, 'race_ethnicity_Asian/Pacific Islander'),
 (0.042658499039852874, 'race_ethnicity_Caucasian/ White'),
 (0.030883234762617756, 'csp_enrollment_date_2018-12-14 00:00:00'),
 (0.028809626802201404, 'gender_Female'),
 (0.02715675442387752, 'gender_Male'),
 (0.02429405191862435, 'csp_enrollment_date_2019-02-13 00:00:00'),
 (0.02225173803892497, 'race_ethnicity_Hispanic/Latinx'),
 (0.021495030768539317, 'school_attending_Lowell HS'),
 (0.020936361356619745, 'csp_enrollment_date_2019-08-06 00:00:00'),
 (0.01921290532098389, 'csp_enrollment_date_2018-10-02 00:00:00'),
 (0.019156756596134028, 'school_attending_Mission HS'),
 (0.017739342062375207, 'csp_enrollment_date_2018-10-04 00:00:00'),
 (0.01590851248062795, 'csp_enro