In [1]:
# Import dependencies
import pandas as pd
import os
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

# Import SQLAlchemy
import sqlalchemy
from sqlalchemy import create_engine
from config_sm import db_password
from sqlalchemy.ext.automap import automap_base


# Machine Learning imports.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Database connection to PgAdmin & SQL, convert to dataframe

db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/City_Surf_Project"
engine = create_engine(db_string).connect()

students_df= pd.read_sql_table('machine_learning_data', con=engine)

In [3]:
students_df['distinct_program_count'].value_counts()

1     131
2      42
3       6
4       2
16      1
10      1
7       1
5       1
Name: distinct_program_count, dtype: int64

In [4]:
# Write a function that converts distinc_program_count to 1 program or more than 1 program 
# repeat or not repeat
students_df["distinct_program_count"] = students_df["distinct_program_count"].replace({1:"one-time", 2: "return", 3:"return", 4:"return",
                                               5:"return", 7: "return", 16:"return", 10:"return"})
students_df['distinct_program_count'].value_counts()
    

one-time    131
return       54
Name: distinct_program_count, dtype: int64

In [5]:
# drop unneccesary columns 
students_df = students_df.drop(columns = ['participant_id', 'date_of_birth',"school_entry_date", "grade_level_fy18_19", "grade_level_fy19_20", ])

In [6]:
students_df = students_df.rename(columns={"distinct_program_count": "likely_to_return?"})

In [7]:
# # Create a function to change Gender to Numeric Values
# le = LabelEncoder()
# students_df['gender'] = le.fit_transform(students_df['gender'])
students_df

Unnamed: 0,likely_to_return?,city,home_zip_code,race_ethnicity,gender,school_attending,csp_enrollment_date,csp_approx_enrollment_age
0,return,San Francisco,94103,Declined to state/Other,Male,Mission HS,11/29/2018,14.6
1,one-time,San Francisco,94116,Caucasian/ White,Female,Mission HS,10/2/2018,15.5
2,one-time,San Francisco,94110,Asian/Pacific Islander,Male,Mission HS,11/15/2018,17.2
3,one-time,San Francisco,94134,Asian/Pacific Islander,Male,Mission HS,10/2/2018,18.3
4,return,San Francisco,94112,Hispanic/Latinx,Female,Mission HS,2/14/2019,15.9
5,one-time,San Francisco,94112,African American/Other Black,Male,Lincoln (Abraham) HS,11/16/2018,14.9
6,return,San Francisco,94110,Hispanic/Latinx,Female,Lowell HS,10/2/2018,13.9
7,return,San Francisco,94112,Hispanic/Latinx,Male,Mission HS,2/11/2019,14.6
8,one-time,San Francisco,94132,Hispanic/Latinx,Male,Lincoln (Abraham) HS,2/14/2019,14.6
9,return,San Francisco,94122,Asian/Pacific Islander,Male,Lowell HS,10/2/2018,14.6


In [8]:
#Check to see which Encodings match the genders
# Gender Coding: 2 = Male, 1 = Female, 0 = Declined to State, 3 = Transgender
# students_df["gender"].value_counts()

In [9]:
# # Chose "gender, "city","home_zip_code","race_ethnicity","gender","school_attending",
# "school_entry_date","csp_enrollment_date","csp_approx_enrollment_age"
# # of classes attended because those are the inputs we want to be able to put in to see whether a student
# # would come to multiple City Surf Programs - this is a strong indicator of whether they enjoy
# the program

# # This would help enable the organization to see if they are missing certain populations, or have a tendency 
# # to specialize really well in a specific demographic or grade and help CSP improve outreach and give a breakdown of 
# # population served. It can show what populations CSP is missing, or which populations may like city surf better than others.

# Encode columns that are strings
encode_columns=["gender",
                "city",
                "race_ethnicity",
                "school_attending",
                "csp_enrollment_date"]

# Convert values to numeric
# source: https://stackoverflow.com/questions/30384995/randomforestclassfier-fit-valueerror-could-not-convert-string-to-float
students_df = pd.get_dummies(students_df,columns= encode_columns)
students_df.head()

Unnamed: 0,likely_to_return?,home_zip_code,csp_approx_enrollment_age,gender_Declined/Not Stated,gender_Female,gender_Male,gender_Transgender,city_Daly City,city_Richmond,city_San Francisco,...,csp_enrollment_date_4/19/2019,csp_enrollment_date_4/5/2019,csp_enrollment_date_5/10/2019,csp_enrollment_date_5/14/2019,csp_enrollment_date_5/30/2019,csp_enrollment_date_5/31/2019,csp_enrollment_date_6/3/2019,csp_enrollment_date_7/29/2019,csp_enrollment_date_8/5/2019,csp_enrollment_date_8/6/2019
0,return,94103,14.6,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,one-time,94116,15.5,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,one-time,94110,17.2,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,one-time,94134,18.3,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,return,94112,15.9,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
students_df.shape

(185, 67)

In [11]:
# Create our target
y = students_df['likely_to_return?']

# Create our features
X = students_df.drop(columns="likely_to_return?" )

In [12]:

# Create X_train, X_test, y_train, y_test
# Split the X and y into X_train, X_test, y_train, y_test
# Use the train_test_split function to create training and testing subsets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(138, 66)
(138,)
(47, 66)
(47,)


# Random Forest Classifier 


In [13]:
# want to see which variable has the most influence
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)



In [14]:
# Fitting the model
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
rf_model = rf_model.fit(X_train_scaled, y_train)

In [15]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array(['one-time', 'one-time', 'one-time', 'one-time', 'one-time',
       'return', 'one-time', 'one-time', 'return', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'one-time', 'one-time',
       'one-time', 'return', 'one-time', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'return', 'one-time',
       'one-time', 'return', 'one-time', 'one-time', 'one-time',
       'one-time', 'one-time', 'one-time', 'one-time', 'one-time',
       'one-time'], dtype=object)

In [16]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,32,1
Actual 1,10,4


In [17]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [18]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,32,1
Actual 1,10,4


Accuracy Score : 0.7659574468085106
Classification Report
              precision    recall  f1-score   support

    one-time       0.76      0.97      0.85        33
      return       0.80      0.29      0.42        14

    accuracy                           0.77        47
   macro avg       0.78      0.63      0.64        47
weighted avg       0.77      0.77      0.72        47



In [19]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([1.29181012e-01, 1.90517489e-01, 4.29642555e-03, 2.62066182e-02,
       2.34502219e-02, 5.65336342e-04, 4.04937833e-03, 7.12336057e-04,
       4.19872693e-03, 1.75111220e-02, 4.34362353e-02, 4.46285065e-02,
       1.44747501e-02, 2.20293502e-02, 5.29706126e-03, 5.96746266e-03,
       1.73952216e-02, 9.08035028e-05, 1.08077335e-03, 0.00000000e+00,
       3.81426565e-04, 1.47470222e-02, 1.26613189e-03, 1.16267352e-05,
       9.48588415e-03, 0.00000000e+00, 3.39441755e-05, 2.46779595e-03,
       1.97723173e-03, 3.92935188e-03, 2.23591558e-02, 1.26964910e-02,
       4.84172334e-04, 2.68010076e-02, 2.62656010e-03, 4.04469948e-04,
       1.20060897e-03, 1.35075623e-03, 5.79420092e-03, 0.00000000e+00,
       1.82871776e-02, 2.73914038e-02, 1.30372218e-02, 1.17395296e-02,
       1.03221546e-02, 7.03837694e-03, 8.98881299e-03, 1.04073000e-02,
       3.64702891e-02, 7.72407853e-02, 1.90634832e-02, 3.29138769e-03,
       1.86939038e-02, 1.27393680e-02, 8.01572370e-03, 1.29322031e-03,
      

In [20]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.19051748911933833, 'csp_approx_enrollment_age'),
 (0.12918101219666522, 'home_zip_code'),
 (0.07724078531238857, 'csp_enrollment_date_2/11/2019'),
 (0.04462850645130389, 'race_ethnicity_Caucasian/ White'),
 (0.04343623530846559, 'race_ethnicity_Asian/Pacific Islander'),
 (0.03647028907367291, 'csp_enrollment_date_12/14/2018'),
 (0.02739140377703771, 'csp_enrollment_date_10/2/2018'),
 (0.027233496370256075, 'csp_enrollment_date_8/6/2019'),
 (0.02680100758127337, 'school_attending_Mission HS'),
 (0.02620661817285844, 'gender_Female'),
 (0.023450221889798742, 'gender_Male'),
 (0.02235915581226248, 'school_attending_Lowell HS'),
 (0.022029350212143854, 'race_ethnicity_Hispanic/Latinx'),
 (0.019063483237736017, 'csp_enrollment_date_2/13/2019'),
 (0.018693903820022666, 'csp_enrollment_date_2/15/2019'),
 (0.018287177619790207, 'csp_enrollment_date_10/10/2018'),
 (0.017511122021729494, 'race_ethnicity_African American/Other Black'),
 (0.017395221585800265, 'school_attending_Aptos MS'),
 (0

## Originally Used Logistic Regression 

#### Found that Logistic Regression's accuracy score was not nearly as high as Random Forest Classifiers so we decided to use Random Forest Classifiers as our official Machine Learning Model

In [21]:
# Database connection to PgAdmin & SQL, convert to dataframe

db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/City_Surf_Project"
engine = create_engine(db_string).connect()

student_df= pd.read_sql_table('machine_learning_data', con=engine)

In [22]:
# Write a function that converts distinc_program_count to 1 program or more than 1 program 
# repeat or not repeat
student_df["distinct_program_count"] = student_df["distinct_program_count"].replace({1:"one-time", 2: "return", 3:"return", 4:"return",
                                               5:"return", 7: "return", 16:"return", 10:"return"})
student_df['distinct_program_count'].value_counts()
    

one-time    131
return       54
Name: distinct_program_count, dtype: int64

In [23]:
# drop unneccesary columns 
student_df = student_df.drop(columns = ['participant_id', 'date_of_birth', "school_entry_date", "grade_level_fy18_19", "grade_level_fy19_20", ])

In [24]:
students_df = student_df.rename(columns={"distinct_program_count": "likely_to_return?"})

In [25]:
# # Chose "gender, "city","home_zip_code","race_ethnicity","gender","school_attending",
# "school_entry_date","csp_enrollment_date","csp_approx_enrollment_age"
# # of classes attended because those are the inputs we want to be able to put in to see whether a student
# # would come to multiple City Surf Programs - this is a strong indicator of whether they enjoy
# the program

# # This would help enable the organization to see if they are missing certain populations, or have a tendency 
# # to specialize really well in a specific demographic or grade and help CSP improve outreach and give a breakdown of 
# # population served. It can show what populations CSP is missing, or which populations may like city surf better than others.

# Encode columns that are strings
encode_columns=["gender",
                "city",
                "race_ethnicity",
                "school_attending",
                "csp_enrollment_date"]

# Convert values to numeric
# source: https://stackoverflow.com/questions/30384995/randomforestclassfier-fit-valueerror-could-not-convert-string-to-float
student_df = pd.get_dummies(students_df,columns= encode_columns)
student_df.head()

Unnamed: 0,likely_to_return?,home_zip_code,csp_approx_enrollment_age,gender_Declined/Not Stated,gender_Female,gender_Male,gender_Transgender,city_Daly City,city_Richmond,city_San Francisco,...,csp_enrollment_date_4/19/2019,csp_enrollment_date_4/5/2019,csp_enrollment_date_5/10/2019,csp_enrollment_date_5/14/2019,csp_enrollment_date_5/30/2019,csp_enrollment_date_5/31/2019,csp_enrollment_date_6/3/2019,csp_enrollment_date_7/29/2019,csp_enrollment_date_8/5/2019,csp_enrollment_date_8/6/2019
0,return,94103,14.6,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,one-time,94116,15.5,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,one-time,94110,17.2,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,one-time,94134,18.3,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,return,94112,15.9,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
student_df.shape

(185, 67)

In [27]:
# Create our target
y = student_df['likely_to_return?']

# Create our features
X = student_df.drop(columns="likely_to_return?" )

In [28]:

# Create X_train, X_test, y_train, y_test
# Split the X and y into X_train, X_test, y_train, y_test
# Use the train_test_split function to create training and testing subsets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(138, 66)
(138,)
(47, 66)
(47,)


In [29]:
# Split the X and y into X_train, X_test, y_train, y_test
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [31]:
# Calculated the balanced accuracy score

y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))


0.7021276595744681


In [32]:
# Display the confusion matrix

cm_df = confusion_matrix(y_test,y_pred)

In [33]:
# Print the classification report
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


array([[33,  0],
       [14,  0]])

Accuracy Score : 0.7659574468085106
Classification Report
              precision    recall  f1-score   support

    one-time       0.70      1.00      0.82        33
      return       0.00      0.00      0.00        14

    accuracy                           0.70        47
   macro avg       0.35      0.50      0.41        47
weighted avg       0.49      0.70      0.58        47



  _warn_prf(average, modifier, msg_start, len(result))
