In [1]:
# Initially importing ALL potential libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the student_enrolled_demographics CSV into a DataFrame.
# NOTE: WILL need to drop student names from the Advanced Report!

df = pd.read_csv('../data/student_enrolled_demographics.csv')

In [3]:
# Preview the loaded frame (pandas truncates the middle rows and columns in the display).
df

Unnamed: 0.1,Unnamed: 0,Camper ID,Age,2021 > Grade,Gender,School,Years at camp,Sibling count,Lead source,Primary family city,...,2021 > Unenrollment date,2021 > Is your child attending with a friend?,2021 ^Student Eval,2021 ^Expectations,2021 ^Trip Length,2021 ^^Interested in Another Trip?,2021 ^^^Future Trip?,2021 ^^^^Leader Rating of Student,2022 > Status,"2022 > trip code (ex. CAL 1, CAL 1A, CAL 1B, etc.) names (all)"
0,1,9522,19,11th,Male,Memphis University School,1,0,Friend,Memphis,...,,Yes,,,,,,Off the Charts (Highest),Cancelled,
1,2,9542,15,8th,Male,Charlotte Country Day School,2,1,Friend,Charlotte,...,,Yes,5,Met expectations,Too Short,Yes,,3,Enrolled,DOL 1B
2,3,3606,17,10th,Female,Collegiate School,2,0,Friend,Richmond,...,,Yes,5,Exceeded expectations,Just Right,Yes,"HWI, THA, FIJ",4,Alumni,
3,4,5791,19,12th,Male,Charlotte Latin School,2,0,Word of Mouth,Charlotte,...,,No,5,Met expectations,Just Right,No,,5,Alumni,
4,5,6847,18,11th,Female,Academy of Richmond County,1,1,Friend,Augusta,...,,Yes,5,Met expectations,Just Right,Not Sure,"THA, KEZ",Off the Charts (Highest),Alumni,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1727,1728,8481,15,8th,Female,Battle Ground Academy,3,0,Friend,Franklin,...,11-03-2021,No,5,Exceeded expectations,Just Right,Yes,FIJ,Off the Charts (Highest),Enrolled,BCR 2A
1728,1729,6484,18,10th,Female,Providence Day School,2,0,Friend,Charlotte,...,,No,5,Exceeded expectations,Just Right,Yes,KIL,3,Waitlist Only,
1729,1730,8723,16,8th,Female,Charlotte Country Day School,2,0,Friend,Charlotte,...,,Yes,5,Met expectations,Just Right,Not Sure,,5,Enrolled,CRO 2B
1730,1731,9403,18,10th,Female,Marin Academy,1,0,Friend,Belvedere Tiburon,...,,No,5,Met expectations,Just Right,Not Sure,KEZ,4,Alumni,


In [4]:
# Summarize each column's name, non-null count, and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1732 entries, 0 to 1731
Data columns (total 30 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   Unnamed: 0                                                      1732 non-null   int64  
 1   Camper ID                                                       1732 non-null   int64  
 2   Age                                                             1732 non-null   int64  
 3   2021 > Grade                                                    1732 non-null   object 
 4   Gender                                                          1732 non-null   object 
 5   School                                                          1732 non-null   object 
 6   Years at camp                                                   1732 non-null   int64  
 7   Sibling count                                      

In [5]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0                                                           0
Camper ID                                                            0
Age                                                                  0
2021 > Grade                                                         0
Gender                                                               0
School                                                               0
Years at camp                                                        0
Sibling count                                                        0
Lead source                                                          3
Primary family city                                                  0
Primary family state                                                 2
Primary family country                                            1523
Primary family zip                                                   0
Do you have a copy of this year’s catalog?                        1668
2021 >

#### Notes on ALL Columns:
- Camper ID: Unique student identifier (eliminates the names of all students)
- Age: student age for the year at camp being analyzed.
- Current grade: student's grade the year being analyzed
- Gender: binary column with values that actually represent the student's sex
- School: this column contains the name of the student's school, this information is relatively granular and may not be very impactful as a feature.
- Years at camp: if the value is 1, then this is the student's first year traveling with Moondance, and so on up to 6 meaning this is the student's 6th summer!
- Sibling count: [DROP] This count does not represent the same information as the '2021 > Enrolled sibling count' and it is likely going to be dropped. 
- Lead source:[IMPUTE] Contains information about how a student was introduced to Moondance with various options such as 'Friend' or 'Website'. Impute the 3 nulls to be 'No Source'.
- Primary family address 1: [DROP] Details the student's street address. Drop this column - too granular and too personal. **Maybe drop this when dropping the student first and last name**
- Primary family city: Details the city where the student/family lives. MAY be able to use this later for project goal #3
- Primary family country: [IMPUTE] Details the student's country where they currently reside. Impute all the nulls to be 'United States'
- Primary family state: [MANUAL CLEAN] Details the student's home state. Two values are missing: one student is from Atlanta, meaning they are likely from Georgia; the other is an international student who doesn't have a state. Maybe make an "INTL" state option.
- Primary family zip: To be used for project goal #3
- Do you have a copy of this year’s catalog?: [DROP] Initially suspected this column would show how widely the marketing materials (specifically the catalog) are distributed and whether there appears to be a correlation between enrollment and catalog possession. There are too many null values for this to be a useful feature.
- trip code & - trip + section name (all): [DROP - 'test' student, as null] Details the trip code and section the student was assigned/attended. There appears to be a 'test' student included in the data.
- Enrolled sibling count [IMPUTE nulls to be zeros, OneHotEncode values (0, 1, 2)]: Details if a student has a family member also traveling with Moondance this summer. 
- 2021 > Status: [DROP] ALL should be 'Enrolled'
- Unenrollment date: [CLEAN] If the student changed trips there will be a value in this column. Can be mapped to be a boolean value rather than the date.
- 2021 > First enrollment request date: Contains the first date the student attempted to enroll. This does not indicate whether or not the student was actually placed on a trip this day. IT MAY BE INTERESTING to engineer a feature that measures the number of days between the first enrollment request/attempt and the actual enrollment day, to see if there's a correlation between the length of time it took for the child to get on a trip and their likelihood to try and come back.
- 2021 > Is your child attending with a friend?: [CLEAN] A Yes/No question that details whether or not a child will be travelling with a peer. To be cast as boolean values!                   
- ^Student Eval: [CLEAN - the "Off the Charts" distinction should be a '6' ENCODE the values are ordinal] This column details the rating the students gave their trip 1 - 6 ("Off the Charts"). 110 values appear to be missing. May need to impute these by filling in the average OR 'Missing Info'.
- ^Expectations: This feature contains values such as 'Met expectations', 'Exceeded expectations' and 'Did not meet expectations'. May be treated as an ordinal value to be encoded? Could cast these values as numbers also, for ease of working (1, 2, 3)
- ^Trip Length: Similar to the '^Expectations' feature, this column contains values 'Too Short', 'Just Right', 'Too Long'. May be treated as an ordinal value to be encoded? Could cast these values as numbers also, for ease of working (1, 2, 3)
- ^^Interested in Another Trip?: Options = 'Yes', 'Not Sure', 'No'
- ^^^Future Trip?: Details the trip options that the student may be interested in for future summers!
- ^^^^Leader Rating of Student: Details the score the leader team provided regarding the student's behavior on the trip/their overall ability to meet trip expectations.
- 2022 > Status: This column is the 'target' as it details whether or not a student enrolled the following summer, which would indicate a positive (1) or negative (0) result.
- 2022 > trip code: This column details the actual trip the student enrolled in for the following season. Could be an interesting exploration to see if they ACTUALLY went on one of the trips that they listed that they may be interested in attending. This is a stretch, but could be used later in goals #2 or #3, so it will remain in the dataset even if it is initially dropped for goal #1.

In [6]:
# Frequency of each 'Years at camp' value.
df['Years at camp'].value_counts()

2    597
1    494
3    445
4    165
5     25
6      6
Name: Years at camp, dtype: int64

In [7]:
# Frequency of each 'Lead source' value.
# TODO: OneHotEncode this out?
df['Lead source'].value_counts()

Friend                                 1134
Family                                  180
Word of Mouth                           150
I'm a Moondance Alumni                  116
I'm a sibling of a Moondance Alumni      49
Web Search                               31
Social Media                             19
Advertisement                            10
Competitor                               10
Catalog                                   9
Home Presentation                         9
School Auction                            3
Website                                   3
Camp Advisory Service                     2
Google Ad                                 2
Summer Opportunities Fair                 1
Teacher                                   1
Name: Lead source, dtype: int64

In [8]:
# Show the rows where 'Lead source' is missing.
# Plan: impute these nulls as 'No Source'.
# `.isna()` already yields a boolean mask, so no `== True` comparison is needed.
df.loc[df['Lead source'].isna()]

Unnamed: 0.1,Unnamed: 0,Camper ID,Age,2021 > Grade,Gender,School,Years at camp,Sibling count,Lead source,Primary family city,...,2021 > Unenrollment date,2021 > Is your child attending with a friend?,2021 ^Student Eval,2021 ^Expectations,2021 ^Trip Length,2021 ^^Interested in Another Trip?,2021 ^^^Future Trip?,2021 ^^^^Leader Rating of Student,2022 > Status,"2022 > trip code (ex. CAL 1, CAL 1A, CAL 1B, etc.) names (all)"
222,223,6946,17,9th,Male,St. Davids School,1,0,,Raleigh,...,,Yes,5,Met expectations,Just Right,Yes,ICE,,Alumni,
485,486,8797,17,9th,Male,Berkeley Preparatory School,2,0,,Tampa,...,,No,5,Met expectations,Just Right,Yes,FIJ,4,Enrolled,CHA 2B
1502,1503,6934,15,8th,Male,Virginia Episcopal School (VES),3,2,,Greensboro,...,,Yes,5,Exceeded expectations,Just Right,Yes,FIJ,Off the Charts (Highest),Alumni,


In [9]:
# Show zip, city, and country for the rows where 'Primary family state' is missing.
# Add Georgia? And leave the student from Singapore as an INTL student
# `.isna()` already yields a boolean mask, so no `== True` comparison is needed.
df.loc[df['Primary family state'].isna(),
       ['Primary family zip', 'Primary family city', 'Primary family country']]

Unnamed: 0,Primary family zip,Primary family city,Primary family country
483,30342,Atlanta,United States
998,238309,Singapore,INTL


In [10]:
# Tally the answers to the catalog question (value_counts excludes nulls by default).
# With the majority of this question being left blank anyhow, it's not helpful to include this feature.
df['Do you have a copy of this year’s catalog?'].value_counts()

No     38
Yes    26
Name: Do you have a copy of this year’s catalog?, dtype: int64

In [11]:
# 'Lead source' for the rows that did answer the catalog question.
# `.notna()` replaces the non-idiomatic `.isnull() == False` comparison.
df.loc[df['Do you have a copy of this year’s catalog?'].notna(), 'Lead source']

15      I'm a Moondance Alumni
17      I'm a Moondance Alumni
22      I'm a Moondance Alumni
66      I'm a Moondance Alumni
71                      Friend
                 ...          
1602    I'm a Moondance Alumni
1609    I'm a Moondance Alumni
1626    I'm a Moondance Alumni
1653    I'm a Moondance Alumni
1654    I'm a Moondance Alumni
Name: Lead source, Length: 64, dtype: object

In [12]:
# Show the rows with no 2021 trip code.
# This appears to be a 'test' student and will need to be dropped.
# `.isna()` already yields a boolean mask, so no `== True` comparison is needed.
df.loc[df['2021 > trip code (ex. CAL 1, CAL 1A, CAL 1B, etc.) names (all)'].isna()]

Unnamed: 0.1,Unnamed: 0,Camper ID,Age,2021 > Grade,Gender,School,Years at camp,Sibling count,Lead source,Primary family city,...,2021 > Unenrollment date,2021 > Is your child attending with a friend?,2021 ^Student Eval,2021 ^Expectations,2021 ^Trip Length,2021 ^^Interested in Another Trip?,2021 ^^^Future Trip?,2021 ^^^^Leader Rating of Student,2022 > Status,"2022 > trip code (ex. CAL 1, CAL 1A, CAL 1B, etc.) names (all)"
1525,1526,9223,2,9th,Male,test,4,0,Catalog,Athens,...,"01-27-2022, 03-02-2022, 05-08-2022",Yes,,,,,,,Enrolled,


In [13]:
# Frequency of each non-null '2021 > Enrolled sibling count' value.
# Plan: impute any nulls as 0, potentially OneHotEncode out the remaining values...
df['2021 > Enrolled sibling count'].value_counts()

1.0    384
2.0     45
Name: 2021 > Enrolled sibling count, dtype: int64

In [14]:
# Show the rows that have a value in '2021 > Unenrollment date'.
# `.notna()` replaces the non-idiomatic `.isnull() == False` comparison.
df.loc[df['2021 > Unenrollment date'].notna()]

Unnamed: 0.1,Unnamed: 0,Camper ID,Age,2021 > Grade,Gender,School,Years at camp,Sibling count,Lead source,Primary family city,...,2021 > Unenrollment date,2021 > Is your child attending with a friend?,2021 ^Student Eval,2021 ^Expectations,2021 ^Trip Length,2021 ^^Interested in Another Trip?,2021 ^^^Future Trip?,2021 ^^^^Leader Rating of Student,2022 > Status,"2022 > trip code (ex. CAL 1, CAL 1A, CAL 1B, etc.) names (all)"
6,7,5550,18,11th,Female,North Atlanta High School,1,0,Friend,Atlanta,...,05-11-2022,No,5,Exceeded expectations,Just Right,Not Sure,THA; CHA; KEZ; MYS,5,Alumni,
14,15,7783,15,8th,Female,Park City High School,3,2,Word of Mouth,Park City,...,11-08-2021,No,5,Met expectations,Just Right,Yes,"SAF, FIJ",5,Enrolled,SAF 2B
19,20,7237,16,9th,Male,Summit High School,1,0,Friend,Bend,...,"05-03-2022, 07-11-2022",No,Off the Charts (Highest),Exceeded expectations,Just Right,Yes,,5,Alumni,
29,30,4553,19,11th,Male,St Andrews Episcopal,3,0,Friend,Austin,...,10-12-2021,No,5,Met expectations,Just Right,Yes,"KIL, THA, MYS",5,Alumni,
31,32,3143,19,12th,Female,Academy of Richmond County,4,4,Friend,Augusta,...,08-31-2021,No,Off the Charts (Highest),Exceeded expectations,Just Right,No,,Off the Charts (Highest),Alumni,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1703,1704,8367,15,8th,Female,Athens Academy,3,1,Word of Mouth,Athens,...,01-20-2022,No,5,Exceeded expectations,Just Right,Yes,SERVICE,Off the Charts (Highest),Enrolled,SPM 1B
1710,1711,7627,16,9th,Male,Charlottesville Day School,2,0,Friend,Charlottesville,...,03-11-2022,Yes,Off the Charts (Highest),Met expectations,Just Right,Not Sure,HWI,5,Alumni,
1719,1720,8697,17,10th,Female,The Westminster Schools,2,0,Friend,Atlanta,...,11-16-2021,No,5,Exceeded expectations,Just Right,Not Sure,NOR,5,Enrolled,CHA 4B
1721,1722,6421,17,9th,Female,RJ Reynolds High School,2,1,Friend,Winston-Salem,...,11-19-2021,Yes,Off the Charts (Highest),Exceeded expectations,Just Right,Yes,"THA, FIJ, CHA",4,Enrolled,KEZ 1B


In [15]:
# Tally the '2021 > Status' values.
df['2021 > Status'].value_counts()

Enrolled    1732
Name: 2021 > Status, dtype: int64

In [16]:
# Tally the Yes/No answers to the attending-with-a-friend question.
df['2021 > Is your child attending with a friend?'].value_counts()

No     1049
Yes     683
Name: 2021 > Is your child attending with a friend?, dtype: int64

In [17]:
# Tally the non-null '2021 ^Expectations' responses.
df['2021 ^Expectations'].value_counts()

Met expectations             917
Exceeded expectations        670
Did not meet expectations     30
Name: 2021 ^Expectations, dtype: int64

In [18]:
# Tally the non-null '2021 ^Trip Length' responses.
df['2021 ^Trip Length'].value_counts()

Just Right    1315
Too Short      224
Too Long        77
Name: 2021 ^Trip Length, dtype: int64

In [19]:
# Tally the non-null '2021 ^^Interested in Another Trip?' responses.
df['2021 ^^Interested in Another Trip?'].value_counts()

Yes         1036
Not Sure     448
No           100
Name: 2021 ^^Interested in Another Trip?, dtype: int64