In [2]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np

In [3]:
# Upload file, convert to dataframe
attendance = pd.read_csv('../CSP_Data/activities_attendance.csv')
attendance

Unnamed: 0,Attendance ID,Activity Name,Activity Instance ID,Activity Label,Person ID,Date,Attended (y/n),Minutes Attended,Lead Staff Name,Individual Activity - Notes,Individual Activity - Referral Service,Individual Activity - Referral Agency,Individual Activity - Referral Date
0,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,9/10/2018,Y,360.0,,,,,
1,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,9/17/2018,Y,360.0,,,,,
2,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,9/24/2018,Y,360.0,,,,,
3,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,10/1/2018,Y,360.0,,,,,
4,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,10/15/2018,Y,360.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5556,,Surfers In Leadership Training (SILT),3621424,SILT SUMMER CAMP SESSION 2: Booker T. Washington,47411223,6/26/2019,N,0.0,,,,,
5557,,Surfers In Leadership Training (SILT),3621424,SILT SUMMER CAMP SESSION 2: Booker T. Washington,47411223,6/27/2019,N,0.0,,,,,
5558,,Surfers In Leadership Training (SILT),3621424,SILT SUMMER CAMP SESSION 2: Booker T. Washington,47433659,6/25/2019,Y,420.0,,,,,
5559,,Surfers In Leadership Training (SILT),3621424,SILT SUMMER CAMP SESSION 2: Booker T. Washington,47433659,6/26/2019,Y,420.0,,,,,


In [4]:
# Check data types
attendance.dtypes

Attendance ID                             float64
Activity Name                              object
Activity Instance ID                        int64
Activity Label                             object
Person ID                                   int64
Date                                       object
Attended (y/n)                             object
Minutes Attended                          float64
Lead Staff Name                            object
Individual Activity - Notes               float64
Individual Activity - Referral Service    float64
Individual Activity - Referral Agency     float64
Individual Activity - Referral Date       float64
dtype: object

In [5]:
# Drop columns we do not want in schema
attendance = attendance.drop(columns = ["Attendance ID", "Activity Label", "Lead Staff Name", "Individual Activity - Notes", "Individual Activity - Referral Service", "Individual Activity - Referral Agency", "Individual Activity - Referral Date"])
attendance

Unnamed: 0,Activity Name,Activity Instance ID,Person ID,Date,Attended (y/n),Minutes Attended
0,Surfing 101 Middle School,3383538,46887178,9/10/2018,Y,360.0
1,Surfing 101 Middle School,3383538,46887178,9/17/2018,Y,360.0
2,Surfing 101 Middle School,3383538,46887178,9/24/2018,Y,360.0
3,Surfing 101 Middle School,3383538,46887178,10/1/2018,Y,360.0
4,Surfing 101 Middle School,3383538,46887178,10/15/2018,Y,360.0
...,...,...,...,...,...,...
5556,Surfers In Leadership Training (SILT),3621424,47411223,6/26/2019,N,0.0
5557,Surfers In Leadership Training (SILT),3621424,47411223,6/27/2019,N,0.0
5558,Surfers In Leadership Training (SILT),3621424,47433659,6/25/2019,Y,420.0
5559,Surfers In Leadership Training (SILT),3621424,47433659,6/26/2019,Y,420.0


In [6]:
# Rename columns
attendance_df = attendance.rename(columns={
                          'Person ID':'Participant ID',
                            'Attended (y/n)':'Attended'
})
attendance_df

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
0,Surfing 101 Middle School,3383538,46887178,9/10/2018,Y,360.0
1,Surfing 101 Middle School,3383538,46887178,9/17/2018,Y,360.0
2,Surfing 101 Middle School,3383538,46887178,9/24/2018,Y,360.0
3,Surfing 101 Middle School,3383538,46887178,10/1/2018,Y,360.0
4,Surfing 101 Middle School,3383538,46887178,10/15/2018,Y,360.0
...,...,...,...,...,...,...
5556,Surfers In Leadership Training (SILT),3621424,47411223,6/26/2019,N,0.0
5557,Surfers In Leadership Training (SILT),3621424,47411223,6/27/2019,N,0.0
5558,Surfers In Leadership Training (SILT),3621424,47433659,6/25/2019,Y,420.0
5559,Surfers In Leadership Training (SILT),3621424,47433659,6/26/2019,Y,420.0


In [7]:
# Value counts for each program
attendance_df["Activity Name"].value_counts()

Surfing 101                              4981
Surfing 101 Middle School                 338
Surfers In Leadership Training (SILT)     242
Name: Activity Name, dtype: int64

# Create df by Program - SILT / Surfing 101 MS / Surfing 101 HS

In [8]:
# Filter by Program -SILT
is_silt = attendance_df['Activity Name'] == 'Surfers In Leadership Training (SILT)'
is_silt.head()

0    False
1    False
2    False
3    False
4    False
Name: Activity Name, dtype: bool

In [9]:
# Df for silt
silt_df = attendance_df[is_silt]
silt_df

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
4266,Surfers In Leadership Training (SILT),3421146,46843398,8/1/2018,N,0.0
4267,Surfers In Leadership Training (SILT),3421146,46843398,8/2/2018,N,0.0
4268,Surfers In Leadership Training (SILT),3421146,46853927,8/1/2018,N,0.0
4269,Surfers In Leadership Training (SILT),3421146,46853927,8/2/2018,N,0.0
4270,Surfers In Leadership Training (SILT),3421146,46854059,8/1/2018,Y,480.0
...,...,...,...,...,...,...
5556,Surfers In Leadership Training (SILT),3621424,47411223,6/26/2019,N,0.0
5557,Surfers In Leadership Training (SILT),3621424,47411223,6/27/2019,N,0.0
5558,Surfers In Leadership Training (SILT),3621424,47433659,6/25/2019,Y,420.0
5559,Surfers In Leadership Training (SILT),3621424,47433659,6/26/2019,Y,420.0


In [10]:
## Filter by Program - Middle School 101
is_surfing_101_MS = attendance_df['Activity Name'] == 'Surfing 101 Middle School '
is_surfing_101_MS

0        True
1        True
2        True
3        True
4        True
        ...  
5556    False
5557    False
5558    False
5559    False
5560    False
Name: Activity Name, Length: 5561, dtype: bool

In [11]:
# Df for MS Surfing 101
surf_101_MS_df = attendance_df[is_surfing_101_MS]
surf_101_MS_df

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
0,Surfing 101 Middle School,3383538,46887178,9/10/2018,Y,360.0
1,Surfing 101 Middle School,3383538,46887178,9/17/2018,Y,360.0
2,Surfing 101 Middle School,3383538,46887178,9/24/2018,Y,360.0
3,Surfing 101 Middle School,3383538,46887178,10/1/2018,Y,360.0
4,Surfing 101 Middle School,3383538,46887178,10/15/2018,Y,360.0
...,...,...,...,...,...,...
5149,Surfing 101 Middle School,3508595,46908729,4/29/2019,N,0.0
5150,Surfing 101 Middle School,3508595,46908729,5/6/2019,Y,225.0
5151,Surfing 101 Middle School,3508595,46908729,5/13/2019,Y,225.0
5152,Surfing 101 Middle School,3508595,46908729,5/20/2019,Y,225.0


In [12]:
# Filter by HS Surfing 101
surf_101_HS = attendance_df['Activity Name'] == 'Surfing 101'
surf_101_HS

0       False
1       False
2       False
3       False
4       False
        ...  
5556    False
5557    False
5558    False
5559    False
5560    False
Name: Activity Name, Length: 5561, dtype: bool

In [13]:
# Df for HS surfing 101
surf_101_HS_df = attendance_df[surf_101_HS]
surf_101_HS_df

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
70,Surfing 101,3384580,46843252,8/20/2018,Y,180.0
71,Surfing 101,3384580,46843252,8/22/2018,Y,180.0
72,Surfing 101,3384580,46843252,8/24/2018,Y,300.0
73,Surfing 101,3384580,46843252,8/27/2018,Y,180.0
74,Surfing 101,3384580,46843252,8/29/2018,Y,180.0
...,...,...,...,...,...,...
5396,Surfing 101,3620940,46876883,4/22/2019,Y,360.0
5397,Surfing 101,3620979,46858126,5/20/2019,Y,360.0
5398,Surfing 101,3620979,46875344,5/20/2019,Y,360.0
5399,Surfing 101,3620979,46878126,5/20/2019,Y,360.0


In [14]:
# Amount of classes attended per students
value_counts_HS = surf_101_HS_df["Participant ID"].value_counts()
value_counts_HS

46848962    106
46875282    104
46876799    103
46869531    102
46863178    102
           ... 
46858930      1
46885507      1
46856557      1
46877315      1
46870550      1
Name: Participant ID, Length: 344, dtype: int64

In [15]:
# Who attended one class
one_time_HS = value_counts_HS == 1
one_time_HS

46848962    False
46875282    False
46876799    False
46869531    False
46863178    False
            ...  
46858930     True
46885507     True
46856557     True
46877315     True
46870550     True
Name: Participant ID, Length: 344, dtype: bool

In [16]:
# Isolate students who only attended one class
one_timers_HS = value_counts_HS[one_time_HS]
one_timers_HS

46862359    1
46858392    1
47027338    1
46862041    1
46889250    1
           ..
46858930    1
46885507    1
46856557    1
46877315    1
46870550    1
Name: Participant ID, Length: 161, dtype: int64

In [20]:
# Df of Students who only attended one class of Surfing 101 HS
one_timers_HS_df = pd.DataFrame(value_counts_HS[one_time_HS], index=None)
one_timers_HS_df.reset_index(inplace=True) 
one_timers_HS_df = one_timers_HS_df.rename(columns={
                    'index':'Participant_ID',
                    'Participant ID':'Classes_Attended'
})
one_timers_HS_df

Unnamed: 0,Participant_ID,Classes_Attended
0,46862359,1
1,46858392,1
2,47027338,1
3,46862041,1
4,46889250,1
...,...,...
156,46858930,1
157,46885507,1
158,46856557,1
159,46877315,1


In [21]:
one_timers_HS_df.to_csv(r'../CSP_Data/one_timers_HS.csv',index=False)

In [55]:
# Value counts for Surfing 101 MS 
surf_101_MS_df['Participant ID'].value_counts().head()

46897549    33
46904597    18
46901108    18
46895808    18
46907807    18
Name: Participant ID, dtype: int64

In [54]:
# Values Counts for SILT
silt_df['Participant ID'].value_counts().head()

46876991    12
46864770    12
46878556    10
46848962    10
46864094     9
Name: Participant ID, dtype: int64