In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
# Upload file, convert to dataframe
attendance_df = pd.read_csv('../CSP_Data/activities_attendance.csv')
attendance_df

Unnamed: 0,Attendance ID,Activity Name,Activity Instance ID,Activity Label,Person ID,Date,Attended (y/n),Minutes Attended,Lead Staff Name,Individual Activity - Notes,Individual Activity - Referral Service,Individual Activity - Referral Agency,Individual Activity - Referral Date
0,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,9/10/2018,Y,360.0,,,,,
1,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,9/17/2018,Y,360.0,,,,,
2,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,9/24/2018,Y,360.0,,,,,
3,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,10/1/2018,Y,360.0,,,,,
4,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,10/15/2018,Y,360.0,,,,,
5,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,10/22/2018,Y,360.0,,,,,
6,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46887178,10/29/2018,Y,240.0,,,,,
7,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46891897,9/10/2018,Y,360.0,,,,,
8,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46891897,9/17/2018,Y,360.0,,,,,
9,,Surfing 101 Middle School,3383538,Aptos Surfing 101 Course,46891897,9/24/2018,Y,360.0,,,,,


In [3]:
# Check data types
attendance_df.dtypes

Attendance ID                             float64
Activity Name                              object
Activity Instance ID                        int64
Activity Label                             object
Person ID                                   int64
Date                                       object
Attended (y/n)                             object
Minutes Attended                          float64
Lead Staff Name                            object
Individual Activity - Notes               float64
Individual Activity - Referral Service    float64
Individual Activity - Referral Agency     float64
Individual Activity - Referral Date       float64
dtype: object

In [4]:
# Drop columns we do not want in schema
attendance_df = attendance_df.drop(columns = ["Attendance ID", "Activity Label", "Lead Staff Name", "Individual Activity - Notes", "Individual Activity - Referral Service", "Individual Activity - Referral Agency", "Individual Activity - Referral Date"])
attendance_df.head()

Unnamed: 0,Activity Name,Activity Instance ID,Person ID,Date,Attended (y/n),Minutes Attended
0,Surfing 101 Middle School,3383538,46887178,9/10/2018,Y,360.0
1,Surfing 101 Middle School,3383538,46887178,9/17/2018,Y,360.0
2,Surfing 101 Middle School,3383538,46887178,9/24/2018,Y,360.0
3,Surfing 101 Middle School,3383538,46887178,10/1/2018,Y,360.0
4,Surfing 101 Middle School,3383538,46887178,10/15/2018,Y,360.0


In [5]:
# Rename columns
attendance_df = attendance_df.rename(columns={
                          'Person ID':'Participant ID',
                            'Attended (y/n)':'Attended'
})
attendance_df

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
0,Surfing 101 Middle School,3383538,46887178,9/10/2018,Y,360.0
1,Surfing 101 Middle School,3383538,46887178,9/17/2018,Y,360.0
2,Surfing 101 Middle School,3383538,46887178,9/24/2018,Y,360.0
3,Surfing 101 Middle School,3383538,46887178,10/1/2018,Y,360.0
4,Surfing 101 Middle School,3383538,46887178,10/15/2018,Y,360.0
5,Surfing 101 Middle School,3383538,46887178,10/22/2018,Y,360.0
6,Surfing 101 Middle School,3383538,46887178,10/29/2018,Y,240.0
7,Surfing 101 Middle School,3383538,46891897,9/10/2018,Y,360.0
8,Surfing 101 Middle School,3383538,46891897,9/17/2018,Y,360.0
9,Surfing 101 Middle School,3383538,46891897,9/24/2018,Y,360.0


In [6]:
# Value counts for each program
attendance_df["Activity Name"].value_counts()

Surfing 101                              4981
Surfing 101 Middle School                 338
Surfers In Leadership Training (SILT)     242
Name: Activity Name, dtype: int64

In [7]:
attendance_df["Activity Instance ID"].value_counts()

3384580    3185
3417045     499
3417059     486
3474515     237
3508595     108
3508583      88
3517784      85
3421146      82
3476979      72
3383538      70
3621424      60
3621419      60
3472807      55
3621418      30
3440566      29
3473499      24
3476373      23
3527022      20
3474149      18
3528219      17
3429906      17
3476388      17
3440661      16
3476343      15
3475642      14
3499217      14
3420417      14
3620598      14
3535966      13
3620940      12
3420508      12
3474121      12
3538029      12
3528375      11
3526992      10
3621415      10
3535885      10
3536052       9
3499320       9
3454339       8
3488181       8
3528332       8
3499380       8
3538008       7
3620656       6
3480978       5
3620600       5
3499367       5
3527068       4
3620979       4
3476438       4
Name: Activity Instance ID, dtype: int64

In [8]:
# Find any repeat participants
repeat_participant = attendance_df.groupby(["Participant ID"])
repeat_participant.agg({"Activity Instance ID": "nunique"})

Unnamed: 0_level_0,Activity Instance ID
Participant ID,Unnamed: 1_level_1
46842888,3
46843252,1
46843305,1
46843348,1
46843349,2
46843398,1
46843808,2
46843825,2
46843828,1
46843859,2


In [9]:
# Found that there are 427 more students included on this sheet than the student profile sheet 
# Will need to drop attendees who are not included in the students demographic profile sheet
# when perfomring merge between the 2 tables

# This exploratory pull of a weird student shows that it is possible to attend 16 different activities
# this can be explained if a student only attends one off field trips or attends field trips in addition to 
# surfing 101 courses

weird_student = attendance_df[attendance_df["Participant ID"] == 46847012]
weird_student.head()


Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
270,Surfing 101,3384580,46847012,2/8/2019,Y,300.0
271,Surfing 101,3384580,46847012,2/11/2019,Y,180.0
272,Surfing 101,3384580,46847012,2/13/2019,Y,180.0
273,Surfing 101,3384580,46847012,2/15/2019,Y,300.0
274,Surfing 101,3384580,46847012,2/20/2019,Y,180.0


# Create df by Program - SILT / Surfing 101 MS / Surfing 101 HS

In [10]:
# Filter by Program -SILT
is_silt = attendance_df['Activity Name'] == 'Surfers In Leadership Training (SILT)'
is_silt.head()

0    False
1    False
2    False
3    False
4    False
Name: Activity Name, dtype: bool

In [11]:
# Df for silt
silt_df = attendance_df[is_silt]
silt_df.head()

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
4266,Surfers In Leadership Training (SILT),3421146,46843398,8/1/2018,N,0.0
4267,Surfers In Leadership Training (SILT),3421146,46843398,8/2/2018,N,0.0
4268,Surfers In Leadership Training (SILT),3421146,46853927,8/1/2018,N,0.0
4269,Surfers In Leadership Training (SILT),3421146,46853927,8/2/2018,N,0.0
4270,Surfers In Leadership Training (SILT),3421146,46854059,8/1/2018,Y,480.0


In [12]:
## Filter by Program - Middle School 101
is_surfing_101_MS = attendance_df['Activity Name'] == 'Surfing 101 Middle School '
is_surfing_101_MS.head()

0    True
1    True
2    True
3    True
4    True
Name: Activity Name, dtype: bool

In [13]:
# Df for MS Surfing 101
surf_101_MS_df = attendance_df[is_surfing_101_MS]
surf_101_MS_df.head()

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
0,Surfing 101 Middle School,3383538,46887178,9/10/2018,Y,360.0
1,Surfing 101 Middle School,3383538,46887178,9/17/2018,Y,360.0
2,Surfing 101 Middle School,3383538,46887178,9/24/2018,Y,360.0
3,Surfing 101 Middle School,3383538,46887178,10/1/2018,Y,360.0
4,Surfing 101 Middle School,3383538,46887178,10/15/2018,Y,360.0


In [14]:
# Filter by HS Surfing 101
surf_101_HS = attendance_df['Activity Name'] == 'Surfing 101'
surf_101_HS.head()

0    False
1    False
2    False
3    False
4    False
Name: Activity Name, dtype: bool

In [15]:
# Df for HS surfing 101
surf_101_HS_df = attendance_df[surf_101_HS]
surf_101_HS_df.head()

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
70,Surfing 101,3384580,46843252,8/20/2018,Y,180.0
71,Surfing 101,3384580,46843252,8/22/2018,Y,180.0
72,Surfing 101,3384580,46843252,8/24/2018,Y,300.0
73,Surfing 101,3384580,46843252,8/27/2018,Y,180.0
74,Surfing 101,3384580,46843252,8/29/2018,Y,180.0


In [16]:
# Amount of classes attended per students
value_counts_HS = surf_101_HS_df["Participant ID"].value_counts()
value_counts_HS.head()

46848962    106
46875282    104
46876799    103
46869531    102
46863178    102
Name: Participant ID, dtype: int64

In [17]:
# Who attended one class
one_time_HS = value_counts_HS == 1
one_time_HS.head()

46848962    False
46875282    False
46876799    False
46869531    False
46863178    False
Name: Participant ID, dtype: bool

In [18]:
# Isolate students who only attended one class
one_timers_HS = value_counts_HS[one_time_HS]
one_timers_HS.head()

46862359    1
46858392    1
47027338    1
46862041    1
46889250    1
Name: Participant ID, dtype: int64

In [19]:
# Df of Students who only attended one class of Surfing 101 HS
one_timers_HS_df = pd.DataFrame(value_counts_HS[one_time_HS], index=None)
one_timers_HS_df.reset_index(inplace=True) 
one_timers_HS_df = one_timers_HS_df.rename(columns={
                    'index':'Participant ID',
                    'Participant ID':'Classes Attended'
})
one_timers_HS_df.head()

Unnamed: 0,Participant ID,Classes Attended
0,46862359,1
1,46858392,1
2,47027338,1
3,46862041,1
4,46889250,1


In [20]:
# Value counts for Surfing 101 MS 
surf_101_MS_df['Participant ID'].value_counts().head()

46897549    33
46904597    18
46901108    18
46895808    18
46907807    18
Name: Participant ID, dtype: int64

In [21]:
# Values Counts for SILT
silt_df['Participant ID'].value_counts().head()

46876991    12
46864770    12
46878556    10
46848962    10
46864094     9
Name: Participant ID, dtype: int64

#  New DFs based on Activity Instance ID


In [22]:
#Check Dtypes and convert to Datetime
surf_101_MS_df.dtypes


surf_101_MS_df["Date"] = pd.to_datetime(surf_101_MS_df["Date"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [23]:
surf_101_MS_df["Activity Instance ID"].value_counts()

3508595    108
3508583     88
3476979     72
3383538     70
Name: Activity Instance ID, dtype: int64

In [24]:
# Sort by Activity Instance ID, 
sem_1 = surf_101_MS_df[surf_101_MS_df["Activity Instance ID"]== 3508595]
sem_1.head()

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
5046,Surfing 101 Middle School,3508595,46891989,2019-04-01,N,0.0
5047,Surfing 101 Middle School,3508595,46891989,2019-04-08,N,0.0
5048,Surfing 101 Middle School,3508595,46891989,2019-04-15,N,0.0
5049,Surfing 101 Middle School,3508595,46891989,2019-04-22,Y,225.0
5050,Surfing 101 Middle School,3508595,46891989,2019-04-29,Y,225.0


In [25]:
# Create Data Frames that can be filtered for Attendance percentage 
sem_1_df = pd.DataFrame(sem_1)
sem_1_df.head()

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
5046,Surfing 101 Middle School,3508595,46891989,2019-04-01,N,0.0
5047,Surfing 101 Middle School,3508595,46891989,2019-04-08,N,0.0
5048,Surfing 101 Middle School,3508595,46891989,2019-04-15,N,0.0
5049,Surfing 101 Middle School,3508595,46891989,2019-04-22,Y,225.0
5050,Surfing 101 Middle School,3508595,46891989,2019-04-29,Y,225.0


In [26]:
sem_2 = surf_101_MS_df[surf_101_MS_df["Activity Instance ID"]== 3508583 ]
sem_2["Date"].value_counts()

2019-03-18    11
2019-03-11    11
2019-03-04    11
2019-02-25    11
2019-02-18    11
2019-02-11    11
2019-02-04    11
2019-01-28    11
Name: Date, dtype: int64

In [27]:
sem_3 = surf_101_MS_df[surf_101_MS_df["Activity Instance ID"]== 3476979]
sem_3["Date"].value_counts()

2019-01-14    8
2019-01-07    8
2018-12-17    8
2018-12-10    8
2018-12-03    8
2018-11-26    8
2018-11-19    8
2018-11-12    8
2018-11-05    8
Name: Date, dtype: int64

In [28]:
sem_4 = surf_101_MS_df[surf_101_MS_df["Activity Instance ID"]== 3383538]
sem_4["Date"].value_counts()


2018-10-01    10
2018-09-24    10
2018-09-17    10
2018-09-10    10
2018-10-29    10
2018-10-22    10
2018-10-15    10
Name: Date, dtype: int64

In [29]:
# Create New group the MS Surfing 101 Classes by Activity Instance ID
# Going to Create New DF's so I can sort by Activity Instance ID and see & Attended

# Need to group students by Activity Instance ID 
# Then pull the attended Y or N count and make a percentage


## Separating Code


In [30]:
#Sort Participant ID by Total Attended Y or N so we can count the total number of days attended 
# Machine Learning Model

yes = surf_101_MS_df[surf_101_MS_df["Attended"] == "Y"]
no = surf_101_MS_df[surf_101_MS_df["Attended"] == "N"]
yes.head()

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
0,Surfing 101 Middle School,3383538,46887178,2018-09-10,Y,360.0
1,Surfing 101 Middle School,3383538,46887178,2018-09-17,Y,360.0
2,Surfing 101 Middle School,3383538,46887178,2018-09-24,Y,360.0
3,Surfing 101 Middle School,3383538,46887178,2018-10-01,Y,360.0
4,Surfing 101 Middle School,3383538,46887178,2018-10-15,Y,360.0


In [31]:
# YES Counts
yes_attended = yes.groupby(["Participant ID"]).count()["Attended"]
no_attended = no.groupby(["Participant ID"]).count()["Attended"].round(0)

yes_attended.head()

Participant ID
46887178    13
46891897     7
46891989    11
46892101     5
46893742     5
Name: Attended, dtype: int64

In [32]:
no_attended.head()

Participant ID
46887178    2
46891989    5
46892101    4
46893742    3
46895808    3
Name: Attended, dtype: int64

In [33]:
#Validating Total Y Count  -- Checks out Participant 46887178 has 13 y and 2 N
surf_101_MS_df[surf_101_MS_df["Participant ID"] == 46887178]

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID,Date,Attended,Minutes Attended
0,Surfing 101 Middle School,3383538,46887178,2018-09-10,Y,360.0
1,Surfing 101 Middle School,3383538,46887178,2018-09-17,Y,360.0
2,Surfing 101 Middle School,3383538,46887178,2018-09-24,Y,360.0
3,Surfing 101 Middle School,3383538,46887178,2018-10-01,Y,360.0
4,Surfing 101 Middle School,3383538,46887178,2018-10-15,Y,360.0
5,Surfing 101 Middle School,3383538,46887178,2018-10-22,Y,360.0
6,Surfing 101 Middle School,3383538,46887178,2018-10-29,Y,240.0
4958,Surfing 101 Middle School,3508583,46887178,2019-01-28,N,0.0
4959,Surfing 101 Middle School,3508583,46887178,2019-02-04,N,0.0
4960,Surfing 101 Middle School,3508583,46887178,2019-02-11,Y,240.0


In [34]:
act_id_df = surf_101_MS_df.drop(columns = ["Date", "Attended", "Minutes Attended"])
act_id_df

Unnamed: 0,Activity Name,Activity Instance ID,Participant ID
0,Surfing 101 Middle School,3383538,46887178
1,Surfing 101 Middle School,3383538,46887178
2,Surfing 101 Middle School,3383538,46887178
3,Surfing 101 Middle School,3383538,46887178
4,Surfing 101 Middle School,3383538,46887178
5,Surfing 101 Middle School,3383538,46887178
6,Surfing 101 Middle School,3383538,46887178
7,Surfing 101 Middle School,3383538,46891897
8,Surfing 101 Middle School,3383538,46891897
9,Surfing 101 Middle School,3383538,46891897


In [35]:
# Make New DF to create new Surfing 101 Total Days Attended DF 
surf_101_MS_total_days_attended_df = pd.DataFrame({
    "Activity Name": "Surfing 101 Middle School",
    "Attended_Y": yes_attended,
    "Attended_N": no_attended
})
surf_101_MS_total_days_attended_df

Unnamed: 0_level_0,Activity Name,Attended_Y,Attended_N
Participant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
46887178,Surfing 101 Middle School,13,2.0
46891897,Surfing 101 Middle School,7,
46891989,Surfing 101 Middle School,11,5.0
46892101,Surfing 101 Middle School,5,4.0
46893742,Surfing 101 Middle School,5,3.0
46894320,Surfing 101 Middle School,16,
46895808,Surfing 101 Middle School,15,3.0
46896561,Surfing 101 Middle School,7,1.0
46897422,Surfing 101 Middle School,9,
46897487,Surfing 101 Middle School,7,


In [36]:
# Replace NaN values with 0 since they are students who did not miss an attendance
surf_101_MS_total_days_attended_df['Attended_N'] = surf_101_MS_total_days_attended_df['Attended_N'].fillna(0)
surf_101_MS_total_days_attended_df.head()

Unnamed: 0_level_0,Activity Name,Attended_Y,Attended_N
Participant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
46887178,Surfing 101 Middle School,13,2.0
46891897,Surfing 101 Middle School,7,0.0
46891989,Surfing 101 Middle School,11,5.0
46892101,Surfing 101 Middle School,5,4.0
46893742,Surfing 101 Middle School,5,3.0


In [37]:
# Check and Change DTYPE to INT
surf_101_MS_total_days_attended_df.dtypes
surf_101_MS_total_days_attended_df["Attended_N"].astype(int)

Participant ID
46887178    2
46891897    0
46891989    5
46892101    4
46893742    3
46894320    0
46895808    3
46896561    1
46897422    0
46897487    0
46897549    2
46897577    2
46898292    3
46898738    1
46898818    0
46898869    0
46899499    2
46899585    5
46900269    4
46900329    1
46901108    2
46901198    1
46904597    1
46906175    1
46906440    2
46907807    5
46908206    3
46908729    6
Name: Attended_N, dtype: int64

In [None]:
yes = df1[df1["Attended"] == "Y"]
no = df1[df1["Attended"] == "N"]


# YES Counts
yes_attended = yes.groupby(["Participant ID"]).count()["Attended"]
no_attended = no.groupby(["Participant ID"]).count()["Attended"].round(0)

# Merge together to create new Surfing 101 Total Days Attended DF 
df1_clean = pd.DataFrame({
    "Activity Name": "Surfing 101 Middle School",
    "Activity Instance ID": '3383538',
    "Attended_Y": yes_attended,
    "Attended_N": no_attended,
    
})


# Replace NaN values with 0 since they are students who did not miss an attendance
df1_clean['Attended_N'] = df1_clean['Attended_N'].fillna(0)

df1_clean["Percent Attend"] = ((df1_clean["Attended_Y"])/(df1_clean["Attended_Y"]+ df1_clean["Attended_N"]))


In [53]:
# Create a for loop to loop through data and 

groupby_att = attendance_df.groupby("Activity Instance ID")
big_list = []

for activity, df in groupby_att:
    participant_ids = set(df["Participant ID"].tolist())
    for participant in participant_ids:
        record = {"Activity": activity, 
                "Participant ID": participant,
                 "Attendance": 1}
        big_list.append(record)
    
    
#     number_of_days = len(set(df["Date"].tolist()))
#     attended = df.groupby("Participant ID")["Attended"].value_counts()
#     print(attended)
#     print(number_of_days)
#     print(activity)
    #df["Percent Attend"] = ((df["Attended_Y"])/(df["Attended_Y"]+ df["Attended_N"]))
    #df.drop_duplicates(subset = ["Participant ID"], inplace= True)
    #print(df)

In [58]:
new_attendance_df = pd.DataFrame(big_list)
new_attendance_df.groupby("Participant ID")["Attendance"].sum()
# Need to create a new dataframe with the sum included 

Unnamed: 0,Activity,Attendance,Participant ID,Total Programs Attended
0,3383538,1,46906440,
1,3383538,1,46897577,
2,3383538,1,46887178,
3,3383538,1,46897549,
4,3383538,1,46897487,
5,3383538,1,46894320,
6,3383538,1,46898292,
7,3383538,1,46891989,
8,3383538,1,46898869,
9,3383538,1,46891897,


In [60]:
# groupby_att = attendance_df.groupby("Activity Instance ID")
# big_list = []

# for activity, df in groupby_att:


#     number_of_days = len(set(df["Date"].tolist()))
#     attended = df.groupby("Participant ID")["Attended"].value_counts()
#     print(attended)
#     print(number_of_days)
#     print(activity)
#     #df["Percent Attend"] = ((df["Attended_Y"])/(df["Attended_Y"]+ df["Attended_N"]))
#     #df.drop_duplicates(subset = ["Participant ID"], inplace= True)
#     #print(df)

Participant ID  Attended
46887178        Y           7
46891897        Y           7
46891989        Y           7
46894320        Y           7
46897487        Y           7
46897549        Y           6
                N           1
46897577        Y           5
                N           2
46898292        Y           7
46898869        Y           7
46906440        Y           7
Name: Attended, dtype: int64
7
3383538
Participant ID  Attended
46843252        Y           80
                N           20
46846095        N           64
                Y           36
46847012        Y           40
                N            1
46848962        Y           69
                N           31
46854847        Y           53
                N           47
46855204        Y           76
                N           24
46857181        Y           77
                N           23
46857186        Y           67
                N           33
46857740        Y           67
                N       

## Separate Activity Instance ID into different groups 


In [38]:
act_id = attendance_df["Activity Instance ID"].tolist()
act_id

[3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3383538,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,
 3384580,


In [39]:
#https://www.w3schools.com/python/python_howto_remove_duplicates.asp
# Create a list of the possible unique values in Acitivity Instance ID
# Hoping to filter the DF using this list 

act_id = list(dict.fromkeys(poop))
len(act_id)


51

In [40]:
# Validate list length 
attendance_df["Activity Instance ID"].nunique()

51

In [41]:
# Split up Data according to Activity Instance ID into new DataFrames 
# Need to do this to be able to convert y and N to digits to get percentage of potential classes attended
# All classes are different amounts of days so percentage needs to be metric used rather than total classes attended

df6 = attendance_df[attendance_df["Activity Instance ID"]== 3420508]
df7 = attendance_df[attendance_df["Activity Instance ID"]== 3421146]
df8 = attendance_df[attendance_df["Activity Instance ID"]== 3429906]
df9 = attendance_df[attendance_df["Activity Instance ID"]== 3440566]
df10 = attendance_df[attendance_df["Activity Instance ID"]== 3440661]
df11 = attendance_df[attendance_df["Activity Instance ID"]== 3454339]
df12 = attendance_df[attendance_df["Activity Instance ID"]== 3472807]
df13= attendance_df[attendance_df["Activity Instance ID"]== 3473499]
df14 = attendance_df[attendance_df["Activity Instance ID"]== 3474121]
df15 = attendance_df[attendance_df["Activity Instance ID"]== 3474149]
df16=attendance_df[attendance_df["Activity Instance ID"]== 3474515]
df17=attendance_df[attendance_df["Activity Instance ID"]== 3475642]
df18=attendance_df[attendance_df["Activity Instance ID"]== 3476343]
df19=attendance_df[attendance_df["Activity Instance ID"]== 3476373]
df20=attendance_df[attendance_df["Activity Instance ID"]== 3476388]
df21=attendance_df[attendance_df["Activity Instance ID"]== 3476438]
df22=attendance_df[attendance_df["Activity Instance ID"]== 3476979]
df23=attendance_df[attendance_df["Activity Instance ID"]== 3480978]
df24=attendance_df[attendance_df["Activity Instance ID"]== 3488181]
df25=attendance_df[attendance_df["Activity Instance ID"]== 3499217]
df26=attendance_df[attendance_df["Activity Instance ID"]== 3499320]
df27=attendance_df[attendance_df["Activity Instance ID"]== 3499367]
df28=attendance_df[attendance_df["Activity Instance ID"]== 3499380]
df29=attendance_df[attendance_df["Activity Instance ID"]== 3508583]
df30=attendance_df[attendance_df["Activity Instance ID"]== 3508595]
df31=attendance_df[attendance_df["Activity Instance ID"]== 3517784]
df32=attendance_df[attendance_df["Activity Instance ID"]== 3526992]
df33=attendance_df[attendance_df["Activity Instance ID"]== 3527022]
df34=attendance_df[attendance_df["Activity Instance ID"]== 3527068]
df35=attendance_df[attendance_df["Activity Instance ID"]== 3528219]
df36=attendance_df[attendance_df["Activity Instance ID"]== 3528332]
df37=attendance_df[attendance_df["Activity Instance ID"]== 3528375]
df38=attendance_df[attendance_df["Activity Instance ID"]== 3535885]
df39=attendance_df[attendance_df["Activity Instance ID"]== 3535966]
df40=attendance_df[attendance_df["Activity Instance ID"]== 3536052]
df41=attendance_df[attendance_df["Activity Instance ID"]== 3538008]
df42=attendance_df[attendance_df["Activity Instance ID"]== 3538029]
df43=attendance_df[attendance_df["Activity Instance ID"]== 3620598]
df44=attendance_df[attendance_df["Activity Instance ID"]== 3620600]
df45=attendance_df[attendance_df["Activity Instance ID"]== 3620656]
df46=attendance_df[attendance_df["Activity Instance ID"]== 3620940]
df47=attendance_df[attendance_df["Activity Instance ID"]== 3620979]
df48=attendance_df[attendance_df["Activity Instance ID"]== 3621415]
df49=attendance_df[attendance_df["Activity Instance ID"]== 3621418]
df50=attendance_df[attendance_df["Activity Instance ID"]== 3621419]
df51=attendance_df[attendance_df["Activity Instance ID"]== 3621424]


In [43]:
#Sort Participant ID by Total Attended Y or N so we can count the total number of days attended 
# Machine Learning Model
#
df1 = attendance_df[attendance_df["Activity Instance ID"]== 3383538]
yes = df1[df1["Attended"] == "Y"]
no = df1[df1["Attended"] == "N"]


# YES Counts
yes_attended = yes.groupby(["Participant ID"]).count()["Attended"]
no_attended = no.groupby(["Participant ID"]).count()["Attended"].round(0)

# Merge together to create new Surfing 101 Total Days Attended DF 
df1_clean = pd.DataFrame({
    "Activity Name": "Surfing 101 Middle School",
    "Activity Instance ID": '3383538',
    "Attended_Y": yes_attended,
    "Attended_N": no_attended,
    
})


# Replace NaN values with 0 since they are students who did not miss an attendance
df1_clean['Attended_N'] = df1_clean['Attended_N'].fillna(0)

df1_clean["Percent Attend"] = ((df1_clean["Attended_Y"])/(df1_clean["Attended_Y"]+ df1_clean["Attended_N"]))

df1_clean

Unnamed: 0_level_0,Activity Name,Activity Instance ID,Attended_Y,Attended_N,Percent Attend
Participant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
46887178,Surfing 101 Middle School,3383538,7,0.0,1.0
46891897,Surfing 101 Middle School,3383538,7,0.0,1.0
46891989,Surfing 101 Middle School,3383538,7,0.0,1.0
46894320,Surfing 101 Middle School,3383538,7,0.0,1.0
46897487,Surfing 101 Middle School,3383538,7,0.0,1.0
46897549,Surfing 101 Middle School,3383538,6,1.0,0.857143
46897577,Surfing 101 Middle School,3383538,5,2.0,0.714286
46898292,Surfing 101 Middle School,3383538,7,0.0,1.0
46898869,Surfing 101 Middle School,3383538,7,0.0,1.0
46906440,Surfing 101 Middle School,3383538,7,0.0,1.0


In [None]:
#Sort Participant ID by Total Attended Y or N so we can count the total number of days attended 
# Machine Learning Model
df2 = attendance_df[attendance_df["Activity Instance ID"]== 3384580]
yes = df2[df2["Attended"] == "Y"]
no = df2[df2["Attended"] == "N"]


# YES Counts
yes_attended = yes.groupby(["Participant ID"]).count()["Attended"]
no_attended = no.groupby(["Participant ID"]).count()["Attended"].round(0)

# Merge together to create new Surfing 101 Total Days Attended DF 
df2_clean = pd.DataFrame({
    "Activity Name": "Surfing 101 Middle School",
    "Activity Instance ID": '3384580',
    "Attended_Y": yes_attended,
    "Attended_N": no_attended,
    
})


# Replace NaN values with 0 since they are students who did not miss an attendance
df2_clean['Attended_N'] = df2_clean['Attended_N'].fillna(0)

# 
df2_clean["Percent Attend"] = ((df2_clean["Attended_Y"])/(df2_clean["Attended_Y"]+ df2_clean["Attended_N"]))



In [None]:

df3 = attendance_df[attendance_df["Activity Instance ID"]== 3417045]

#Sort Participant ID by Total Attended Y or N so we can count the total number of days attended 
# Machine Learning Model

yes = df3[df3["Attended"] == "Y"]
no = df3[df3["Attended"] == "N"]


# YES Counts
yes_attended = yes.groupby(["Participant ID"]).count()["Attended"]
no_attended = no.groupby(["Participant ID"]).count()["Attended"].round(0)

# Merge together to create new Surfing 101 Total Days Attended DF 
df3_clean = pd.DataFrame({
    "Activity Name": "Surfing 101 Middle School",
    "Activity Instance ID": '3417045',
    "Attended_Y": yes_attended,
    "Attended_N": no_attended,
    
})


# Replace NaN values with 0 since they are students who did not miss an attendance
df3_clean['Attended_N'] = df3_clean['Attended_N'].fillna(0)

# 
df3_clean["Percent Attend"] = ((df3_clean["Attended_Y"])/(df3_clean["Attended_Y"]+ df3_clean["Attended_N"]))





In [None]:
##4
#Sort Participant ID by Total Attended Y or N so we can count the total number of days attended 
# Machine Learning Model

df4 = attendance_df[attendance_df["Activity Instance ID"]== 3417059]
yes = df4[df4["Attended"] == "Y"]
no = df4[df4["Attended"] == "N"]


# YES Counts
yes_attended = yes.groupby(["Participant ID"]).count()["Attended"]
no_attended = no.groupby(["Participant ID"]).count()["Attended"].round(0)

# Merge together to create new Surfing 101 Total Days Attended DF 
df4_clean = pd.DataFrame({
    "Activity Name": "Surfing 101 Middle School",
    "Activity Instance ID": '3417059',
    "Attended_Y": yes_attended,
    "Attended_N": no_attended,
    
})


# Replace NaN values with 0 since they are students who did not miss an attendance
df4_clean['Attended_N'] = df4_clean['Attended_N'].fillna(0)

# 
df4_clean["Percent Attend"] = ((df4_clean["Attended_Y"])/(df4_clean["Attended_Y"]+ df4_clean["Attended_N"]))


In [None]:

##5
df5 = attendance_df[attendance_df["Activity Instance ID"]== 3420417]
#Sort Participant ID by Total Attended Y or N so we can count the total number of days attended 
# Machine Learning Model

yes = df5[df5["Attended"] == "Y"]
no = df5[df5["Attended"] == "N"]


# YES Counts
yes_attended = yes.groupby(["Participant ID"]).count()["Attended"]
no_attended = no.groupby(["Participant ID"]).count()["Attended"].round(0)

# Merge together to create new Surfing 101 Total Days Attended DF 
df5_clean = pd.DataFrame({
    "Activity Name": "",
    "Activity Instance ID": '3420417',
    "Attended_Y": yes_attended,
    "Attended_N": no_attended,
    
})


# Replace NaN values with 0 since they are students who did not miss an attendance
df5_clean['Attended_N'] = df5_clean['Attended_N'].fillna(0)

# 
df5_clean["Percent Attend"] = ((df5_clean["Attended_Y"])/(df5_clean["Attended_Y"]+ df5_clean["Attended_N"]))

