In [112]:
import polars as pl

In [113]:
# Load the CSV file(s) matching the 'data/Term*.csv' glob and add a
# 'Full Name' column made of first and last name separated by one space.
attendance_df = pl.read_csv('data/Term*.csv').with_columns(
    pl.concat_str(
        [pl.col('First Name'), pl.col('Last Name')],
        separator=' ',
    ).alias('Full Name')
)
attendance_df.head()

First Name,Last Name,Days Present,Days Absent,Full Name
str,str,i64,i64,str
"""Ronna""","""Nellies""",60,3,"""Ronna Nellies"""
"""Rusty""","""Andriulis""",55,8,"""Rusty Andriuli…"
"""Roberta""","""Oakeshott""",45,18,"""Roberta Oakesh…"
"""Lola""","""Rubinfajn""",43,20,"""Lola Rubinfajn…"
"""Kamila""","""Benedtti""",24,39,"""Kamila Benedtt…"


In [114]:
# Student -> year group lookup, keyed by first and last name.
year_group_df = pl.read_csv(source='data/Year Groups.csv')
year_group_df.head(5)

First Name,Last Name,Year Group
str,str,i64
"""Ronna""","""Nellies""",3
"""Rusty""","""Andriulis""",2
"""Roberta""","""Oakeshott""",5
"""Lola""","""Rubinfajn""",3
"""Kamila""","""Benedtti""",7


In [115]:
# Attach each student's year group, total their present/absent days across
# all attendance rows, then derive the yearly attendance percentage and a
# dense rank over it (rank 1 = highest rate; equal rates share a rank).
attendance_yearly_df = (
    attendance_df
    .join(year_group_df, on=['First Name', 'Last Name'], how='inner')
    .group_by('Full Name', 'Year Group')
    .agg(
        pl.col('Days Present').sum(),
        pl.col('Days Absent').sum(),
    )
    .with_columns(
        # present / (present + absent) as a percentage, rounded to 2 decimals.
        (
            pl.col('Days Present')
            / (pl.col('Days Present') + pl.col('Days Absent'))
            * 100
        )
        .round(2)
        .alias('Year Attendance Rate')
    )
    .with_columns(
        pl.col('Year Attendance Rate')
        .rank(method='dense', descending=True)
        .alias('Rank')
    )
)
attendance_yearly_df.head()

Full Name,Year Group,Days Present,Days Absent,Year Attendance Rate,Rank
str,i64,i64,i64,f64,u32
"""Avery Colebour…",2,114,76,60.0,64
"""Valentino Klim…",6,134,56,70.53,44
"""King Truswell""",5,141,49,74.21,37
"""Den Masters""",7,142,48,74.74,36
"""Arabele Rosena…",7,109,81,57.37,69


In [116]:
# Every student holding dense rank 1, i.e. all students tied on the highest
# yearly attendance rate.
best_attendance_rate = (
    attendance_yearly_df
    .select('Full Name', 'Year Attendance Rate', 'Rank')
    .filter(pl.col('Rank') == 1)
)
best_attendance_rate

Full Name,Year Attendance Rate,Rank
str,f64,u32
"""Cinnamon Stoyl…",96.32,1
"""Chickie Asch""",96.32,1


In [117]:
# Share of each year group to keep (0.05 = top 5% by attendance rate).
TOP_FRACTION = 0.05

# Window expression: 5% of the row count of each 'Year Group'.
# NOTE(review): nothing in this cell uses it; kept in case another cell does.
top_5_perc_stu_cnt = ( pl.count().over('Year Group') )*TOP_FRACTION

# One row per year group holding a list of the attendance rates of its
# best-ranked students; the list length is ceil(group size * TOP_FRACTION).
top_5_perc_list_df =  (
    attendance_yearly_df
    # Rank 1 is the highest rate, so ascending 'Rank' puts the best first;
    # group_by keeps that row order within each group, so head() takes the top.
    .sort('Rank')
    .group_by('Year Group')
    .agg(
        # Aggregate only the rate: 'Year Group' is already emitted as the
        # group key, and selecting it here again would yield a second output
        # column with the same name.
        pl.col('Year Attendance Rate')
        .head(
            (pl.count() * TOP_FRACTION)
            .ceil()
        )
    )
)

top_5_perc_list_df

Year Group,Year Attendance Rate
i64,list[f64]
2,"[92.11, 88.95, … 85.26]"
1,"[87.89, 86.84, … 84.21]"
7,"[96.32, 95.26, … 85.79]"
3,"[92.63, 92.11, … 80.0]"
5,"[92.11, 89.47, … 87.37]"
6,"[96.32, 91.58, … 86.32]"
4,"[91.58, 88.42, … 85.26]"


In [119]:
# Bring each year group's list of top rates onto every student row (the
# list column collides with the student's own rate column, so it arrives as
# 'Year Attendance Rate List'), then keep the students whose rate appears in
# that list. Matching is by value, so students tied with a listed rate are
# all kept.
top_5_attendance = (
    attendance_yearly_df
    .join(
        top_5_perc_list_df,
        on='Year Group',
        how='inner',
        suffix=' List',
    )
    .filter(
        pl.col('Year Attendance Rate')
        .is_in(pl.col('Year Attendance Rate List'))
    )
    .select(['Year Group', 'Full Name', 'Year Attendance Rate'])
)
top_5_attendance

Year Group,Full Name,Year Attendance Rate
i64,str,f64
2,"""Tiffy Glanz""",86.84
5,"""Zarah Symingto…",89.47
3,"""Ronica Propper…",80.0
6,"""Nick Jurzyk""",91.58
5,"""Gil Hele""",87.37
7,"""Jami Plunket""",86.32
6,"""Elvin Lewinton…",86.84
4,"""Alane Beidebek…",88.42
6,"""Cinnamon Stoyl…",96.32
1,"""Leese Leinster…",84.74
