# Chapter 8: Reshaping and tidying data

## Technical requirements

In [1]:
import polars as pl

In [2]:
# Load the raw academic.csv file (path is relative to the notebook's folder)
# into an eager DataFrame.
df = pl.read_csv(source='../data/academic.csv')
# Preview the first five rows to check column names and inferred dtypes.
df.head(n=5)

year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,f64,f64,f64,f64,f64
"""1948/49""",25464,2403400.0,,,,
"""1949/50""",26433,2445000.0,,,,
"""1950/51""",29813,2281000.0,,,,
"""1951/52""",30462,2102000.0,,,,
"""1952/53""",33675,2134000.0,,,,


In [3]:
from polars import selectors as cs
# Keep the year label (renamed to `academic_year`) plus every float column
# cast to Int64, then retain only academic years starting in 2018 or later.
#
# `cs.float()` matches float columns only, so the integer `students` column
# from the raw file is not carried over into the result.
#
# NOTE: this rebinds `df`. Once it has run, the `year` column no longer
# exists, so re-running this cell without re-running the load cell first
# fails on `pl.col('year')`.
df = (
    df
    .select(
        pl.col('year').alias('academic_year'),
        cs.float().cast(pl.Int64),
    )
    .filter(
        # The label looks like "2018/19": the first four characters are the
        # starting calendar year, compared numerically against 2018.
        pl.col('academic_year')
        .str.slice(offset=0, length=4)
        .cast(pl.Int32)
        .ge(2018)
    )
)

In [4]:
# Show the first five rows of the tidied, filtered frame.
df.head(n=5)

academic_year,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64
"""2018/19""",19828000,431930,377943,62341,223085
"""2019/20""",19720000,419321,374435,58201,223539
"""2020/21""",19744000,359787,329272,21151,203885
"""2021/22""",20327000,344532,385097,34131,184759
"""2022/23""",18961280,347602,467027,43766,198793


## Turning columns into rows

### How to do it...

In [17]:
# Reshape wide -> long: each listed count column becomes a set of rows keyed
# by `academic_year`, with the former column name stored in `student_type`
# and its value stored in `count`.
# NOTE(review): newer Polars releases rename `melt` to `unpivot`; confirm
# against the Polars version this notebook targets.
long_df = df.melt(
    variable_name='student_type',
    value_name='count',
    id_vars='academic_year',
    value_vars=['us_students', 'undergraduate', 'graduate', 'non_degree', 'opt'],
)
# Preview the first five rows of the long frame.
long_df.head(n=5)

academic_year,student_type,count
str,str,i64
"""2018/19""","""us_students""",19828000
"""2019/20""","""us_students""",19720000
"""2020/21""","""us_students""",19744000
"""2021/22""","""us_students""",20327000
"""2022/23""","""us_students""",18961280


In [19]:
# List the distinct labels that ended up in `student_type`; one per melted
# column. Row order of the result is not guaranteed by `unique()`.
long_df.select(pl.col('student_type')).unique()

student_type
str
"""opt"""
"""non_degree"""
"""undergraduate"""
"""us_students"""
"""graduate"""


In [23]:
# Same reshape, but the value columns are chosen with a selector instead of
# an explicit list. The output columns keep the default names `variable`
# and `value` because no custom names are passed.
(
    df
    .melt(id_vars='academic_year', value_vars=cs.numeric())
    .head(n=5)
)

academic_year,variable,value
str,str,i64
"""2018/19""","""us_students""",19828000
"""2019/20""","""us_students""",19720000
"""2020/21""","""us_students""",19744000
"""2021/22""","""us_students""",20327000
"""2022/23""","""us_students""",18961280


In [20]:
# Build a lazy view of the filtered `df`; the lazy melt below is only
# executed when `.collect()` is called.
lf = df.lazy()

In [24]:
# Wide -> long reshape through the lazy API, using the same arguments as the
# eager version: `academic_year` stays as the identifier, every numeric
# column is melted, and the outputs are named `student_type` / `count`.
(
    lf
    .melt(
        id_vars='academic_year',
        value_vars=cs.numeric(),
        variable_name='student_type',
        value_name='count'
    )
    # Apply the five-row limit inside the lazy plan, before collecting, so
    # the query does not have to materialise the whole long frame just to
    # display a preview. The rows returned are the same first five rows.
    .head()
    .collect()
)

academic_year,student_type,count
str,str,i64
"""2018/19""","""us_students""",19828000
"""2019/20""","""us_students""",19720000
"""2020/21""","""us_students""",19744000
"""2021/22""","""us_students""",20327000
"""2022/23""","""us_students""",18961280
