# Chapter 8: Reshaping and Tidying Data

## Technical Requirements

In [20]:
import polars as pl

In [21]:
df = pl.read_csv('../data/academic.csv')
df.head()

year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,f64,f64,f64,f64,f64
"""1948/49""",25464,2403400.0,,,,
"""1949/50""",26433,2445000.0,,,,
"""1950/51""",29813,2281000.0,,,,
"""1951/52""",30462,2102000.0,,,,
"""1952/53""",33675,2134000.0,,,,


In [22]:
from polars import selectors as cs
df = (
    df
    .select(
        pl.col('year').alias('academic_year'), 
        cs.numeric().cast(pl.Int64)
    )
    .filter(pl.col('academic_year').str.slice(0,4).cast(pl.Int32)>=2018)
)

In [23]:
df.head()

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


## Turning columns into rows

### How to do it...

In [24]:
long_df = df.melt(
    id_vars='academic_year', 
    value_vars=[
        'students',
        'us_students',
        'undergraduate',
        'graduate',
        'non_degree',
        'opt'
    ],
    variable_name='student_type',
    value_name='count'
)
long_df.head()

academic_year,student_type,count
str,str,i64
"""2018/19""","""students""",1095299
"""2019/20""","""students""",1075496
"""2020/21""","""students""",914095
"""2021/22""","""students""",948519
"""2022/23""","""students""",1057188


In [25]:
long_df.select('student_type').unique()

student_type
str
"""students"""
"""us_students"""
"""undergraduate"""
"""non_degree"""
"""graduate"""
"""opt"""


In [26]:
df.melt(
    id_vars='academic_year', 
    value_vars=cs.numeric()
).head()

academic_year,variable,value
str,str,i64
"""2018/19""","""students""",1095299
"""2019/20""","""students""",1075496
"""2020/21""","""students""",914095
"""2021/22""","""students""",948519
"""2022/23""","""students""",1057188


In [27]:
lf = df.lazy()

In [28]:
(
    lf
    .melt(
        id_vars='academic_year', 
        value_vars=cs.numeric(),
        variable_name='student_type',
        value_name='count'
    )
    .collect()
    .head()
)

academic_year,student_type,count
str,str,i64
"""2018/19""","""students""",1095299
"""2019/20""","""students""",1075496
"""2020/21""","""students""",914095
"""2021/22""","""students""",948519
"""2022/23""","""students""",1057188


## Turning rows into columns

### Getting ready

In [29]:
from polars import selectors as cs

long_df = (
    pl.read_csv('../data/academic.csv')
    .select(
        pl.col('year').alias('academic_year'), 
        cs.numeric().cast(pl.Int64)
    )
    .filter(
        pl.col('academic_year').str.slice(0,4).cast(pl.Int32)>=2018
    )
    .melt(
        id_vars='academic_year', 
        value_vars=[
            'students',
            'us_students',
            'undergraduate',
            'graduate',
            'non_degree',
            'opt'
        ],
        variable_name='student_type',
        value_name='count'
    
    )
)

In [30]:
long_df.head()

academic_year,student_type,count
str,str,i64
"""2018/19""","""students""",1095299
"""2019/20""","""students""",1075496
"""2020/21""","""students""",914095
"""2021/22""","""students""",948519
"""2022/23""","""students""",1057188


### How to do it...

In [31]:
(
    long_df
    .pivot(
        index='academic_year', 
        values='count', 
        columns='student_type'
    )
)

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


In [32]:
(
    long_df
    .group_by('academic_year')
    .agg(
        pl.col('count').filter(pl.col('student_type')=='students').sum().alias('students'),
        pl.col('count').filter(pl.col('student_type')=='us_students').sum().alias('us_students'),
        pl.col('count').filter(pl.col('student_type')=='undergraduate').sum().alias('undergraduate'),
        pl.col('count').filter(pl.col('student_type')=='graduate').sum().alias('graduate'),
        pl.col('count').filter(pl.col('student_type')=='non_degree').sum().alias('non_degree'),
        pl.col('count').filter(pl.col('student_type')=='opt').sum().alias('opt'),
    )
)

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2022/23""",1057188,18961280,347602,467027,43766,198793
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759


In [33]:
student_types = [ 
    col for col in long_df.select('student_type').unique().to_series().to_list() 
] 

agg_cols = [ 
    ( 
        pl.col('count') 
        .filter(pl.col('student_type')==stu_type) 
        .sum() 
        .alias(stu_type)  
    ) 
    for stu_type in student_types 
] 

( 
    long_df 
    .group_by('academic_year') 
    .agg(agg_cols) 
) 

academic_year,graduate,us_students,students,undergraduate,opt,non_degree
str,i64,i64,i64,i64,i64,i64
"""2019/20""",374435,19720000,1075496,419321,223539,58201
"""2021/22""",385097,20327000,948519,344532,184759,34131
"""2022/23""",467027,18961280,1057188,347602,198793,43766
"""2020/21""",329272,19744000,914095,359787,203885,21151
"""2018/19""",377943,19828000,1095299,431930,223085,62341


In [34]:
long_df.unstack(step=5, columns='count', how='vertical')

count_0,count_1,count_2,count_3,count_4,count_5
i64,i64,i64,i64,i64,i64
1095299,19828000,431930,377943,62341,223085
1075496,19720000,419321,374435,58201,223539
914095,19744000,359787,329272,21151,203885
948519,20327000,344532,385097,34131,184759
1057188,18961280,347602,467027,43766,198793


### There is more...

In [35]:
wide_df_with_dups = (
    pl.concat([
        df.head(1),
        df
    ])
)

wide_df_with_dups

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


In [36]:
long_df_with_dups = (
    wide_df_with_dups
    .melt(
        id_vars='academic_year', 
        value_vars=[
            'students',
            'us_students',
            'undergraduate',
            'graduate',
            'non_degree',
            'opt'
        ],
        variable_name='student_type',
        value_name='count'
    )
)

In [37]:
(
    long_df_with_dups
    .pivot(
        index='academic_year', 
        values='count', 
        columns='student_type'
    )
)

ComputeError: found multiple elements in the same group, please specify an aggregation function

In [38]:
(
    long_df_with_dups
    .pivot(
        index='academic_year', 
        values='count', 
        columns='student_type',
        aggregate_function='min'
    )
)

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


In [39]:
(
    long_df_with_dups
    .pivot(
        index='academic_year', 
        values='count', 
        columns='student_type',
        aggregate_function=pl.element()
    )
)

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,list[i64],list[i64],list[i64],list[i64],list[i64],list[i64]
"""2018/19""","[1095299, 1095299]","[19828000, 19828000]","[431930, 431930]","[377943, 377943]","[62341, 62341]","[223085, 223085]"
"""2019/20""",[1075496],[19720000],[419321],[374435],[58201],[223539]
"""2020/21""",[914095],[19744000],[359787],[329272],[21151],[203885]
"""2021/22""",[948519],[20327000],[344532],[385097],[34131],[184759]
"""2022/23""",[1057188],[18961280],[347602],[467027],[43766],[198793]


In [40]:
agg_cols = [ 
    ( 
        pl.col('count') 
        .filter(pl.col('student_type')==stu_type) 
        .alias(stu_type)
    )
    for stu_type in student_types
]

(
    long_df_with_dups
    .group_by('academic_year')
    .agg(agg_cols)
)


academic_year,graduate,us_students,students,undergraduate,opt,non_degree
str,list[i64],list[i64],list[i64],list[i64],list[i64],list[i64]
"""2019/20""",[374435],[19720000],[1075496],[419321],[223539],[58201]
"""2022/23""",[467027],[18961280],[1057188],[347602],[198793],[43766]
"""2020/21""",[329272],[19744000],[914095],[359787],[203885],[21151]
"""2018/19""","[377943, 377943]","[19828000, 19828000]","[1095299, 1095299]","[431930, 431930]","[223085, 223085]","[62341, 62341]"
"""2021/22""",[385097],[20327000],[948519],[344532],[184759],[34131]


## Joining DataFrames

### Getting ready

In [41]:
from polars import selectors as cs

academic_df = (
    pl.read_csv('../data/academic.csv')
    .select(
        pl.col('year').alias('academic_year'), 
        cs.numeric().cast(pl.Int64)
    )
    .filter(pl.col('academic_year').str.slice(0,4).cast(pl.Int32)>=2018)
)

In [42]:
academic_df

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


In [43]:
status_df = (
    pl.read_csv('../data/status.csv')
    .with_columns(
        cs.float().cast(pl.Int64)
    )
)
status_df

year,female,male,single,married,full_time,part_time,visa_f,visa_j,visa_other
str,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""2007/08""",278841,344964,543958,79847,575772,48033,552691,31814,39300
"""2008/09""",304242,367374,591694,79922,613185,58431,589007,39625,42984
"""2009/10""",309534,381389,615612,75311,637722,53201,612158,38692,40073
"""2010/11""",322582,400695,653842,69435,669031,54246,645163,40504,37610
"""2011/12""",338671,425824,690339,74156,714038,50457,688810,42047,33638
"""2012/13""",363922,455722,744237,75407,769646,49998,747515,42621,29508
"""2013/14""",390749,495303,806307,79745,829345,56707,804535,49619,31898
"""2014/15""",426043,548883,886208,88718,912531,62395,881333,58496,35097
"""2015/16""",451982,591857,950937,92902,971814,72025,957200,52192,34447
"""2016/17""",470366,608456,970940,107882,1007620,71202,994674,45311,38837


### How to do it...

In [44]:
joined_df = (
    academic_df
    .join(
        status_df,
        left_on='academic_year',
        right_on='year',
        how='inner'
    )
    .select(
        'academic_year',
        'students',
        cs.contains('visa')
    )
)
joined_df.head()

academic_year,students,visa_f,visa_j,visa_other
str,i64,i64,i64,i64
"""2018/19""",1095299,1018628,44907,31764
"""2019/20""",1075496,1000211,44095,31190
"""2020/21""",914095,860163,16454,37478
"""2021/22""",948519,884020,27507,36992
"""2022/23""",1057188,989528,34887,32773


In [109]:
import plotly.express as px

viz_df = (
    joined_df
    .melt(
        id_vars=['academic_year', 'students'],
        value_vars=cs.contains('visa'),
        variable_name='visa_type',
        value_name='count'
    )
    .with_columns(
        (pl.col('count') / pl.col('students')).alias('percent_of_total')
    )
)

fig = px.bar(
    x=viz_df['academic_year'], 
    y=viz_df['percent_of_total'], 
    color=viz_df['visa_type'], 
    barmode = 'stack',
    text_auto='.1%',
    title='International Student Count by Visa Type',
    labels={
        'x': 'Year',
        'y': 'Student Count'
    },
)

fig.update_layout(
    autosize=True,
    uniformtext_minsize=10, 
    uniformtext_mode='hide',
    yaxis_tickformat = '0%'
)

fig.update_traces(textposition='inside')

fig.show()


In [46]:
status_long_df = (
    status_df
    .melt(
        id_vars='year',
        value_vars=cs.contains('visa'),
        variable_name='visa_type',
        value_name='count'
    )
)
status_long_df.head()

year,visa_type,count
str,str,i64
"""2007/08""","""visa_f""",552691
"""2008/09""","""visa_f""",589007
"""2009/10""","""visa_f""",612158
"""2010/11""","""visa_f""",645163
"""2011/12""","""visa_f""",688810


In [47]:
(
    academic_df
    .join(
        status_long_df,
        left_on='academic_year',
        right_on='year',
        how='inner',
        validate='1:m'
    )
    .select(
        'academic_year',
        'students',
        'visa_type',
        'count'
    )
)

academic_year,students,visa_type,count
str,i64,str,i64
"""2018/19""",1095299,"""visa_f""",1018628
"""2019/20""",1075496,"""visa_f""",1000211
"""2020/21""",914095,"""visa_f""",860163
"""2021/22""",948519,"""visa_f""",884020
"""2022/23""",1057188,"""visa_f""",989528
"""2018/19""",1095299,"""visa_j""",44907
"""2019/20""",1075496,"""visa_j""",44095
"""2020/21""",914095,"""visa_j""",16454
"""2021/22""",948519,"""visa_j""",27507
"""2022/23""",1057188,"""visa_j""",34887


In [48]:
(
    academic_df
    .join(
        status_long_df,
        left_on='academic_year',
        right_on='year',
        how='inner',
        validate='1:1'
    )
    .select(
        'academic_year',
        'students',
        'visa_type',
        'count'
    )
)

ComputeError: the join keys did not fulfil 1:1 validation

### There is more...

In [49]:
a = pl.DataFrame({
    'int': [1,2,3], 'value': [10,20,30]
}).set_sorted('int')

b = pl.DataFrame({
    'int': [4,5,6]
}).set_sorted('int')

b.join_asof(a, on='int', strategy='backward')

int,value
i64,i64
4,30
5,30
6,30


## Concatenating DataFrames

### Getting ready

In [50]:
academic_df

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


In [51]:
df1 = academic_df.head(2)
df2 = academic_df.slice(2, 2)
df3 = academic_df.tail(1)
display(df1, df2, df3)

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539


academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759


academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2022/23""",1057188,18961280,347602,467027,43766,198793


### How to do it...

In [52]:
pl.concat(
    [df1, df2, df3],
    how='vertical'
)

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


In [53]:
df1.vstack(df2).vstack(df3)

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


In [54]:
pl.concat(
    [
        df1.select('academic_year'), 
        df2.select('students')
    ],
    how='horizontal'
)

academic_year,students
str,i64
"""2018/19""",914095
"""2019/20""",948519


In [55]:
pl.concat(
    [
        df1.select('academic_year'), 
        df2.select('students'),
        df3.select('us_students')
    ],
    how='horizontal'
)

academic_year,students,us_students
str,i64,i64
"""2018/19""",914095,18961280.0
"""2019/20""",948519,


In [56]:
(
    df1
    .select('academic_year')
    .hstack(df2.select('students'))
)

academic_year,students
str,i64
"""2018/19""",914095
"""2019/20""",948519


In [57]:
(
    df1
    .select('academic_year')
    .hstack(df2.select('students'))
    .hstack(df3.select('us_students'))
)

ShapeError: could not create a new DataFrame: series "academic_year" has length 2 while series "us_students" has length 1

### There is more...

In [58]:
df1.extend(df2).extend(df3)

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


In [59]:
df1

academic_year,students,us_students,undergraduate,graduate,non_degree,opt
str,i64,i64,i64,i64,i64,i64
"""2018/19""",1095299,19828000,431930,377943,62341,223085
"""2019/20""",1075496,19720000,419321,374435,58201,223539
"""2020/21""",914095,19744000,359787,329272,21151,203885
"""2021/22""",948519,20327000,344532,385097,34131,184759
"""2022/23""",1057188,18961280,347602,467027,43766,198793


## Other reshaping techniques

### How to do it...

In [125]:
academic_df.partition_by('academic_year')

[shape: (1, 7)
 ┌───────────────┬──────────┬─────────────┬───────────────┬──────────┬────────────┬────────┐
 │ academic_year ┆ students ┆ us_students ┆ undergraduate ┆ graduate ┆ non_degree ┆ opt    │
 │ ---           ┆ ---      ┆ ---         ┆ ---           ┆ ---      ┆ ---        ┆ ---    │
 │ str           ┆ i64      ┆ i64         ┆ i64           ┆ i64      ┆ i64        ┆ i64    │
 ╞═══════════════╪══════════╪═════════════╪═══════════════╪══════════╪════════════╪════════╡
 │ 2018/19       ┆ 1095299  ┆ 19828000    ┆ 431930        ┆ 377943   ┆ 62341      ┆ 223085 │
 └───────────────┴──────────┴─────────────┴───────────────┴──────────┴────────────┴────────┘,
 shape: (1, 7)
 ┌───────────────┬──────────┬─────────────┬───────────────┬──────────┬────────────┬────────┐
 │ academic_year ┆ students ┆ us_students ┆ undergraduate ┆ graduate ┆ non_degree ┆ opt    │
 │ ---           ┆ ---      ┆ ---         ┆ ---           ┆ ---      ┆ ---        ┆ ---    │
 │ str           ┆ i64      ┆ i64      

In [127]:
academic_df.transpose(include_header=True)

column,column_0,column_1,column_2,column_3,column_4
str,str,str,str,str,str
"""academic_year""","""2018/19""","""2019/20""","""2020/21""","""2021/22""","""2022/23"""
"""students""","""1095299""","""1075496""","""914095""","""948519""","""1057188"""
"""us_students""","""19828000""","""19720000""","""19744000""","""20327000""","""18961280"""
"""undergraduate""","""431930""","""419321""","""359787""","""344532""","""347602"""
"""graduate""","""377943""","""374435""","""329272""","""385097""","""467027"""
"""non_degree""","""62341""","""58201""","""21151""","""34131""","""43766"""
"""opt""","""223085""","""223539""","""203885""","""184759""","""198793"""


In [141]:
(
    academic_df
    .select(
        pl.col('academic_year', 'students').reshape((1, 5))
    )      
)

academic_year,students
list[str],list[i64]
"[""2018/19"", ""2019/20"", … ""2022/23""]","[1095299, 1075496, … 1057188]"
