# Chapter 8: Reshaping and tidying data

## Technical Requirements

In [None]:
import polars as pl

In [None]:
# Load the raw academic dataset and preview its first five rows.
df = pl.read_csv(source='../data/academic.csv')
df.head(n=5)

In [None]:
from polars import selectors as cs
# Build `df` straight from the CSV instead of from the previous `df`.
# Re-assigning `df = df.select(...)` makes the cell fail on a second run,
# because the 'year' column has already been renamed to 'academic_year'.
df = (
    pl.read_csv('../data/academic.csv')
    .select(
        # Keep the year label under a clearer name...
        pl.col('year').alias('academic_year'),
        # ...plus every numeric column, cast to Int64.
        cs.numeric().cast(pl.Int64)
    )
    # Year labels look like "2018/19": compare the first four characters
    # as an integer to keep 2018 onwards.
    .filter(pl.col('academic_year').str.slice(0, 4).cast(pl.Int32) >= 2018)
)

In [None]:
# Preview the filtered, recast frame.
df.head(n=5)

## Turning columns into rows

### How to do it...

In [None]:
# Turn the six student-count columns into (student_type, count) rows,
# repeating 'academic_year' as the identifier on every row.
long_df = df.melt(
    id_vars='academic_year',
    variable_name='student_type',
    value_name='count',
    value_vars=[
        'students', 'us_students', 'undergraduate',
        'graduate', 'non_degree', 'opt',
    ],
)
long_df.head()

In [None]:
# List the distinct values that ended up in the new 'student_type' column.
long_df.select(pl.col('student_type')).unique()

In [None]:
# A column selector can be passed as value_vars instead of a list of names.
(
    df
    .melt(id_vars='academic_year', value_vars=cs.numeric())
    .head()
)

In [None]:
# Lazy view of `df`, used to run the same melt through the lazy API.
lf = df.lazy()

In [None]:
# Same unpivot, expressed on the LazyFrame. head() is applied before collect()
# so the row limit is part of the lazy query instead of being applied to an
# already fully materialized DataFrame.
(
    lf
    .melt(
        id_vars='academic_year',
        value_vars=cs.numeric(),
        variable_name='student_type',
        value_name='count'
    )
    .head()
    .collect()
)

## Turning rows into columns

### Getting ready

In [None]:
from polars import selectors as cs

# Self-contained rebuild of the long-format frame for this recipe:
# load -> rename year / cast numerics -> keep 2018 onwards -> melt.
long_df = (
    pl.read_csv('../data/academic.csv')
    .select(
        pl.col('year').alias('academic_year'),
        cs.numeric().cast(pl.Int64),
    )
    .filter(pl.col('academic_year').str.slice(0, 4).cast(pl.Int32) >= 2018)
    .melt(
        id_vars='academic_year',
        variable_name='student_type',
        value_name='count',
        value_vars=[
            'students', 'us_students', 'undergraduate',
            'graduate', 'non_degree', 'opt',
        ],
    )
)

In [None]:
# Preview the long-format frame used as input for the pivot examples.
long_df.head(n=5)

### How to do it...

In [None]:
# Spread 'student_type' back into columns: one row per academic year,
# one column per student type, filled from 'count'.
long_df.pivot(
    values='count',
    index='academic_year',
    columns='student_type',
)

In [None]:
# Hand-written pivot: per academic year, sum the counts belonging to each
# student type. Keyword arguments name the output columns.
(
    long_df
    .group_by('academic_year')
    .agg(
        students=pl.col('count').filter(pl.col('student_type') == 'students').sum(),
        us_students=pl.col('count').filter(pl.col('student_type') == 'us_students').sum(),
        undergraduate=pl.col('count').filter(pl.col('student_type') == 'undergraduate').sum(),
        graduate=pl.col('count').filter(pl.col('student_type') == 'graduate').sum(),
        non_degree=pl.col('count').filter(pl.col('student_type') == 'non_degree').sum(),
        opt=pl.col('count').filter(pl.col('student_type') == 'opt').sum(),
    )
)

In [None]:
# Distinct student types in first-seen order. maintain_order=True keeps the
# generated column order stable between runs; a plain unique() gives no
# ordering guarantee. to_list() already returns a list, so no extra
# comprehension is needed.
student_types = (
    long_df
    .get_column('student_type')
    .unique(maintain_order=True)
    .to_list()
)

# One conditional sum per student type, each named after its type.
agg_cols = [
    (
        pl.col('count')
        .filter(pl.col('student_type') == stu_type)
        .sum()
        .alias(stu_type)
    )
    for stu_type in student_types
]

# Same result shape as the hand-written aggregation, without listing the
# student types by hand.
(
    long_df
    .group_by('academic_year')
    .agg(agg_cols)
)

In [None]:
# unstack() cuts the 'count' column into consecutive chunks of `step` rows and
# lays each chunk out as its own column. The chunk size is the number of
# distinct academic years, derived from the data rather than hard-coded.
# Assumes long_df still has the row order produced by melt (all years for one
# student type, then all years for the next).
long_df.unstack(
    step=long_df.get_column('academic_year').n_unique(),
    columns='count',
    how='vertical',
)

### There is more...

In [None]:
# Prepend a copy of the first row so that one academic year appears twice.
wide_df_with_dups = pl.concat([df.head(1), df])

wide_df_with_dups

In [None]:
# Long-format version of the frame that contains the duplicated year.
long_df_with_dups = wide_df_with_dups.melt(
    id_vars='academic_year',
    variable_name='student_type',
    value_name='count',
    value_vars=[
        'students', 'us_students', 'undergraduate',
        'graduate', 'non_degree', 'opt',
    ],
)

In [None]:
# Pivot without an aggregate function, on data where one
# (academic_year, student_type) pair occurs more than once.
# NOTE(review): depending on the Polars version this may raise or silently
# pick a value -- confirm which behaviour this cell is meant to show.
long_df_with_dups.pivot(
    values='count',
    index='academic_year',
    columns='student_type',
)

In [None]:
# Resolve the duplicated pair by reducing each cell with 'min'.
long_df_with_dups.pivot(
    values='count',
    index='academic_year',
    columns='student_type',
    aggregate_function='min',
)

In [None]:
# Pass an expression as the aggregate function. pl.element() is used on its
# own, with no reduction applied to it.
# NOTE(review): presumably each cell then holds all values of its group --
# confirm the resulting dtypes on your Polars version.
long_df_with_dups.pivot(
    values='count',
    index='academic_year',
    columns='student_type',
    aggregate_function=pl.element(),
)

In [None]:
# Same per-type filter as the earlier dynamic aggregation, but without sum():
# every matching count for the group is kept rather than reduced.
# Relies on `student_types` from the earlier cell.
agg_cols = [
    pl.col('count').filter(pl.col('student_type') == stu_type).alias(stu_type)
    for stu_type in student_types
]

long_df_with_dups.group_by('academic_year').agg(agg_cols)


## Joining DataFrames

### Getting ready

In [None]:
from polars import selectors as cs

# Left side of the joins: academic-year label plus Int64 numeric columns,
# restricted to years starting in 2018 or later.
academic_df = (
    pl.read_csv('../data/academic.csv')
    .select(
        pl.col('year').alias('academic_year'),
        cs.numeric().cast(pl.Int64),
    )
    .filter(
        pl.col('academic_year').str.slice(0, 4).cast(pl.Int32) >= 2018
    )
)

In [None]:
# Display the filtered academic frame that serves as the left side of the joins.
academic_df

In [None]:
# Right side of the joins: the status dataset with its float columns cast to
# Int64.
status_df = pl.read_csv('../data/status.csv').with_columns(
    cs.float().cast(pl.Int64)
)
status_df

### How to do it...

In [337]:
# Inner-join the two frames on the year label (named differently on each
# side), then keep the year, the total student count and the visa columns.
joined_df = (
    academic_df
    .join(status_df, how='inner', left_on='academic_year', right_on='year')
    .select('academic_year', 'students', cs.contains('visa'))
)
joined_df.head()

academic_year,students,visa_f,visa_j,visa_other
str,i64,i64,i64,i64
"""2018/19""",1095299,1018628,44907,31764
"""2019/20""",1075496,1000211,44095,31190
"""2020/21""",914095,860163,16454,37478
"""2021/22""",948519,884020,27507,36992
"""2022/23""",1057188,989528,34887,32773


In [360]:
import plotly.express as px

# Reshape the visa columns into long form so each bar segment is one
# (academic_year, visa_type) row.
viz_df = joined_df.melt(
    id_vars=['academic_year', 'students'],
    value_vars=cs.contains('visa'),
    variable_name='visa_type',
    value_name='count',
)

# Stacked bar chart: one bar per academic year, coloured by visa type.
fig = px.bar(
    x=viz_df['academic_year'],
    y=viz_df['count'],
    color=viz_df['visa_type'],
    title='International Student Count by Visa Type',
    barmode='stack',
    labels={'x': 'Year', 'y': 'Student Count'},
)
fig.show()


In [354]:
# Long-format status data: one row per (year, visa_type) pair.
status_long_df = status_df.melt(
    id_vars='year',
    value_vars=cs.contains('visa'),
    variable_name='visa_type',
    value_name='count',
)
status_long_df.head()

year,visa_type,count
str,str,i64
"""2007/08""","""visa_f""",552691
"""2008/09""","""visa_f""",589007
"""2009/10""","""visa_f""",612158
"""2010/11""","""visa_f""",645163
"""2011/12""","""visa_f""",688810


In [359]:
# Join against the long status frame and ask Polars to validate that the
# relationship is one-to-many ('1:m').
(
    academic_df
    .join(
        status_long_df,
        how='inner',
        left_on='academic_year',
        right_on='year',
        validate='1:m',
    )
    .select('academic_year', 'students', 'visa_type', 'count')
)

academic_year,students,visa_type,count
str,i64,str,i64
"""2018/19""",1095299,"""visa_f""",1018628
"""2019/20""",1075496,"""visa_f""",1000211
"""2020/21""",914095,"""visa_f""",860163
"""2021/22""",948519,"""visa_f""",884020
"""2022/23""",1057188,"""visa_f""",989528
"""2018/19""",1095299,"""visa_j""",44907
"""2019/20""",1075496,"""visa_j""",44095
"""2020/21""",914095,"""visa_j""",16454
"""2021/22""",948519,"""visa_j""",27507
"""2022/23""",1057188,"""visa_j""",34887


In [362]:
# Same join, but validated as one-to-one. status_long_df holds each year once
# per visa type, so this validation is expected to fail. The error is caught
# and printed so the demonstration stays visible without aborting a
# "Restart & Run All".
try:
    (
        academic_df
        .join(
            status_long_df,
            left_on='academic_year',
            right_on='year',
            how='inner',
            validate='1:1'
        )
        .select(
            'academic_year',
            'students',
            'visa_type',
            'count'
        )
    )
except pl.exceptions.ComputeError as exc:
    print(f'{type(exc).__name__}: {exc}')

ComputeError: the join keys did not fulfil 1:1 validation

### There is more...

In [375]:
# Lookup table: sorted keys 1..3 with their values.
a = pl.DataFrame(
    {
        'int': [1, 2, 3],
        'value': [10, 20, 30],
    }
).set_sorted('int')

# Keys to look up; all of them are larger than every key in `a`.
b = pl.DataFrame({'int': [4, 5, 6]}).set_sorted('int')

# As-of join of `b` against `a` on 'int', using the 'backward' strategy.
b.join_asof(
    a,
    on='int',
    strategy='backward',
)

int,value
i64,i64
4,30
5,30
6,30
