# Chapter 4: Data Transformation Techniques

## Simple aggregations

### How to do it...

In [None]:
import polars as pl

In [None]:
# Load the Contoso sales extract, asking Polars to try parsing date-like columns.
df = pl.read_csv(
    source='../data/contoso_sales.csv',
    try_parse_dates=True,
)

In [None]:
# Preview the first five rows (five is head()'s default).
df.head(n=5)

In [None]:
# Selectors pick columns by dtype rather than by name.
import polars.selectors as cs

# Keep only the numeric columns, then total each of them.
numeric_cols = df.select(cs.numeric())
numeric_cols.sum()

In [None]:
# Pull 'Quantity' out as a Series and total it directly.
s = df.get_column('Quantity')
s.sum()

In [None]:
# Same total, expressed as an expression so the result stays a DataFrame.
df.select(pl.sum('Quantity'))

In [None]:
# First and last customer name in row order, each under its own label.
customer_name = pl.col('Customer Name')
df.select(
    customer_name.first().alias('Cust Name First'),
    customer_name.last().alias('Cust Name Last'),
)

In [None]:
# Summary statistics restricted to the numeric columns.
numeric_only = df.select(cs.numeric())
numeric_only.describe()

### There is more...

In [None]:
# Summing a boolean mask counts the rows where it is true:
# here, the number of rows with Quantity of at least 4.
df.select(pl.col('Quantity').ge(4).sum())

In [None]:
# Total quantity restricted to rows sold through the online store.
is_online = pl.col('Store Name').eq('Online store')
df.select(pl.col('Quantity').filter(is_online).sum())

## Using group by aggregations

### How to do it...

In [None]:
import polars as pl

In [None]:
# Reload the sales data (dates parsed where possible) and preview five rows.
df = pl.read_csv(
    source='../data/contoso_sales.csv',
    try_parse_dates=True,
)
df.head(n=5)

In [None]:
# Calling group_by on its own only builds the grouping object that the cell
# displays; no aggregation is requested here.
df.group_by('Brand')

In [None]:
# Total quantity sold per brand; show the first five groups.
total_quantity = pl.col('Quantity').sum().alias('Sum of Quantity')
df.group_by('Brand').agg(total_quantity).head()

In [None]:
# Per brand: total and mean unit price, listing the five brands with the
# highest average unit price first.
(
    df
    .group_by('Brand')
    .agg(
        # The alias now names the column that is actually summed; it used to
        # read 'Sum of Quantity' although the expression sums 'Unit Price'.
        pl.col('Unit Price').sum().alias('Sum of Unit Price'),
        pl.col('Unit Price').mean().alias('Average Unit Price'),
    )
    .sort('Average Unit Price', descending=True)
    .head()
)

In [None]:
# Per brand: the mean unit price computed two ways (built-in mean vs.
# sum / row count), plus the first customer name and the last category
# encountered in each group.
(
    df
    .group_by('Brand')
    .agg(
        pl.col('Unit Price').mean().round(2).alias('Average Unit Price'),
        (pl.col('Unit Price').sum() / pl.count()).round(2).alias('Average Unit Price 2'),
        pl.col('Customer Name').first(),
        pl.col('Category').last()
    )
    # Each brand appears exactly once after group_by, so sorting on 'Brand'
    # fully determines the row order. The sort on 'Average Unit Price' that
    # used to precede it was always overridden and has been dropped.
    .sort('Brand')
    .head()
)

In [None]:
# Lazy variant of the per-brand aggregation: the CSV is scanned rather than
# read, and nothing is computed until collect() is called.
(
    pl.scan_csv('../data/contoso_sales.csv', try_parse_dates=True)
    .group_by('Brand')
    .agg(
        pl.col('Unit Price').mean().round(2).alias('Average Unit Price'),
        (pl.col('Unit Price').sum() / pl.count()).round(2).alias('Average Unit Price 2'),
        pl.col('Customer Name').first(),
        pl.col('Category').last()
    )
    # Each brand appears exactly once after group_by, so sorting on 'Brand'
    # fully determines the row order. The sort on 'Average Unit Price' that
    # used to precede it was always overridden and has been dropped.
    .sort('Brand')
    .collect()
    .head()
)

In [None]:
# set_fmt_str_lengths is a method and has to be called. The previous
# assignment (`= 50`) replaced the method with an int and changed no setting.
pl.Config.set_fmt_str_lengths(50)
print(df.select('Brand').unique().head(10))

In [None]:
# Up to ten distinct values of the 'Brand' column, displayed as a DataFrame.
df.select('Brand').unique().head(10)

In [None]:
import os

# Set POLARS_FMT_STR_LEN to '50' (environment values must be strings),
# then redisplay up to ten distinct brands.
os.environ['POLARS_FMT_STR_LEN'] = '50'

distinct_brands = df.select('Brand').unique()
distinct_brands.head(10)

### There is more...

In [None]:
# Iterating a group_by yields (group key, group data) pairs; print the key
# together with the type of the object holding that group's rows.
for name, data in df.group_by('Brand'):
    print(f'{name} {type(data)}')

In [None]:
# Aggregating a bare column (no reducing function) collects each brand's
# quantities into one value per group.
quantities = pl.col('Quantity')
df.group_by('Brand').agg(quantities).head()

In [None]:
# Same aggregation, but with maintain_order=True requested on the grouping.
brand_groups = df.group_by('Brand', maintain_order=True)
brand_groups.agg(pl.col('Quantity')).head()

In [None]:
# Group on two existing columns plus a derived key (the order year), then
# take the mean unit price of each group.
order_year = pl.col('Order Date').dt.year().alias('Order Year')
(
    df
    .group_by('Brand', 'Customer Country', order_year)
    .agg(pl.col('Unit Price').mean())
    .head()
)

## Aggregating values across multiple columns

### How to do it...

In [None]:
import polars as pl

In [None]:
# Load the Pokemon dataset and preview the first five rows.
df = pl.read_csv(source='../data/pokemon.csv')
df.head(n=5)

In [None]:
# Row-wise (axis=1) total of the six base-stat columns, labelled 'Total 2'.
# NOTE(review): newer Polars releases are reported to drop the `axis`
# argument in favour of the *_horizontal methods — confirm against the
# pinned Polars version.
base_stats = df.select(['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'])
base_stats.sum(axis=1).alias('Total 2').head(5)

In [None]:
# Add a 'Total 2' column holding the row-wise sum of the six base stats.
stat_names = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
total_expr = pl.sum_horizontal(stat_names).alias('Total 2')
df.with_columns(total_expr).head(5)

In [None]:
# Alternative: pack the six stats into a list per row, then sum each list.
stat_names = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
stats_as_list = pl.concat_list(stat_names)
df.with_columns(stats_as_list.list.sum().alias('Total 2')).head(5)

In [None]:
cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']

# reduce() combines the stat columns pairwise; the combiner here is addition.
stat_total = pl.reduce(
    function=lambda running, stat: running + stat,
    exprs=pl.col(cols),
).alias('Total 2')

df.with_columns(stat_total).head()

In [None]:
# fold() works like reduce() but starts from an explicit accumulator,
# here the literal 100, before adding each stat column.
offset_total = pl.fold(
    acc=pl.lit(100),
    function=lambda running, stat: running + stat,
    exprs=pl.col(cols),
).alias('Total 2')

df.with_columns(offset_total).head()

### There is more...

In [None]:
# Keep rows where every stat column exceeds 80: start from True and AND in
# each per-column comparison.
all_above_80 = pl.fold(
    acc=pl.lit(True),
    function=lambda keep, flag: keep & flag,
    exprs=pl.col(cols).gt(80),
)
df.filter(all_above_80).head()

In [None]:
# Same row filter, using the built-in horizontal AND over the comparisons.
every_stat_above_80 = pl.all_horizontal(pl.col(cols).gt(80))
df.filter(every_stat_above_80).head()

In [None]:
str_cols = ['Name', 'Type 1', 'Type 2']

# Concatenate the three text columns by folding with `+` from an empty string.
str_combined = pl.fold(
    acc=pl.lit(''),
    function=lambda joined, piece: joined + piece,
    exprs=str_cols,
).alias('Str Combined')

# Select the original columns followed by the combined one.
str_cols += [str_combined]
df.select(str_cols).head()

In [None]:
str_cols = ['Name', 'Type 1', 'Type 2']

# Built-in string concatenation of the three columns, with no separator.
joined_text = pl.concat_str(str_cols, separator='')
df.select(joined_text).head()

## Computing over groups with window functions

### How to do it...

In [None]:
import polars as pl

In [None]:
import os

# Environment values must be strings, hence '50' rather than 50.
os.environ['POLARS_FMT_STR_LEN'] = '50'

In [None]:
# Load the sales data and derive 'Sales Amount' = Quantity x Net Price,
# rounded to two decimals.
sales_amount = (pl.col('Quantity') * pl.col('Net Price')).round(2).alias('Sales Amount')
df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True).with_columns(sales_amount)
df.head()

In [None]:
# Window aggregation: the category-level sales total is attached to each
# selected row through over('Category').
cat_total = pl.col('Sales Amount').sum().over('Category').alias('Sales Amt per Cat')
sales_by_cat = df.select('Category', 'Subcategory', cat_total)
sales_by_cat.head()

In [None]:
# Distinct rows of the 'Audio' category only.
audio_rows = sales_by_cat.filter(pl.col('Category').eq('Audio'))
audio_rows.unique().head()

In [None]:
# Show the shape of the source frame next to that of the windowed result.
tuple(frame.shape for frame in (df, sales_by_cat))

In [None]:
# Mean sales amount windowed over (Category, Brand), shown for the
# 'Computers' category with duplicate rows removed.
avg_by_cat_brand = (
    pl.col('Sales Amount')
    .mean()
    .over('Category', 'Brand')
    .alias('Avg Sales per Cat and Brand')
)
(
    df
    .select('Category', 'Brand', 'Subcategory', avg_by_cat_brand)
    .filter(pl.col('Category').eq('Computers'))
    .unique()
    .sort('Brand')
    .head(10)
)

In [None]:
from datetime import date

# A window key can be an expression: here, birth year derived from the
# customer's age and the current calendar year.
curr_yr = date.today().year
cust_birth_yr = curr_yr - pl.col('Customer Age')

avg_sales = (
    pl.col('Sales Amount')
    .mean()
    .over('Category', cust_birth_yr)
    .alias('Avg Sales per Cat')
)
computers_only = pl.col('Category').eq('Computers')

(
    df
    .select('Category', 'Brand', 'Customer Age', avg_sales)
    .filter(computers_only)
    .unique()
    .sort('Customer Age')
    .head(10)
)

In [None]:
# Largest single sales amount per category, ranked from highest to lowest.
max_per_category = df.group_by('Category').agg(
    pl.col('Sales Amount').max().alias('Max Sales Amt')
)
category_rank = pl.col('Max Sales Amt').rank(descending=True).alias('Rank')
max_per_category.with_columns(category_rank).sort('Rank')

In [None]:
# Largest sales amount per (Category, Subcategory), rounded to an integer,
# then ranked within each category; shown for Audio and Computers.
max_amt = pl.col('Sales Amount').max().round().cast(pl.Int64).alias('Max Sales Amt')
rank_in_cat = (
    pl.col('Max Sales Amt')
    .rank(descending=True)
    .over('Category')
    .cast(pl.Int64)
    .alias('Rank')
)
(
    df
    .group_by('Category', 'Subcategory')
    .agg(max_amt)
    .with_columns(rank_in_cat)
    .filter(pl.col('Category').is_in(['Audio', 'Computers']))
    .sort('Category', 'Rank')
)

### There is more...

In [None]:
# Keep the ranked per-subcategory maxima (Audio and Computers only) for the
# mapping_strategy examples that follow.
max_amt = pl.col('Sales Amount').max().round().cast(pl.Int64).alias('Max Sales Amt')
rank_in_cat = (
    pl.col('Max Sales Amt')
    .rank(descending=True)
    .over('Category')
    .cast(pl.Int64)
    .alias('Rank')
)
max_sales_rank = (
    df
    .group_by('Category', 'Subcategory')
    .agg(max_amt)
    .with_columns(rank_in_cat)
    .filter(pl.col('Category').is_in(['Audio', 'Computers']))
    .sort('Category', 'Rank')
)

In [None]:
# Per category, take the three subcategories with the smallest 'Max Sales Amt'
# and attach them to the rows using the 'join' mapping strategy.
lowest_three = (
    pl.col('Subcategory')
    .sort_by('Max Sales Amt')
    .head(3)
    .over('Category', mapping_strategy='join')
    .alias('Lowest 3 Subcat per Cat')
)
max_sales_rank.with_columns(lowest_three)

In [None]:
# Subcategories sorted by 'Max Sales Amt' within each category, written back
# with the 'explode' mapping strategy.
sorted_subcat = (
    pl.col('Subcategory')
    .sort_by('Max Sales Amt')
    .over('Category', mapping_strategy='explode')
    .alias('Subcategory Sorted by Max Sales Amt Ascending')
)
max_sales_rank.with_columns(sorted_subcat)

In [None]:
# Same 'explode' window as before, but applied after re-sorting the frame
# by 'Subcategory' so the effect of row order on the result can be compared.
sorted_subcat = (
    pl.col('Subcategory')
    .sort_by('Max Sales Amt')
    .over('Category', mapping_strategy='explode')
    .alias('Subcategory Sorted by Max Sales Amt Ascending')
)
max_sales_rank.sort('Subcategory').with_columns(sorted_subcat)

## Applying UDFs

### How to do it...

In [None]:
import polars as pl

In [None]:
# Load the sales data (dates parsed where possible) and preview five rows.
df = pl.read_csv(
    source='../data/contoso_sales.csv',
    try_parse_dates=True,
)
df.head(n=5)

In [None]:
def get_first_name(full_name: str) -> str:
    """Return the text before the first space in *full_name*.

    The whole string is returned when it contains no space.
    """
    first, _, _ = full_name.partition(' ')
    return first

In [None]:
# Apply the Python helper to every customer name, one element at a time.
first_name = (
    pl.col('Customer Name')
    .map_elements(lambda full: get_first_name(full))
    .alias('Customer First Name')
)
df.select('Customer Name', first_name).head()

In [None]:
# Same result with the split written inline instead of through a named helper.
first_name = (
    pl.col('Customer Name')
    .map_elements(lambda full: full.split(' ')[0])
    .alias('Customer First Name')
)
df.select('Customer Name', first_name).head()

In [None]:
def age_to_range(age: int) -> str:
    """Map an age in years to its display bucket.

    Buckets are '~17', '18~30', '31~50', '51~70' and '71~'. Upper bounds are
    inclusive, so 30 falls in '18~30' and 31 in '31~50'.
    """
    if age < 18:
        return '~17'
    if age <= 30:
        return '18~30'
    if age <= 50:
        return '31~50'
    if age <= 70:
        # Was '50~70', which overlapped the '31~50' bucket and disagreed with
        # the range this branch actually covers (51 through 70).
        return '51~70'
    # Everything above 70; returning unconditionally also removes the
    # implicit None fall-through of the former `elif age > 70`.
    return '71~'

In [None]:
# Bucket every customer age through the Python helper, element by element.
age_range = (
    pl.col('Customer Age')
    .map_elements(lambda years: age_to_range(years))
    .alias('Age Range')
)
df.select('Customer Age', age_range).head()

### There is more...

In [None]:
# Native-expression version: split on a space and keep the first piece,
# with no Python function involved.
name_parts = pl.col('Customer Name').str.split(' ')
df.select(
    'Customer Name',
    name_parts.list.first().alias('Customer First Name'),
).head()

In [None]:
# Native-expression version of the age bucketing: a when/then chain whose
# branches are tested in order.
age = pl.col('Customer Age')
age_range = (
    pl.when(age.lt(18)).then(pl.lit('~17'))
    .when(age.le(30)).then(pl.lit('18~30'))
    .when(age.le(50)).then(pl.lit('31~50'))
    .when(age.le(70)).then(pl.lit('51~70'))
    .when(age.gt(70)).then(pl.lit('71~'))
    .alias('Age Range')
)
df.select('Customer Age', age_range).head()

In [None]:
%%timeit
# Benchmark body: first name extracted by a Python lambda via map_elements.
# Kept identical to the earlier cell so the timing reflects that exact code.
df.select(
    'Customer Name',
    pl.col('Customer Name').map_elements(lambda el: el.split(' ')[0]).alias('Customer First Name')
).head()

In [None]:
%%timeit
# Benchmark body: first name extracted with native string/list expressions.
# Kept identical to the earlier cell so the timing reflects that exact code.
df.select(
    'Customer Name',
    pl.col('Customer Name').str.split(' ').list.first().alias('Customer First Name')
).head()

## Using SQL for data transformations

In [2]:
import polars as pl
# Load the sales data (dates parsed where possible) and preview five rows.
df = pl.read_csv(
    source='../data/contoso_sales.csv',
    try_parse_dates=True,
)
df.head(n=5)

Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,date,date,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
284806,1,2017-10-18,2017-10-20,"""Eric Kennedy""","""Male""","""United States""",47,"""Online store""","""Contoso 512MB …","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",7,11.691,10.288,5.958,"""USD""",1.0
285506,1,2017-10-25,2017-10-26,"""George Tooth""","""Male""","""Australia""",30,"""Online store""","""Contoso 512MB …","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",1,11.691,11.691,5.958,"""AUD""",1.2967
311002,2,2018-07-07,2018-07-12,"""Caleb Greene""","""Male""","""Australia""",59,"""Online store""","""Contoso 512MB …","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",6,12.99,12.99,6.62,"""AUD""",1.3484
366307,2,2020-01-11,2020-01-11,"""Isaac Siddins""","""Male""","""Australia""",25,"""Contoso Store …","""Contoso 512MB …","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",4,12.99,12.99,6.62,"""AUD""",1.4545
325708,3,2018-12-01,2018-12-02,"""Mike McQueen""","""Male""","""United States""",56,"""Online store""","""Contoso 512MB …","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",2,12.99,11.5611,6.62,"""USD""",1.0


### How to do it...

In [35]:
# Eager SQL context: execute() hands back the query result directly.
ctx = pl.SQLContext(eager_execution=True)
ctx.register('df', df)  # expose the DataFrame to SQL under the name 'df'

# Backticks quote the column name that contains a space.
customer_brand_sql = """
      select
        `Customer Name`,
        Brand,
        Category
      from df limit 5
    """
ctx.execute(customer_brand_sql)


Customer Name,Brand,Category
str,str,str
"""Eric Kennedy""","""Contoso""","""Audio"""
"""George Tooth""","""Contoso""","""Audio"""
"""Caleb Greene""","""Contoso""","""Audio"""
"""Isaac Siddins""","""Contoso""","""Audio"""
"""Mike McQueen""","""Contoso""","""Audio"""


In [36]:
# Average quantity per brand, top five by that average, using the context
# registered earlier.
avg_quantity_sql = """
      select
        Brand,
        avg(Quantity) as `Avg Quantity` 
      from df
      group by 
        Brand
      order by 
        `Avg Quantity` desc
      limit 5
    """
ctx.execute(avg_quantity_sql)

Brand,Avg Quantity
str,f64
"""Fabrikam""",3.225532
"""Northwind Trad…",3.222222
"""Wide World Imp…",3.193811
"""Fabrikam """,3.192308
"""Southridge Vid…",3.189509


In [37]:
# Register a lazy frame under the name 'lf' via keyword argument; the query
# result is materialised only by the trailing collect().
lazy_ctx = pl.SQLContext(lf=df.lazy())
lazy_ctx.execute(
    """
        select 
            Brand,
            Category
        from lf
        limit 5
    """
).collect()

Brand,Category
str,str
"""Contoso""","""Audio"""
"""Contoso""","""Audio"""
"""Contoso""","""Audio"""
"""Contoso""","""Audio"""
"""Contoso""","""Audio"""
