# Aggregations, Window Functions, and UDFs 

## Simple aggregations

### How to do it...

In [1]:
import polars as pl

In [56]:
df = pl.read_csv('../data/constoso_sales.csv', try_parse_dates=True)

In [57]:
df.head()

Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,date,date,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
284806,1,2017-10-18,2017-10-20,"""Eric Kennedy""","""Male""","""United States""",47,"""Online store""","""Contoso 512MB …","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",7,11.691,10.288,5.958,"""USD""",1.0
285506,1,2017-10-25,2017-10-26,"""George Tooth""","""Male""","""Australia""",30,"""Online store""","""Contoso 512MB …","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",1,11.691,11.691,5.958,"""AUD""",1.2967
311002,2,2018-07-07,2018-07-12,"""Caleb Greene""","""Male""","""Australia""",59,"""Online store""","""Contoso 512MB …","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",6,12.99,12.99,6.62,"""AUD""",1.3484
366307,2,2020-01-11,2020-01-11,"""Isaac Siddins""","""Male""","""Australia""",25,"""Contoso Store …","""Contoso 512MB …","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",4,12.99,12.99,6.62,"""AUD""",1.4545
325708,3,2018-12-01,2018-12-02,"""Mike McQueen""","""Male""","""United States""",56,"""Online store""","""Contoso 512MB …","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",2,12.99,11.5611,6.62,"""USD""",1.0


In [58]:
from polars import selectors as cs
(
    df
    .select(cs.numeric())
    .sum()
)

Order Number,Line Number,Customer Age,Quantity,Unit Price,Net Price,Unit Cost,Exchange Rate
i64,i64,i64,i64,f64,f64,f64,f64
4466019052,16195,725757,43517,4178500.0,3928600.0,1735600.0,14124.4597


In [59]:
s = df.select('Quantity').to_series()
s.sum()

43517

In [60]:
df.select(pl.col('Quantity').sum())

Quantity
i64
43517


In [64]:
df.select(
    pl.col('Customer Name').first().alias('Cust Name First'),
    pl.col('Customer Name').last().alias('Cust Name Last')
)

Cust Name First,Cust Name Last
str,str
"""Eric Kennedy""","""Billy Ratliff"""


In [61]:
df.select(cs.numeric()).describe()

describe,Order Number,Line Number,Customer Age,Quantity,Unit Price,Net Price,Unit Cost,Exchange Rate
str,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",13915.0,13915.0,13915.0,13915.0,13915.0,13915.0,13915.0,13915.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",320949.985771,1.163852,52.15645,3.127345,300.28425,282.32739,124.731364,1.015053
"""std""",28431.79136,1.361349,19.133881,2.233597,405.538975,381.738847,147.944094,0.171927
"""min""",269500.0,0.0,19.0,1.0,0.95,0.8265,0.48,0.7015
"""25%""",295901.0,0.0,36.0,1.0,46.99,43.4,21.92,0.8965
"""50%""",319806.0,1.0,52.0,2.0,207.987,194.91,86.68,1.0
"""75%""",345106.0,2.0,68.0,4.0,361.2,336.0,160.93,1.0
"""max""",371503.0,6.0,85.0,10.0,3748.5,3748.5,1241.955,1.5373


### There is more...

In [49]:
df.select(
    (pl.col('Quantity') >= 4).sum()
)

Quantity
u32
4423


In [47]:
df.select(
    pl.col('Quantity').filter(pl.col('Store Name')=='Online store').sum()
)

Quantity
i64
25017


## Group by aggregations

### How to do it...

In [2]:
import polars as pl

In [3]:
df = pl.read_csv('../data/constoso_sales.csv', try_parse_dates=True)
df.head()

Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,date,date,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
284806,1,2017-10-18,2017-10-20,"""Eric Kennedy""","""Male""","""United States""",47,"""Online store""","""Contoso 512MB …","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",7,11.691,10.288,5.958,"""USD""",1.0
285506,1,2017-10-25,2017-10-26,"""George Tooth""","""Male""","""Australia""",30,"""Online store""","""Contoso 512MB …","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",1,11.691,11.691,5.958,"""AUD""",1.2967
311002,2,2018-07-07,2018-07-12,"""Caleb Greene""","""Male""","""Australia""",59,"""Online store""","""Contoso 512MB …","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",6,12.99,12.99,6.62,"""AUD""",1.3484
366307,2,2020-01-11,2020-01-11,"""Isaac Siddins""","""Male""","""Australia""",25,"""Contoso Store …","""Contoso 512MB …","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",4,12.99,12.99,6.62,"""AUD""",1.4545
325708,3,2018-12-01,2018-12-02,"""Mike McQueen""","""Male""","""United States""",56,"""Online store""","""Contoso 512MB …","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",2,12.99,11.5611,6.62,"""USD""",1.0


In [16]:
df.group_by('Brand')

<polars.dataframe.group_by.GroupBy at 0x105c7d510>

In [17]:
(
    df
    .group_by('Brand')
    .agg(pl.col('Quantity').sum().alias('Sum of Quantity'))
    .head()
)

Brand,Sum of Quantity
str,i64
"""Fabrikam""",1516
"""Adventure Work…",4616
"""Litware""",875
"""Northwind Trad…",638
"""Litware """,161


In [18]:
(
    df
    .group_by('Brand')
    .agg(
        pl.col('Unit Price').sum().alias('Sum of Quantity'),
        pl.col('Unit Price').mean().alias('Average Unit Price'),
    )
    .sort('Average Unit Price', descending=True)
    .head()
)

Brand,Sum of Quantity,Average Unit Price
str,f64,f64
"""Fabrikam """,82765.3,795.820192
"""Contoso """,232497.937,715.378268
"""Adventure Work…",908783.8045,620.330242
"""Litware """,33194.085,603.528818
"""Fabrikam""",262239.375,557.956117


In [19]:
(
    df
    .group_by('Brand')
    .agg(
        pl.col('Unit Price').mean().round(2).alias('Average Unit Price'),
        (pl.col('Unit Price').sum() / pl.count()).round(2).alias('Average Unit Price 2'),
        pl.col('Customer Name').first(),
        pl.col('Category').last()
    )
    .sort('Average Unit Price', descending=True)
    .sort('Brand')
    .head()
)

Brand,Average Unit Price,Average Unit Price 2,Customer Name,Category
str,f64,f64,str,str
"""A. Datum""",280.1,280.1,"""Blažena Salabo…","""Cameras and ca…"
"""Adventure Work…",620.33,620.33,"""Molly Walters""","""Home Appliance…"
"""Adventure Work…",166.9,166.9,"""James Steinfel…","""TV and Video"""
"""Contoso""",150.86,150.86,"""Eric Kennedy""","""Cell phones"""
"""Contoso """,715.38,715.38,"""Chiquita Boyd""","""Home Appliance…"


In [26]:
(
    pl.scan_csv('../data/constoso_sales.csv', try_parse_dates=True)
    .group_by('Brand')
    .agg(
        pl.col('Unit Price').mean().round(2).alias('Average Unit Price'),
        (pl.col('Unit Price').sum() / pl.count()).round(2).alias('Average Unit Price 2'),
        pl.col('Customer Name').first(),
        pl.col('Category').last()
    )
    .sort('Average Unit Price', descending=True)
    .sort('Brand')
    .collect()
    .head()
)

Brand,Average Unit Price,Average Unit Price 2,Customer Name,Category
str,f64,f64,str,str
"""A. Datum""",280.1,280.1,"""Blažena Salabo…","""Cameras and ca…"
"""Adventure Work…",620.33,620.33,"""Molly Walters""","""Home Appliance…"
"""Adventure Work…",166.9,166.9,"""James Steinfel…","""TV and Video"""
"""Contoso""",150.86,150.86,"""Eric Kennedy""","""Cell phones"""
"""Contoso """,715.38,715.38,"""Chiquita Boyd""","""Home Appliance…"


In [4]:
pl.Config.set_fmt_str_lengths = 50
print(df.select('Brand').unique().head(10))

shape: (10, 1)
┌───────────────────┐
│ Brand             │
│ ---               │
│ str               │
╞═══════════════════╡
│ Litware           │
│ Fabrikam          │
│ Contoso           │
│ Fabrikam          │
│ …                 │
│ The Phone Company │
│ Proseware         │
│ Adventure Works   │
│ Adventure Works   │
└───────────────────┘


In [5]:
df.select('Brand').unique().head(10)

Brand
str
"""Fabrikam """
"""Proseware"""
"""The Phone Comp…"
"""A. Datum"""
"""Contoso """
"""Wide World Imp…"
"""Adventure Work…"
"""Southridge Vid…"
"""Adventure Work…"
"""Tailspin Toys"""


In [6]:
import os
os.environ['POLARS_FMT_STR_LEN'] = str(50)

df.select('Brand').unique().head(10)

Brand
str
"""Contoso"""
"""Adventure Works"""
"""Tailspin Toys"""
"""Contoso """
"""Proseware"""
"""Wide World Importers"""
"""A. Datum"""
"""Northwind Traders"""
"""Adventure Works """
"""Litware """


### There is more...

In [22]:
for name, data in df.group_by('Brand'):
    print(name, type(data))

Proseware <class 'polars.dataframe.frame.DataFrame'>
Litware  <class 'polars.dataframe.frame.DataFrame'>
A. Datum <class 'polars.dataframe.frame.DataFrame'>
Tailspin Toys <class 'polars.dataframe.frame.DataFrame'>
Contoso  <class 'polars.dataframe.frame.DataFrame'>
Fabrikam <class 'polars.dataframe.frame.DataFrame'>
Contoso <class 'polars.dataframe.frame.DataFrame'>
Adventure Works  <class 'polars.dataframe.frame.DataFrame'>
Adventure Works <class 'polars.dataframe.frame.DataFrame'>
The Phone Company <class 'polars.dataframe.frame.DataFrame'>
Northwind Traders <class 'polars.dataframe.frame.DataFrame'>
Wide World Importers <class 'polars.dataframe.frame.DataFrame'>
Litware <class 'polars.dataframe.frame.DataFrame'>
Fabrikam   <class 'polars.dataframe.frame.DataFrame'>
Southridge Video <class 'polars.dataframe.frame.DataFrame'>


In [24]:
(
    df
    .group_by('Brand')
    .agg(pl.col('Quantity'))
    .head()
)

Brand,Quantity
str,list[i64]
"""Litware ""","[2, 1, … 3]"
"""Adventure Work…","[5, 7, … 2]"
"""Contoso""","[7, 1, … 3]"
"""Fabrikam""","[2, 1, … 2]"
"""Adventure Work…","[2, 3, … 6]"


In [27]:
(
    df
    .group_by('Brand', maintain_order=True)
    .agg(pl.col('Quantity'))
    .head()
)

Brand,Quantity
str,list[i64]
"""Contoso""","[7, 1, … 3]"
"""Wide World Imp…","[2, 8, … 2]"
"""Northwind Trad…","[2, 3, … 2]"
"""Adventure Work…","[2, 3, … 6]"
"""Adventure Work…","[5, 7, … 2]"


In [33]:
(
    df
    .group_by(
        pl.col('Brand'), 
        'Customer Country',
        pl.col('Order Date').dt.year().alias('Order Year')
        )
    .agg(pl.col('Unit Price').mean())
    .head()
)

Brand,Customer Country,Order Year,Unit Price
str,str,i32,f64
"""Contoso""","""United Kingdom…",2018,149.204372
"""Contoso""","""Germany""",2018,156.208612
"""Contoso""","""Italy""",2017,217.8535
"""Wide World Imp…","""Netherlands""",2018,631.144348
"""Wide World Imp…","""United States""",2018,513.037234


## Aggregating values across multiple columns

### How to do it...

In [95]:
import polars as pl

In [96]:
df = pl.read_csv('../data/pokemon.csv')
df.head()

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False


In [97]:
(
    df
    .select('HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed')
    .sum(axis=1).alias('Total 2')
    .head(5)
)

Total 2
i64
318
405
525
625
309


In [98]:
(
    df
    .with_columns(
        pl.sum_horizontal('HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed').alias('Total 2')
    )
    .head(5)
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total 2
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool,i64
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False,318
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False,405
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False,525
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False,625
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False,309


In [99]:
cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
(
    df
    .with_columns(
        pl.reduce(
            function=lambda acc, col: acc + col, 
            exprs=pl.col(cols)
        )
        .alias('Total 2')
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total 2
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool,i64
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False,318
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False,405
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False,525
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False,625
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False,309


In [100]:
(
    df
    .with_columns(
        pl.fold(
            acc=pl.lit(100), 
            function=lambda acc, col: acc + col, 
            exprs=pl.col(cols)
        )
        .alias('Total 2')
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total 2
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool,i64
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False,418
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False,505
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False,625
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False,725
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False,409


### There is more...

In [107]:
(
    df
    .filter(
        pl.fold(
            acc=pl.lit(True), 
            function=lambda acc, col: acc & col, 
            exprs=pl.col(cols) > 80
        )
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
144,"""Articuno""","""Ice""","""Flying""",580,90,85,100,95,125,85,1,True
145,"""Zapdos""","""Electric""","""Flying""",580,90,90,85,125,90,100,1,True
146,"""Moltres""","""Fire""","""Flying""",580,90,100,90,125,85,90,1,True
150,"""Mewtwo""","""Psychic""",,680,106,110,90,154,90,130,1,True


In [119]:
(
    df
    .filter(
        pl.all_horizontal(pl.col(cols) > 80)
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
144,"""Articuno""","""Ice""","""Flying""",580,90,85,100,95,125,85,1,True
145,"""Zapdos""","""Electric""","""Flying""",580,90,90,85,125,90,100,1,True
146,"""Moltres""","""Fire""","""Flying""",580,90,100,90,125,85,90,1,True
150,"""Mewtwo""","""Psychic""",,680,106,110,90,154,90,130,1,True


In [141]:
str_cols = ['Name', 'Type 1', 'Type 2']
str_combined = pl.fold(acc=pl.lit(''), function=lambda acc, col: acc + col, exprs=str_cols).alias('Str Combined')
str_cols.append(str_combined)
df.select(str_cols).head()

Name,Type 1,Type 2,Str Combined
str,str,str,str
"""Bulbasaur""","""Grass""","""Poison""","""BulbasaurGrassPoison"""
"""Ivysaur""","""Grass""","""Poison""","""IvysaurGrassPoison"""
"""Venusaur""","""Grass""","""Poison""","""VenusaurGrassPoison"""
"""VenusaurMega Venusaur""","""Grass""","""Poison""","""VenusaurMega VenusaurGrassPoison"""
"""Charmander""","""Fire""",,


In [142]:
str_cols = ['Name', 'Type 1', 'Type 2']
df.select(pl.concat_str(str_cols)).head()

Name
str
"""BulbasaurGrassPoison"""
"""IvysaurGrassPoison"""
"""VenusaurGrassPoison"""
"""VenusaurMega VenusaurGrassPoison"""
""
