# Chapter 4: Data Transformation Techniques

## Simple aggregations

### How to do it...

In [1]:
import polars as pl

In [2]:
df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)

In [3]:
df.head()

Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,date,date,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
284806,1,2017-10-18,2017-10-20,"""Eric Kennedy""","""Male""","""United States""",47,"""Online store""","""Contoso 512MB MP3 Player E51 S…","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",7,11.691,10.288,5.958,"""USD""",1.0
285506,1,2017-10-25,2017-10-26,"""George Tooth""","""Male""","""Australia""",30,"""Online store""","""Contoso 512MB MP3 Player E51 S…","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",1,11.691,11.691,5.958,"""AUD""",1.2967
311002,2,2018-07-07,2018-07-12,"""Caleb Greene""","""Male""","""Australia""",59,"""Online store""","""Contoso 512MB MP3 Player E51 S…","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",6,12.99,12.99,6.62,"""AUD""",1.3484
366307,2,2020-01-11,2020-01-11,"""Isaac Siddins""","""Male""","""Australia""",25,"""Contoso Store Western Australi…","""Contoso 512MB MP3 Player E51 B…","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",4,12.99,12.99,6.62,"""AUD""",1.4545
325708,3,2018-12-01,2018-12-02,"""Mike McQueen""","""Male""","""United States""",56,"""Online store""","""Contoso 512MB MP3 Player E51 B…","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",2,12.99,11.5611,6.62,"""USD""",1.0


In [4]:
from polars import selectors as cs
(
    df
    .select(cs.numeric())
    .sum()
)

Order Number,Line Number,Customer Age,Quantity,Unit Price,Net Price,Unit Cost,Exchange Rate
i64,i64,i64,i64,f64,f64,f64,f64
4466019052,16195,725757,43517,4178500.0,3928600.0,1735600.0,14124.4597


In [5]:
s = df.select('Quantity').to_series()
"""
Selects the 'Quantity' column from the DataFrame, converts it to a series, and calculates the sum of the series.

Returns:
    int or float: The sum of the 'Quantity' column.
"""
s.sum()

43517

In [7]:
df.select(pl.col('Quantity').sum())
"""
Selects the 'Quantity' column from the DataFrame and computes the sum of its values.

Returns:
    DataFrame: A DataFrame containing the sum of the 'Quantity' column.
"""

"\nSelects the 'Quantity' column from the DataFrame and computes the sum of its values.\n\nReturns:\n    DataFrame: A DataFrame containing the sum of the 'Quantity' column.\n"

In [8]:
df.select(
    pl.col('Customer Name').first().alias('Cust Name First'),
    pl.col('Customer Name').last().alias('Cust Name Last')
)

Cust Name First,Cust Name Last
str,str
"""Eric Kennedy""","""Billy Ratliff"""


In [9]:
df.select(cs.numeric()).describe()

statistic,Order Number,Line Number,Customer Age,Quantity,Unit Price,Net Price,Unit Cost,Exchange Rate
str,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",13915.0,13915.0,13915.0,13915.0,13915.0,13915.0,13915.0,13915.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",320949.985771,1.163852,52.15645,3.127345,300.28425,282.32739,124.731364,1.015053
"""std""",28431.79136,1.361349,19.133881,2.233597,405.538975,381.738847,147.944094,0.171927
"""min""",269500.0,0.0,19.0,1.0,0.95,0.8265,0.48,0.7015
"""25%""",295902.0,0.0,36.0,1.0,46.99,43.4,21.92,0.8965
"""50%""",319806.0,1.0,52.0,2.0,207.987,194.91,86.68,1.0
"""75%""",345106.0,2.0,68.0,4.0,361.2,336.0,160.93,1.0
"""max""",371503.0,6.0,85.0,10.0,3748.5,3748.5,1241.955,1.5373


### There is more...

In [10]:
df.select(
    (pl.col('Quantity') >= 4).sum()
)

Quantity
u32
4423


In [11]:
df.select(
    pl.col('Quantity').filter(pl.col('Store Name')=='Online store').sum()
)

Quantity
i64
25017


## Using group by aggregations

### How to do it...

In [12]:
import polars as pl

In [13]:
df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)
df.head()

Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,date,date,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
284806,1,2017-10-18,2017-10-20,"""Eric Kennedy""","""Male""","""United States""",47,"""Online store""","""Contoso 512MB MP3 Player E51 S…","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",7,11.691,10.288,5.958,"""USD""",1.0
285506,1,2017-10-25,2017-10-26,"""George Tooth""","""Male""","""Australia""",30,"""Online store""","""Contoso 512MB MP3 Player E51 S…","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",1,11.691,11.691,5.958,"""AUD""",1.2967
311002,2,2018-07-07,2018-07-12,"""Caleb Greene""","""Male""","""Australia""",59,"""Online store""","""Contoso 512MB MP3 Player E51 S…","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",6,12.99,12.99,6.62,"""AUD""",1.3484
366307,2,2020-01-11,2020-01-11,"""Isaac Siddins""","""Male""","""Australia""",25,"""Contoso Store Western Australi…","""Contoso 512MB MP3 Player E51 B…","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",4,12.99,12.99,6.62,"""AUD""",1.4545
325708,3,2018-12-01,2018-12-02,"""Mike McQueen""","""Male""","""United States""",56,"""Online store""","""Contoso 512MB MP3 Player E51 B…","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",2,12.99,11.5611,6.62,"""USD""",1.0


In [14]:
df.group_by('Brand')

<polars.dataframe.group_by.GroupBy at 0x72eef6efb2f0>

In [15]:
(
    df
    .group_by('Brand')
    .agg(pl.col('Quantity').sum().alias('Sum of Quantity'))
    .head()
)

Brand,Sum of Quantity
str,i64
"""A. Datum""",555
"""Southridge Video""",6749
"""Litware """,161
"""Fabrikam """,332
"""Adventure Works""",4616


In [16]:
(
    df
    .group_by('Brand')
    .agg(
        pl.col('Unit Price').sum().alias('Sum of Quantity'),
        pl.col('Unit Price').mean().alias('Average Unit Price'),
    )
    .sort('Average Unit Price', descending=True)
    .head()
)

Brand,Sum of Quantity,Average Unit Price
str,f64,f64
"""Fabrikam """,82765.3,795.820192
"""Contoso """,232497.937,715.378268
"""Adventure Works""",908783.8045,620.330242
"""Litware """,33194.085,603.528818
"""Fabrikam""",262239.375,557.956117


In [17]:
(
    df
    .group_by('Brand')
    .agg(
        pl.col('Unit Price').mean().round(2).alias('Average Unit Price'),
        (pl.col('Unit Price').sum() / pl.len()).round(2).alias('Average Unit Price 2'),
        pl.col('Customer Name').first(),
        pl.col('Category').last()
    )
    .sort('Average Unit Price', descending=True)
    .sort('Brand')
    .head()
)

Brand,Average Unit Price,Average Unit Price 2,Customer Name,Category
str,f64,f64,str,str
"""A. Datum""",280.1,280.1,"""Blažena Salabová""","""Cameras and camcorders """
"""Adventure Works""",620.33,620.33,"""Molly Walters""","""Home Appliances"""
"""Adventure Works """,166.9,166.9,"""James Steinfeld""","""TV and Video"""
"""Contoso""",150.86,150.86,"""Eric Kennedy""","""Cell phones"""
"""Contoso """,715.38,715.38,"""Chiquita Boyd""","""Home Appliances"""


In [18]:
(
    pl.scan_csv('../data/contoso_sales.csv', try_parse_dates=True)
    .group_by('Brand')
    .agg(
        pl.col('Unit Price').mean().round(2).alias('Average Unit Price'),
        (pl.col('Unit Price').sum() / pl.len()).round(2).alias('Average Unit Price 2'),
        pl.col('Customer Name').first(),
        pl.col('Category').last()
    )
    .sort('Average Unit Price', descending=True)
    .sort('Brand')
    .collect()
    .head()
)

Brand,Average Unit Price,Average Unit Price 2,Customer Name,Category
str,f64,f64,str,str
"""A. Datum""",280.1,280.1,"""Blažena Salabová""","""Cameras and camcorders """
"""Adventure Works""",620.33,620.33,"""Molly Walters""","""Home Appliances"""
"""Adventure Works """,166.9,166.9,"""James Steinfeld""","""TV and Video"""
"""Contoso""",150.86,150.86,"""Eric Kennedy""","""Cell phones"""
"""Contoso """,715.38,715.38,"""Chiquita Boyd""","""Home Appliances"""


In [19]:
pl.Config.set_fmt_str_lengths = 50
print(df.select('Brand').unique().head(10))

shape: (10, 1)
┌──────────────────────┐
│ Brand                │
│ ---                  │
│ str                  │
╞══════════════════════╡
│ Contoso              │
│ The Phone Company    │
│ Adventure Works      │
│ Wide World Importers │
│ Fabrikam             │
│ Fabrikam             │
│ Litware              │
│ Southridge Video     │
│ Tailspin Toys        │
│ Adventure Works      │
└──────────────────────┘


In [20]:
df.select('Brand').unique().head(10)

Brand
str
"""Adventure Works"""
"""Fabrikam"""
"""Contoso"""
"""Wide World Importers"""
"""Adventure Works """
"""A. Datum"""
"""Tailspin Toys"""
"""Contoso """
"""Northwind Traders"""
"""Fabrikam """


In [21]:
import os
os.environ['POLARS_FMT_STR_LEN'] = str(50)

df.select('Brand').unique().head(10)

Brand
str
"""Tailspin Toys"""
"""Contoso """
"""The Phone Company"""
"""A. Datum"""
"""Proseware"""
"""Northwind Traders"""
"""Adventure Works"""
"""Fabrikam """
"""Wide World Importers"""
"""Litware """


### There is more...

In [22]:
for name, data in df.group_by(['Brand']):
    print(name[0], type(data))

Wide World Importers <class 'polars.dataframe.frame.DataFrame'>
Contoso <class 'polars.dataframe.frame.DataFrame'>
A. Datum <class 'polars.dataframe.frame.DataFrame'>
Fabrikam <class 'polars.dataframe.frame.DataFrame'>
Tailspin Toys <class 'polars.dataframe.frame.DataFrame'>
Litware  <class 'polars.dataframe.frame.DataFrame'>
The Phone Company <class 'polars.dataframe.frame.DataFrame'>
Proseware <class 'polars.dataframe.frame.DataFrame'>
Litware <class 'polars.dataframe.frame.DataFrame'>
Contoso  <class 'polars.dataframe.frame.DataFrame'>
Adventure Works <class 'polars.dataframe.frame.DataFrame'>
Northwind Traders <class 'polars.dataframe.frame.DataFrame'>
Adventure Works  <class 'polars.dataframe.frame.DataFrame'>
Fabrikam   <class 'polars.dataframe.frame.DataFrame'>
Southridge Video <class 'polars.dataframe.frame.DataFrame'>


In [23]:
(
    df
    .group_by('Brand')
    .agg(pl.col('Quantity'))
    .head()
)

Brand,Quantity
str,list[i64]
"""Northwind Traders""","[2, 3, … 2]"
"""Litware""","[1, 1, … 2]"
"""A. Datum""","[2, 1, … 2]"
"""Proseware""","[4, 9, … 5]"
"""Fabrikam ""","[2, 4, … 3]"


In [24]:
(
    df
    .group_by('Brand', maintain_order=True)
    .agg(pl.col('Quantity'))
    .head()
)

Brand,Quantity
str,list[i64]
"""Contoso""","[7, 1, … 3]"
"""Wide World Importers""","[2, 8, … 2]"
"""Northwind Traders""","[2, 3, … 2]"
"""Adventure Works ""","[2, 3, … 6]"
"""Adventure Works""","[5, 7, … 2]"


In [27]:
# Groups the DataFrame by 'Brand', 'Customer Country', and the year of 'Order Date', 
# then calculates the mean of 'Unit Price' for each group and returns the first few rows.
# Returns:
#     DataFrame: A DataFrame with the grouped and aggregated data, showing the first few rows.
(
    df
    .group_by(
        pl.col('Brand'), 
        'Customer Country',
        pl.col('Order Date').dt.year().alias('Order Year')
    )
    .agg(pl.col('Unit Price').mean())
    .head()
)

Brand,Customer Country,Order Year,Unit Price
str,str,i32,f64
"""The Phone Company""","""Australia""",2019,288.54717
"""Adventure Works""","""Italy""",2020,1236.226
"""Litware""","""Australia""",2017,26.181
"""Fabrikam ""","""United States""",2020,443.285714
"""Tailspin Toys""","""Italy""",2019,17.495


## Aggregating values across multiple columns

### How to do it...

In [6]:
import polars as pl

In [28]:
df = pl.read_csv('../data/pokemon.csv')
df.head()

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False


In [31]:
"""
This code snippet selects specific columns from a DataFrame and calculates the horizontal sum of these columns.
The selected columns are 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', and 'Speed'.
The horizontal sum is then aliased as 'Total 2', and the first few rows of the resulting DataFrame are displayed using the head() method.
"""
(
    df
    .select('HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed')
    .with_columns(
        pl.sum_horizontal('HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed').alias('Total 2')
    )
    .head()
)

HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total 2
i64,i64,i64,i64,i64,i64,i64
45,49,49,65,65,45,318
60,62,63,80,80,60,405
80,82,83,100,100,80,525
80,100,123,122,120,80,625
39,52,43,60,50,65,309


In [32]:
(
    df
    .with_columns(
        pl.sum_horizontal('HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed').alias('Total 2')
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total 2
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool,i64
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False,318
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False,405
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False,525
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False,625
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False,309


In [33]:
(
    df
    .with_columns(
        pl.concat_list('HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed').list.sum().alias('Total 2')
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total 2
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool,i64
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False,318
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False,405
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False,525
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False,625
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False,309


In [35]:
cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
"""
This code snippet performs the following operations on a DataFrame `df`:

1. Defines a list of column names `cols` which includes 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', and 'Speed'.
2. Uses the `with_columns` method to add a new column to the DataFrame.
3. The new column, named 'Total 2', is created by summing the values of the columns specified in `cols` using the `pl.reduce` function.
4. The `head()` method is called to display the first few rows of the modified DataFrame.

Returns:
    A DataFrame with an additional column 'Total 2' that contains the sum of the specified columns for each row.
"""
(
    df
    .with_columns(
        pl.reduce(
            function=lambda acc, col: acc + col, 
            exprs=pl.col(cols)
        )
        .alias('Total 2')
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total 2
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool,i64
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False,318
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False,405
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False,525
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False,625
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False,309


The provided code snippet demonstrates how to manipulate a DataFrame 

df

 using the `polars` library in Python. The primary goal of this snippet is to add a new column to the DataFrame that represents the sum of several existing columns.

First, a list of column names 

cols

 is defined, which includes 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', and 'Speed'. These columns are presumably part of the DataFrame 

df

 and represent various attributes.

The 

with_columns

 method is then used to add a new column to the DataFrame. This method allows for the modification of the DataFrame by adding or transforming columns. Within this method, the 

pl.reduce

 function is employed to sum the values of the specified columns. The 

pl.reduce

 function takes a lambda function as its 

function

 argument, which defines how the columns should be combined. In this case, the lambda function adds the values of the columns (

acc + col

). The 

exprs

 argument specifies the columns to be summed, which are provided by 

pl.col(cols)

.

The result of the 

pl.reduce

 function is then given an alias 'Total 2', which becomes the name of the new column. This new column contains the sum of the values from the specified columns for each row in the DataFrame.

Finally, the 

head()

 method is called to display the first few rows of the modified DataFrame. This allows for a quick inspection of the changes made to the DataFrame, specifically the addition of the 'Total 2' column.

In summary, this code snippet adds a new column to the DataFrame 

df

 that contains the sum of the values from the 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', and 'Speed' columns for each row, and then displays the first few rows of the updated DataFrame.

In [38]:
# This code snippet performs the following operations on a DataFrame `df`:
#
# 1. Adds a new column named 'Total 2' to the DataFrame.
# 2. The new column is created by folding (reducing) the specified columns (`cols`) with an initial value of 100.
# 3. The folding operation sums the initial value (100) with each value in the specified columns.
# 4. The resulting DataFrame with the new column is then limited to the first few rows using the `head()` method.
#
# Parameters:
# - df: The input DataFrame.
# - cols: A list of column names to be included in the fold operation.
#
# Returns:
# - A DataFrame with the new 'Total 2' column and limited to the first few rows.

(
    df
    .with_columns(
        pl.fold(
            acc=pl.lit(100), 
            function=lambda acc, col: acc + col, 
            exprs=pl.col(cols)
        )
        .alias('Total 2')
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total 2
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool,i64
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False,418
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False,505
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False,625
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False,725
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False,409


This code snippet demonstrates how to add a new column to a DataFrame 

df

 using the `polars` library in Python. The new column, named 'Total 2', is created by performing a fold (reduce) operation on a specified list of columns (

cols

), starting with an initial value of 100.

The process begins by defining the DataFrame 

df

 and the list of column names 

cols

 that will be included in the fold operation. The 

with_columns

 method is used to add or transform columns in the DataFrame. Within this method, the 

pl.fold

 function is employed to perform the fold operation.

The 

pl.fold

 function takes three arguments:
1. 

acc=pl.lit(100)

: This sets the initial value of the accumulator to 100.
2. 

function=lambda acc, col: acc + col

: This lambda function defines how the fold operation should combine the columns. In this case, it adds the values of the columns to the accumulator.
3. 

exprs=pl.col(cols)

: This specifies the columns to be included in the fold operation, provided by 

pl.col(cols)

.

The result of the 

pl.fold

 function is then given an alias 'Total 2', which becomes the name of the new column. This new column contains the sum of the initial value (100) and the values from the specified columns for each row in the DataFrame.

Finally, the 

head()

 method is called to display the first few rows of the modified DataFrame. This allows for a quick inspection of the changes made to the DataFrame, specifically the addition of the 'Total 2' column.

In summary, this code snippet adds a new column to the DataFrame 

df

 that contains the sum of an initial value (100) and the values from the specified columns for each row, and then displays the first few rows of the updated DataFrame.

### There is more...

In [41]:
"""
Filters the DataFrame `df` to include only rows where all specified columns have values greater than 80,
and then returns the first few rows of the filtered DataFrame.

The filtering is done using the `pl.fold` function, which iteratively applies a logical AND operation
to check if all columns specified in `cols` have values greater than 80.

Returns:
    DataFrame: The first few rows of the filtered DataFrame.
"""
(
    df
    .filter(
        pl.fold(
            acc=pl.lit(True), 
            function=lambda acc, col: acc & col, 
            exprs=pl.col(cols) > 80
        )
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
144,"""Articuno""","""Ice""","""Flying""",580,90,85,100,95,125,85,1,True
145,"""Zapdos""","""Electric""","""Flying""",580,90,90,85,125,90,100,1,True
146,"""Moltres""","""Fire""","""Flying""",580,90,100,90,125,85,90,1,True
150,"""Mewtwo""","""Psychic""",,680,106,110,90,154,90,130,1,True


In [42]:
(
    df
    .filter(
        pl.all_horizontal(pl.col(cols) > 80)
    )
    .head()
)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
144,"""Articuno""","""Ice""","""Flying""",580,90,85,100,95,125,85,1,True
145,"""Zapdos""","""Electric""","""Flying""",580,90,90,85,125,90,100,1,True
146,"""Moltres""","""Fire""","""Flying""",580,90,100,90,125,85,90,1,True
150,"""Mewtwo""","""Psychic""",,680,106,110,90,154,90,130,1,True


In [44]:
str_cols = ['Name', 'Type 1', 'Type 2']
"""
This code snippet performs the following operations:
1. Defines a list of column names `str_cols` that contains string columns: 'Name', 'Type 1', and 'Type 2'.
2. Uses the `pl.fold` function to concatenate the values of the columns in `str_cols` into a single string column named 'Str Combined'.
    - `acc=pl.lit('')`: Initializes the accumulator with an empty string.
    - `function=lambda acc, col: acc + col`: Defines a lambda function to concatenate the accumulator with the current column value.
    - `exprs=str_cols`: Specifies the columns to be concatenated.
    - `.alias('Str Combined')`: Assigns the alias 'Str Combined' to the resulting concatenated column.
3. Appends the new 'Str Combined' column to the `str_cols` list.
4. Selects the columns in `str_cols` from the DataFrame `df` and displays the first few rows using the `head()` method.
"""
str_combined = pl.fold(acc=pl.lit(''), function=lambda acc, col: acc + col, exprs=str_cols).alias('Str Combined')
str_cols.append(str_combined)
df.select(str_cols).head()

Name,Type 1,Type 2,Str Combined
str,str,str,str
"""Bulbasaur""","""Grass""","""Poison""","""BulbasaurGrassPoison"""
"""Ivysaur""","""Grass""","""Poison""","""IvysaurGrassPoison"""
"""Venusaur""","""Grass""","""Poison""","""VenusaurGrassPoison"""
"""VenusaurMega Venusaur""","""Grass""","""Poison""","""VenusaurMega VenusaurGrassPoison"""
"""Charmander""","""Fire""",,


In [45]:
str_cols = ['Name', 'Type 1', 'Type 2']
df.select(pl.concat_str(str_cols)).head()

Name
str
"""BulbasaurGrassPoison"""
"""IvysaurGrassPoison"""
"""VenusaurGrassPoison"""
"""VenusaurMega VenusaurGrassPoison"""
""


## Computing over groups with window functions

### How to do it...

In [46]:
import polars as pl

In [47]:
import os 
os.environ['POLARS_FMT_STR_LEN'] = str(50) 

In [48]:
df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)
df = df.with_columns(
    (pl.col('Quantity') * pl.col('Net Price')).round(2).alias('Sales Amount')
)
df.head()

Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate,Sales Amount
i64,i64,date,date,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64,f64
284806,1,2017-10-18,2017-10-20,"""Eric Kennedy""","""Male""","""United States""",47,"""Online store""","""Contoso 512MB MP3 Player E51 Silver""","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",7,11.691,10.288,5.958,"""USD""",1.0,72.02
285506,1,2017-10-25,2017-10-26,"""George Tooth""","""Male""","""Australia""",30,"""Online store""","""Contoso 512MB MP3 Player E51 Silver""","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",1,11.691,11.691,5.958,"""AUD""",1.2967,11.69
311002,2,2018-07-07,2018-07-12,"""Caleb Greene""","""Male""","""Australia""",59,"""Online store""","""Contoso 512MB MP3 Player E51 Silver""","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",6,12.99,12.99,6.62,"""AUD""",1.3484,77.94
366307,2,2020-01-11,2020-01-11,"""Isaac Siddins""","""Male""","""Australia""",25,"""Contoso Store Western Australia""","""Contoso 512MB MP3 Player E51 Blue""","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",4,12.99,12.99,6.62,"""AUD""",1.4545,51.96
325708,3,2018-12-01,2018-12-02,"""Mike McQueen""","""Male""","""United States""",56,"""Online store""","""Contoso 512MB MP3 Player E51 Blue""","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",2,12.99,11.5611,6.62,"""USD""",1.0,23.12


In [49]:
sales_by_cat = df.select(
    'Category',
    'Subcategory',
    pl.col('Sales Amount').sum().over('Category').alias('Sales Amt per Cat')
)
sales_by_cat.head()    

Category,Subcategory,Sales Amt per Cat
str,str,f64
"""Audio""","""MP4&MP3""",238356.0
"""Audio""","""MP4&MP3""",238356.0
"""Audio""","""MP4&MP3""",238356.0
"""Audio""","""MP4&MP3""",238356.0
"""Audio""","""MP4&MP3""",238356.0


In [50]:
sales_by_cat.filter(pl.col('Category')=='Audio').unique().head()

Category,Subcategory,Sales Amt per Cat
str,str,f64
"""Audio""","""MP4&MP3""",238356.0
"""Audio""","""Bluetooth Headphones""",238356.0
"""Audio""","""Recording Pen""",238356.0


In [51]:
df.shape, sales_by_cat.shape 

((13915, 21), (13915, 3))

In [52]:
(
    df
    .select(
        'Category',
        'Brand',
        'Subcategory',
        pl.col('Sales Amount').mean().over('Category', 'Brand').alias('Avg Sales per Cat and Brand')
    )
    .filter(
        (pl.col('Category')=='Computers') 
    )
    .unique()
    .sort('Brand')
    .head(10)
)

Category,Brand,Subcategory,Avg Sales per Cat and Brand
str,str,str,f64
"""Computers""","""Adventure Works""","""Laptops""",1797.371846
"""Computers""","""Adventure Works""","""Desktops""",1797.371846
"""Computers""","""Adventure Works""","""Monitors""",1797.371846
"""Computers""","""Contoso""","""Projectors & Screens""",689.986652
"""Computers""","""Contoso""","""Computers Accessories""",689.986652
"""Computers""","""Fabrikam""","""Laptops""",1982.066063
"""Computers""","""Proseware""","""Monitors""",1095.305012
"""Computers""","""Proseware""","""Laptops""",1095.305012
"""Computers""","""Proseware""","""Projectors & Screens""",1095.305012
"""Computers""","""Proseware""","""Printers, Scanners & Fax""",1095.305012


In [53]:
from datetime import date

curr_yr = date.today().year
cust_birth_yr = curr_yr - pl.col('Customer Age')

(
    df
    .select(
        'Category',
        'Brand',
        'Customer Age',
        pl.col('Sales Amount').mean().over('Category', cust_birth_yr).alias('Avg Sales per Cat') 
    )
    .filter(pl.col('Category')=='Computers')
    .unique()
    .sort('Customer Age')
    .head(10)
)

Category,Brand,Customer Age,Avg Sales per Cat
str,str,i64,f64
"""Computers""","""Adventure Works""",19,1665.993509
"""Computers""","""Fabrikam""",19,1665.993509
"""Computers""","""Contoso""",19,1665.993509
"""Computers""","""Proseware""",19,1665.993509
"""Computers""","""Southridge Video""",19,1665.993509
"""Computers""","""Wide World Importers""",19,1665.993509
"""Computers""","""Fabrikam""",20,2094.541563
"""Computers""","""Proseware""",20,2094.541563
"""Computers""","""Southridge Video""",20,2094.541563
"""Computers""","""Contoso""",20,2094.541563


In [54]:
(
    df
    .group_by('Category')
    .agg(pl.col('Sales Amount').max().alias('Max Sales Amt'))
    .with_columns(
        pl.col('Max Sales Amt').rank(descending=True).alias('Rank')
    )
    .sort('Rank')
)

Category,Max Sales Amt,Rank
str,f64,f64
"""TV and Video""",28999.9,1.0
"""Home Appliances""",28479.91,2.0
"""Computers""",19992.0,3.0
"""Cameras and camcorders """,10810.8,4.0
"""Cell phones""",5183.2,5.0
"""Music, Movies and Audio Books""",3041.88,6.0
"""Audio""",2871.2,7.0
"""Games and Toys""",2813.16,8.0


In [55]:
(
    df
    .group_by('Category', 'Subcategory')
    .agg(pl.col('Sales Amount').max().round().cast(pl.Int64).alias('Max Sales Amt'))
    .with_columns(
        pl.col('Max Sales Amt').rank(descending=True).over('Category').cast(pl.Int64).alias('Rank')
    )
    .filter(pl.col('Category').is_in(['Audio', 'Computers']))
    .sort(['Category', 'Rank'])
)

Category,Subcategory,Max Sales Amt,Rank
str,str,i64,i64
"""Audio""","""Recording Pen""",2871,1
"""Audio""","""Bluetooth Headphones""",2250,2
"""Audio""","""MP4&MP3""",2095,3
"""Computers""","""Projectors & Screens""",19992,1
"""Computers""","""Laptops""",19485,2
"""Computers""","""Desktops""",14535,3
"""Computers""","""Monitors""",11425,4
"""Computers""","""Printers, Scanners & Fax""",2508,5
"""Computers""","""Computers Accessories""",2424,6


### There is more...

In [57]:
max_sales_rank = (
    df
    .group_by('Category', 'Subcategory')
    .agg(pl.col('Sales Amount').max().round().cast(pl.Int64).alias('Max Sales Amt'))
    .with_columns(
        pl.col('Max Sales Amt').rank(descending=True).over('Category').cast(pl.Int64).alias('Rank')
    )
    .filter(pl.col('Category').is_in(['Audio', 'Computers']))
    .sort(['Category', 'Rank'])
)

In [61]:
"""
This code snippet performs the following operations on the `max_sales_rank` DataFrame:

1. Adds a new column to the DataFrame using the `with_columns` method.
2. Selects the 'Subcategory' column.
3. Sorts the 'Subcategory' column by the 'Max Sales Amt' column.
4. Selects the top 3 rows of the sorted 'Subcategory' column.
5. Applies the operation over the 'Category' column using the 'join' mapping strategy.
6. Renames the resulting column to 'Lowest 3 Subcat per Cat'.

The resulting DataFrame will have a new column that contains the lowest 3 subcategories per category based on the maximum sales amount.
"""
max_sales_rank.with_columns(
    pl.col('Subcategory')
    .sort_by('Max Sales Amt')
    .head(3)
    .over('Category', mapping_strategy='join')
    .alias('Lowest 3 Subcat per Cat')
)

Category,Subcategory,Max Sales Amt,Rank,Lowest 3 Subcat per Cat
str,str,i64,i64,list[str]
"""Audio""","""Recording Pen""",2871,1,"[""MP4&MP3"", ""Bluetooth Headphones"", ""Recording Pen""]"
"""Audio""","""Bluetooth Headphones""",2250,2,"[""MP4&MP3"", ""Bluetooth Headphones"", ""Recording Pen""]"
"""Audio""","""MP4&MP3""",2095,3,"[""MP4&MP3"", ""Bluetooth Headphones"", ""Recording Pen""]"
"""Computers""","""Projectors & Screens""",19992,1,"[""Computers Accessories"", ""Printers, Scanners & Fax"", ""Monitors""]"
"""Computers""","""Laptops""",19485,2,"[""Computers Accessories"", ""Printers, Scanners & Fax"", ""Monitors""]"
"""Computers""","""Desktops""",14535,3,"[""Computers Accessories"", ""Printers, Scanners & Fax"", ""Monitors""]"
"""Computers""","""Monitors""",11425,4,"[""Computers Accessories"", ""Printers, Scanners & Fax"", ""Monitors""]"
"""Computers""","""Printers, Scanners & Fax""",2508,5,"[""Computers Accessories"", ""Printers, Scanners & Fax"", ""Monitors""]"
"""Computers""","""Computers Accessories""",2424,6,"[""Computers Accessories"", ""Printers, Scanners & Fax"", ""Monitors""]"


This code snippet demonstrates how to manipulate a DataFrame 

max_sales_rank

 using the `polars` library in Python. The primary goal is to add a new column that contains the lowest 3 subcategories per category based on the maximum sales amount.

The process begins by using the 

with_columns

 method, which allows for the addition or transformation of columns in the DataFrame. Within this method, the 

pl.col('Subcategory')

 function is used to select the 'Subcategory' column from the DataFrame.

Next, the selected 'Subcategory' column is sorted by the 'Max Sales Amt' column using the 

sort_by('Max Sales Amt')

 method. This ensures that the subcategories are ordered based on their maximum sales amounts.

After sorting, the 

head(3)

 method is called to select the top 3 rows of the sorted 'Subcategory' column. This step isolates the three subcategories with the lowest maximum sales amounts.

The 

over('Category', mapping_strategy='join')

 method is then applied to perform the operation over the 'Category' column. The 'join' mapping strategy ensures that the operation is applied within each category, effectively grouping the subcategories by their respective categories.

Finally, the resulting column is renamed to 'Lowest 3 Subcat per Cat' using the 

alias('Lowest 3 Subcat per Cat')

 method. This new column is added to the DataFrame, containing the lowest 3 subcategories per category based on the maximum sales amount.

In summary, this code snippet adds a new column to the 

max_sales_rank

 DataFrame that lists the lowest 3 subcategories for each category, sorted by their maximum sales amounts. The new column is named 'Lowest 3 Subcat per Cat', and the DataFrame is updated to include this additional information.

In [64]:
"""
This code snippet sorts the 'Subcategory' column by 'Max Sales Amt' within each 'Category' and adds the sorted result as a new column named 'Subcategory Sorted by Max Sales Amt Ascending'.

- `pl.col('Subcategory')`: Selects the 'Subcategory' column.
- `sort_by('Max Sales Amt')`: Sorts the 'Subcategory' column by the 'Max Sales Amt' column.
- `over('Category', mapping_strategy='explode')`: Applies the sorting operation within each 'Category' group and uses the 'explode' mapping strategy to handle the grouping.
- `alias('Subcategory Sorted by Max Sales Amt Ascending')`: Renames the resulting sorted column to 'Subcategory Sorted by Max Sales Amt Ascending'.
"""
max_sales_rank.with_columns(
    pl.col('Subcategory')
    .sort_by('Max Sales Amt')
    .over('Category', mapping_strategy='explode')
    .alias('Subcategory Sorted by Max Sales Amt Ascending')
)

Category,Subcategory,Max Sales Amt,Rank,Subcategory Sorted by Max Sales Amt Ascending
str,str,i64,i64,str
"""Audio""","""Recording Pen""",2871,1,"""MP4&MP3"""
"""Audio""","""Bluetooth Headphones""",2250,2,"""Bluetooth Headphones"""
"""Audio""","""MP4&MP3""",2095,3,"""Recording Pen"""
"""Computers""","""Projectors & Screens""",19992,1,"""Computers Accessories"""
"""Computers""","""Laptops""",19485,2,"""Printers, Scanners & Fax"""
"""Computers""","""Desktops""",14535,3,"""Monitors"""
"""Computers""","""Monitors""",11425,4,"""Desktops"""
"""Computers""","""Printers, Scanners & Fax""",2508,5,"""Laptops"""
"""Computers""","""Computers Accessories""",2424,6,"""Projectors & Screens"""


This code snippet demonstrates how to sort a DataFrame column within groups and add the sorted result as a new column using the `polars` library in Python. The goal is to sort the 'Subcategory' column by the 'Max Sales Amt' column within each 'Category' and add this sorted result as a new column named 'Subcategory Sorted by Max Sales Amt Ascending'.

The process begins by selecting the 'Subcategory' column using 

pl.col('Subcategory')

. This function is used to specify the column that will be manipulated.

Next, the 

sort_by('Max Sales Amt')

 method is called to sort the 'Subcategory' column based on the values in the 'Max Sales Amt' column. This ensures that the subcategories are ordered according to their maximum sales amounts.

The 

over('Category', mapping_strategy='explode')

 method is then applied to perform the sorting operation within each 'Category' group. The 'explode' mapping strategy is used to handle the grouping, ensuring that the sorting is applied separately within each category.

Finally, the resulting sorted column is renamed to 'Subcategory Sorted by Max Sales Amt Ascending' using the 

alias('Subcategory Sorted by Max Sales Amt Ascending')

 method. This new column is added to the DataFrame, containing the subcategories sorted by their maximum sales amounts within each category.

In summary, this code snippet adds a new column to the 

max_sales_rank

 DataFrame that lists the subcategories sorted by their maximum sales amounts within each category. The new column is named 'Subcategory Sorted by Max Sales Amt Ascending', and the DataFrame is updated to include this additional information.

In [65]:
(
    max_sales_rank
    .sort('Subcategory')
    .with_columns(
        pl.col('Subcategory')
        .sort_by('Max Sales Amt')
        .over('Category', mapping_strategy='explode')
        .alias('Subcategory Sorted by Max Sales Amt Ascending')
    )
)

Category,Subcategory,Max Sales Amt,Rank,Subcategory Sorted by Max Sales Amt Ascending
str,str,i64,i64,str
"""Audio""","""Bluetooth Headphones""",2250,2,"""MP4&MP3"""
"""Computers""","""Computers Accessories""",2424,6,"""Bluetooth Headphones"""
"""Computers""","""Desktops""",14535,3,"""Recording Pen"""
"""Computers""","""Laptops""",19485,2,"""Computers Accessories"""
"""Audio""","""MP4&MP3""",2095,3,"""Printers, Scanners & Fax"""
"""Computers""","""Monitors""",11425,4,"""Monitors"""
"""Computers""","""Printers, Scanners & Fax""",2508,5,"""Desktops"""
"""Computers""","""Projectors & Screens""",19992,1,"""Laptops"""
"""Audio""","""Recording Pen""",2871,1,"""Projectors & Screens"""


## Applying UDFs

### How to do it...

In [66]:
import polars as pl

In [67]:
df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)
df.head()

Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,date,date,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
284806,1,2017-10-18,2017-10-20,"""Eric Kennedy""","""Male""","""United States""",47,"""Online store""","""Contoso 512MB MP3 Player E51 Silver""","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",7,11.691,10.288,5.958,"""USD""",1.0
285506,1,2017-10-25,2017-10-26,"""George Tooth""","""Male""","""Australia""",30,"""Online store""","""Contoso 512MB MP3 Player E51 Silver""","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",1,11.691,11.691,5.958,"""AUD""",1.2967
311002,2,2018-07-07,2018-07-12,"""Caleb Greene""","""Male""","""Australia""",59,"""Online store""","""Contoso 512MB MP3 Player E51 Silver""","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",6,12.99,12.99,6.62,"""AUD""",1.3484
366307,2,2020-01-11,2020-01-11,"""Isaac Siddins""","""Male""","""Australia""",25,"""Contoso Store Western Australia""","""Contoso 512MB MP3 Player E51 Blue""","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",4,12.99,12.99,6.62,"""AUD""",1.4545
325708,3,2018-12-01,2018-12-02,"""Mike McQueen""","""Male""","""United States""",56,"""Online store""","""Contoso 512MB MP3 Player E51 Blue""","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",2,12.99,11.5611,6.62,"""USD""",1.0


In [68]:
def get_first_name(full_name: str) -> str:
    return full_name.split(' ')[0]

In [73]:
"""
Selects the 'Customer Name' column from the DataFrame and creates a new column 'Customer First Name' 
by applying the `get_first_name` function to each element in the 'Customer Name' column.

Returns:
    DataFrame: A DataFrame with the original 'Customer Name' column and a new 'Customer First Name' column.
"""
df.select(
    'Customer Name',
    pl.col('Customer Name').map_elements(lambda el: get_first_name(el), return_dtype=pl.String).alias('Customer First Name')
).head()

Customer Name,Customer First Name
str,str
"""Eric Kennedy""","""Eric"""
"""George Tooth""","""George"""
"""Caleb Greene""","""Caleb"""
"""Isaac Siddins""","""Isaac"""
"""Mike McQueen""","""Mike"""


This code snippet demonstrates how to manipulate a DataFrame using the `polars` library in Python to extract and create a new column based on existing data. The primary goal is to create a new column 'Customer First Name' by extracting the first name from the 'Customer Name' column.

The process begins by selecting the 'Customer Name' column from the DataFrame 

df

 using the 

select

 method. This method allows for the selection of specific columns from the DataFrame.

Next, the 

pl.col('Customer Name')

 function is used to specify the 'Customer Name' column for further manipulation. The 

map_elements

 method is then applied to this column, which allows for the application of a function to each element in the column. In this case, a lambda function is used to apply the 

get_first_name

 function to each element in the 'Customer Name' column. The 

get_first_name

 function is assumed to extract the first name from a full name string.

The 

return_dtype=pl.String

 argument specifies that the resulting data type of the new column should be a string. The 

alias('Customer First Name')

 method is used to rename the resulting column to 'Customer First Name'.

Finally, the 

head()

 method is called to display the first few rows of the modified DataFrame. This allows for a quick inspection of the changes made to the DataFrame, specifically the addition of the 'Customer First Name' column.

In summary, this code snippet selects the 'Customer Name' column from the DataFrame 

df

 and creates a new column 'Customer First Name' by applying the 

get_first_name

 function to each element in the 'Customer Name' column. The resulting DataFrame includes both the original 'Customer Name' column and the new 'Customer First Name' column, and the first few rows of the DataFrame are displayed for inspection.

In [74]:
df.select(
    'Customer Name',
    pl.col('Customer Name').map_elements(lambda el: el.split(' ')[0], return_dtype=pl.String).alias('Customer First Name')
).head()

Customer Name,Customer First Name
str,str
"""Eric Kennedy""","""Eric"""
"""George Tooth""","""George"""
"""Caleb Greene""","""Caleb"""
"""Isaac Siddins""","""Isaac"""
"""Mike McQueen""","""Mike"""


In [75]:
def age_to_range(age: int) -> str:
    if age < 18:
        return '~17'
    elif age <= 30:
        return '18~30'
    elif age <= 50:
        return '31~50'
    elif age <= 70:
        return '50~70'
    else:
        return '71~'

In [76]:
df.select(
    'Customer Age',
    pl.col('Customer Age').map_elements(lambda el: age_to_range(el), return_dtype=pl.String).alias('Age Range')
).head()

Customer Age,Age Range
i64,str
47,"""31~50"""
30,"""18~30"""
59,"""50~70"""
25,"""18~30"""
56,"""50~70"""


### There is more...

In [77]:
df.select(
    'Customer Name',
    pl.col('Customer Name').str.split(' ').list.first().alias('Customer First Name')
).head()

Customer Name,Customer First Name
str,str
"""Eric Kennedy""","""Eric"""
"""George Tooth""","""George"""
"""Caleb Greene""","""Caleb"""
"""Isaac Siddins""","""Isaac"""
"""Mike McQueen""","""Mike"""


In [78]:
df.select(
    'Customer Age',
    pl.when(pl.col('Customer Age')<18).then(pl.lit('~17'))
    .when(pl.col('Customer Age')<=30).then(pl.lit('18~30'))
    .when(pl.col('Customer Age')<=50).then(pl.lit('31~50'))
    .when(pl.col('Customer Age')<=70).then(pl.lit('51~70'))
    .when(pl.col('Customer Age')>70).then(pl.lit('71~'))
    .alias('Age Range')
).head()

Customer Age,Age Range
i64,str
47,"""31~50"""
30,"""18~30"""
59,"""51~70"""
25,"""18~30"""
56,"""51~70"""


In [80]:
%%timeit
"""
This code snippet uses the Polars library to select and transform columns in a DataFrame.

It performs the following operations:
1. Selects the 'Customer Name' column.
2. Applies a transformation to the 'Customer Name' column to extract the first name by splitting the string at the space character.
3. Renames the transformed column to 'Customer First Name'.
4. Returns the first few rows of the resulting DataFrame.

The %%timeit magic command is used to measure the execution time of the code.

Returns:
    A DataFrame with the original 'Customer Name' column and a new 'Customer First Name' column containing the first names.
"""
df.select(
    'Customer Name',
    pl.col('Customer Name').map_elements(lambda el: el.split(' ')[0], return_dtype=pl.String).alias('Customer First Name')
).head()

5.85 ms ± 747 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [81]:
%%timeit
df.select(
    'Customer Name',
    pl.col('Customer Name').str.split(' ').list.first().alias('Customer First Name')
).head()

1.11 ms ± 55.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Using SQL for data transformations

In [82]:
import polars as pl
df = pl.read_csv('../data/contoso_sales.csv', try_parse_dates=True)
df.head()

Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,date,date,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
284806,1,2017-10-18,2017-10-20,"""Eric Kennedy""","""Male""","""United States""",47,"""Online store""","""Contoso 512MB MP3 Player E51 Silver""","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",7,11.691,10.288,5.958,"""USD""",1.0
285506,1,2017-10-25,2017-10-26,"""George Tooth""","""Male""","""Australia""",30,"""Online store""","""Contoso 512MB MP3 Player E51 Silver""","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",1,11.691,11.691,5.958,"""AUD""",1.2967
311002,2,2018-07-07,2018-07-12,"""Caleb Greene""","""Male""","""Australia""",59,"""Online store""","""Contoso 512MB MP3 Player E51 Silver""","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",6,12.99,12.99,6.62,"""AUD""",1.3484
366307,2,2020-01-11,2020-01-11,"""Isaac Siddins""","""Male""","""Australia""",25,"""Contoso Store Western Australia""","""Contoso 512MB MP3 Player E51 Blue""","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",4,12.99,12.99,6.62,"""AUD""",1.4545
325708,3,2018-12-01,2018-12-02,"""Mike McQueen""","""Male""","""United States""",56,"""Online store""","""Contoso 512MB MP3 Player E51 Blue""","""Blue""","""Contoso""","""Audio""","""MP4&MP3""",2,12.99,11.5611,6.62,"""USD""",1.0


### How to do it...

In [85]:
# This code snippet demonstrates how to use the Polars SQLContext to execute a SQL query on a registered DataFrame.
#
# Steps:
# 1. Create an instance of `pl.SQLContext` with eager execution enabled.
# 2. Register the DataFrame `df` with the context under the name 'df'.
# 3. Execute a SQL query to select the columns 'Customer Name', 'Brand', and 'Category' from the registered DataFrame and limit the result to 5 rows.
#
# Returns:
# - A DataFrame containing the first 5 rows with the specified columns from the original DataFrame.

ctx = pl.SQLContext(eager=True)
ctx.register('df', df)
ctx.execute(
    """
      select
        `Customer Name`,
        Brand,
        Category
      from df limit 5
    """
)


Customer Name,Brand,Category
str,str,str
"""Eric Kennedy""","""Contoso""","""Audio"""
"""George Tooth""","""Contoso""","""Audio"""
"""Caleb Greene""","""Contoso""","""Audio"""
"""Isaac Siddins""","""Contoso""","""Audio"""
"""Mike McQueen""","""Contoso""","""Audio"""


In [86]:
ctx.execute(
    """
      select
        Brand,
        avg(Quantity) as `Avg Quantity` 
      from df
      group by 
        Brand
      order by 
        `Avg Quantity` desc
      limit 5
    """
)

Brand,Avg Quantity
str,f64
"""Fabrikam""",3.225532
"""Northwind Traders""",3.222222
"""Wide World Importers""",3.193811
"""Fabrikam """,3.192308
"""Southridge Video""",3.189509


In [87]:
pl.SQLContext(lf=df.lazy()).execute(
    """
        select 
            Brand,
            Category
        from lf
        limit 5
    """
).collect()

Brand,Category
str,str
"""Contoso""","""Audio"""
"""Contoso""","""Audio"""
"""Contoso""","""Audio"""
"""Contoso""","""Audio"""
"""Contoso""","""Audio"""
