### Advanced Data Transformation with Polars

The examples below use `.head()` to reduce the output to a few rows and take up less space.
If you want the full output, remove `.head()` from the code.

This notebook is divided into sections. If your code editor supports it, you can use the **Outline** functionality to jump directly to the section you are interested in.

For more details on Polars functions, check out the Polars API reference: https://pola-rs.github.io/polars/py-polars/html/reference/index.html

In [2]:
import polars as pl

# Configure the number of characters to show for each string column
pl.Config.set_fmt_str_lengths(30)

polars.config.Config

In [3]:
pokemon = pl.read_parquet('../datasets/pokemon_simplified.parquet')

In [4]:
pokemon.head(3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0


#### Conditions

In [5]:
# Identify the top 25% of Pokemon based on the attack stats
# Use the quantile function to get the 75th percentile

pokemon.select(
    'name', 'attack',
    attack_category = 
    pl.when(pl.col('attack') > pl.col('attack').quantile(0.75))
    .then('Top 25%')
).head(6)

name,attack,attack_category
str,i64,str
"""Bulbasaur""",49,
"""Ivysaur""",62,
"""Venusaur""",100,
"""Charmander""",52,
"""Charmeleon""",64,
"""Charizard""",104,"""Top 25%"""


In [6]:
# Identify the top 25% of Pokemon based on the attack stats
# Use otherwise to assign the remaining 75% of Pokemon to the 'Other' category

pokemon.select(
    'name', 'attack',
    attack_category = 
        pl.when(pl.col('attack') > pl.col('attack').quantile(0.75))
        .then('Top 25%')
        .otherwise('Other')
).head(6)

name,attack,attack_category
str,i64,str
"""Bulbasaur""",49,"""Other"""
"""Ivysaur""",62,"""Other"""
"""Venusaur""",100,"""Other"""
"""Charmander""",52,"""Other"""
"""Charmeleon""",64,"""Other"""
"""Charizard""",104,"""Top 25%"""


In [7]:
# Identify the top 25% of Pokemon based on the attack stats AND the bottom 25%
# Use multiple when then statements

pokemon.select(
    'name', 'attack',
    attack_category = 
        pl.when(pl.col('attack') > pl.col('attack').quantile(0.75))
        .then('Top 25%')
        .when(pl.col('attack') < pl.col('attack').quantile(0.25))
        .then('Bottom 25%')
        .otherwise('Other')
).head(6)

name,attack,attack_category
str,i64,str
"""Bulbasaur""",49,"""Bottom 25%"""
"""Ivysaur""",62,"""Other"""
"""Venusaur""",100,"""Other"""
"""Charmander""",52,"""Bottom 25%"""
"""Charmeleon""",64,"""Other"""
"""Charizard""",104,"""Top 25%"""


In [8]:
# We can use map_dict instead of pl.when().then() to map colors to Pokemon types

type_colors = {'grass': 'green', 'fire': 'red', 'water': 'blue'}

pokemon.select(
    'name', 'type1',
    color = pl.col('type1').map_dict(type_colors)
).head(6)

name,type1,color
str,str,str
"""Bulbasaur""","""grass""","""green"""
"""Ivysaur""","""grass""","""green"""
"""Venusaur""","""grass""","""green"""
"""Charmander""","""fire""","""red"""
"""Charmeleon""","""fire""","""red"""
"""Charizard""","""fire""","""red"""


#### One hot encoding

In [9]:
# Convert the column type1 to dummies

pokemon.head(6).select('name','type1').to_dummies(columns=['type1'])

name,type1_fire,type1_grass
str,u8,u8
"""Bulbasaur""",0,1
"""Ivysaur""",0,1
"""Venusaur""",0,1
"""Charmander""",1,0
"""Charmeleon""",1,0
"""Charizard""",1,0


#### Aggregating data

In [10]:
# Group by type and calculate median stats
# Group by is run in parallel so the order of the rows can change

pokemon.groupby('type1').agg(
    pl.col('hp', 'attack', 'defense', 'speed').median()
).head(3)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""ice""",70.0,65.0,70.0,59.0
"""normal""",70.0,75.0,60.0,68.0
"""bug""",60.0,65.0,60.0,58.5


In [11]:
# Group by type and calculate median stats
# Maintaining the order is possible but it is slower

pokemon.groupby('type1', maintain_order=True).agg(
    pl.col('hp', 'attack', 'defense', 'speed').median()
).head(3)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""grass""",65.0,68.0,66.0,55.0
"""fire""",65.0,79.5,61.5,72.5
"""water""",68.0,70.0,70.0,64.0


In [12]:
# Group on multiple columns works the same way

pokemon.groupby(['type1', 'type2']).agg(
    pl.col('hp', 'attack', 'defense', 'speed').median()
).head(3)

type1,type2,hp,attack,defense,speed
str,str,f64,f64,f64,f64
"""water""","""poison""",65.0,70.0,65.0,85.0
"""ghost""","""poison""",45.0,50.0,45.0,95.0
"""normal""",,75.0,80.0,63.0,60.0


In [13]:
# Many aggregation functions are available and can be combined in a single agg call

pokemon.groupby('type1').agg(
    avg_attack = pl.col('attack').mean(),
    max_attack = pl.col('attack').max(),
    quantile_75 = pl.col('attack').quantile(0.75),
    count = pl.count(), 
    first_pokemon = pl.first('name'), 
    # Sort the names of each group by attack (descending) and keep the first one
    strongest_pokemon = pl.col('name').sort_by('attack', descending=True).first()
).head(3)

type1,avg_attack,max_attack,quantile_75,count,first_pokemon,strongest_pokemon
str,f64,i64,f64,u32,str,str
"""ice""",73.304348,130,95.0,23,"""Jynx""","""Mamoswine"""
"""water""",73.307018,155,90.0,114,"""Squirtle""","""Gyarados"""
"""fighting""",99.178571,145,124.0,28,"""Mankey""","""Lucario"""


In [14]:
# Possible to use Numpy universal functions (ufuncs) in aggregations

import numpy as np

pokemon.groupby('type1').agg(
    sqrt_attack_mean = np.sqrt(pl.col('attack')).mean() # sqrt is the square root function
).head(3)

type1,sqrt_attack_mean
str,f64
"""rock""",9.342125
"""steel""",9.516288
"""poison""",8.445286


In [15]:
# If no aggregation function is specified, the values of each group are collected in a list
# Here head(3) keeps the first 3 names of each group

pokemon.groupby('type1').agg(
    pl.col('name').head(3)
).head(3)

type1,name
str,list[str]
"""fighting""","[""Mankey"", ""Primeape"", ""Machop""]"
"""psychic""","[""Abra"", ""Kadabra"", ""Alakazam""]"
"""poison""","[""Ekans"", ""Arbok"", ""Nidoran♀""]"


In [16]:
# Concatenate the names of the pokemon into a string

pokemon.groupby('type1').agg(
    pl.col('name').head(3).str.concat(', ')
).head(3)

type1,name
str,str
"""electric""","""Pikachu, Raichu, Magnemite"""
"""ice""","""Jynx, Articuno, Swinub"""
"""ghost""","""Gastly, Haunter, Gengar"""


In [17]:
# Aggregation with conditions
# Calculate, by type, the average attack for Pokemon with a speed above 100
# and for Pokemon with a speed below 100
# Both comparisons are strict: Pokemon with a speed of exactly 100 are in neither average

pokemon.groupby('type1').agg(
    avg_attack_when_speed_above_100 = 
        pl.col('attack').filter(pl.col('speed') > 100).mean(),
    avg_attack_when_speed_below_100 = 
        pl.col('attack').filter(pl.col('speed') < 100).mean()
).head(3)


type1,avg_attack_when_speed_above_100,avg_attack_when_speed_below_100
str,f64,f64
"""rock""",119.5,86.230769
"""bug""",105.3,64.52459
"""fighting""",120.666667,96.458333


In [18]:
# We can also create functions to use in aggregations

def avg_attack_when_speed_above_threshold(speed):
    """Build an expression for the mean attack of the rows faster than `speed`.

    The rows are kept only when their 'speed' is strictly greater than
    `speed`. The expression is aliased `avg_attack_when_speed_above_<speed>`,
    so calling this function with different thresholds yields distinct
    column names.
    """
    is_faster = pl.col('speed') > speed
    column_name = f'avg_attack_when_speed_above_{speed}'
    return pl.col('attack').filter(is_faster).mean().alias(column_name)

pokemon.groupby('type1').agg(
    avg_attack_when_speed_above_threshold(speed) for speed in range(30,91,30)
).head(3)

type1,avg_attack_when_speed_above_30,avg_attack_when_speed_above_60,avg_attack_when_speed_above_90
str,f64,f64,f64
"""grass""",76.181818,83.870968,95.4
"""fighting""",100.62963,106.076923,113.4
"""ground""",94.966667,98.733333,111.4


In [19]:
# Show the top 3 pokemon by attack and the top 3 pokemon by defense in each group
(
    pokemon.groupby('type1').agg(
        top_3_pokemon_by_attack = 
            pl.col('name').sort_by('attack', descending=True).head(3),
        top_3_pokemon_by_defense = 
            pl.col('name').sort_by('defense', descending=True).head(3) 
    )
).head(5)

type1,top_3_pokemon_by_attack,top_3_pokemon_by_defense
str,list[str],list[str]
"""fire""","[""Blaziken"", ""Flareon"", ""Ho-Oh""]","[""Torkoal"", ""Turtonator"", ""Magcargo""]"
"""normal""","[""Slaking"", ""Regigigas"", ""Lopunny""]","[""Audino"", ""Arceus"", ""Regigigas""]"
"""dark""","[""Absol"", ""Yveltal"", ""Honchkrow""]","[""Sableye"", ""Scrafty"", ""Umbreon""]"
"""electric""","[""Electivire"", ""Luxray"", ""Eelektross""]","[""Magnezone"", ""Rotom"", ""Ampharos""]"
"""rock""","[""Rampardos"", ""Tyranitar"", ""Diancie""]","[""Regirock"", ""Bastiodon"", ""Onix""]"


In [20]:
# Show the top 3 pokemon by attack and the top 3 pokemon by defense without grouping by type
# implode() gathers the 3 names into a single list, so each column has exactly one row

pokemon.select(
    top_3_pokemon_by_attack = 
        pl.col('name').sort_by('attack', descending=True).head(3).implode(),
    top_3_pokemon_by_defense = 
        pl.col('name').sort_by('defense', descending=True).head(3).implode()
)

top_3_pokemon_by_attack,top_3_pokemon_by_defense
list[str],list[str]
"[""Heracross"", ""Kartana"", ""Groudon""]","[""Steelix"", ""Shuckle"", ""Aggron""]"


In [21]:
# I can iterate over the groups and do any action on them

for name, data in pokemon.groupby("is_legendary"):
    print(name)
    print(data.head(2))

0
shape: (2, 9)
┌───────────┬───────┬────────┬────────────────┬───┬────────┬─────────┬───────┬──────────────┐
│ name      ┆ type1 ┆ type2  ┆ abilities      ┆ … ┆ attack ┆ defense ┆ speed ┆ is_legendary │
│ ---       ┆ ---   ┆ ---    ┆ ---            ┆   ┆ ---    ┆ ---     ┆ ---   ┆ ---          │
│ str       ┆ str   ┆ str    ┆ list[str]      ┆   ┆ i64    ┆ i64     ┆ i64   ┆ i64          │
╞═══════════╪═══════╪════════╪════════════════╪═══╪════════╪═════════╪═══════╪══════════════╡
│ Bulbasaur ┆ grass ┆ poison ┆ ["Overgrow",   ┆ … ┆ 49     ┆ 49      ┆ 45    ┆ 0            │
│           ┆       ┆        ┆ "Chlorophyll"] ┆   ┆        ┆         ┆       ┆              │
│ Ivysaur   ┆ grass ┆ poison ┆ ["Overgrow",   ┆ … ┆ 62     ┆ 63      ┆ 60    ┆ 0            │
│           ┆       ┆        ┆ "Chlorophyll"] ┆   ┆        ┆         ┆       ┆              │
└───────────┴───────┴────────┴────────────────┴───┴────────┴─────────┴───────┴──────────────┘
1
shape: (2, 9)
┌──────────┬──────────┬─────

#### Window functions

In [22]:
# For each pokemon show its attack, the average attack for pokemons of the same type 1
# and the percentage difference between the two

pokemon.select(
    'name', 'type1', 'attack',
    avg_attack_for_same_type1 = pl.mean('attack').over('type1').round(1),
    pct_difference_vs_avg = 
        (((pl.col('attack') / pl.mean('attack')).over('type1') -1) * 100).round(1)
).head(5)

name,type1,attack,avg_attack_for_same_type1,pct_difference_vs_avg
str,str,i64,f64,f64
"""Bulbasaur""","""grass""",49,73.8,-33.6
"""Ivysaur""","""grass""",62,73.8,-16.0
"""Venusaur""","""grass""",100,73.8,35.6
"""Charmander""","""fire""",52,81.5,-36.2
"""Charmeleon""","""fire""",64,81.5,-21.5


In [23]:
# For each pokemon show its attack and the average attack for pokemons of the same type 1 and pokemons of the same type 2

pokemon.select(
    'name', 'type1', 'type2', 'attack',
    avg_attack_for_same_type1 = pl.mean('attack').over('type1'),
    avg_attack_for_same_type2 = pl.mean('attack').over('type2')
).head(5)

name,type1,type2,attack,avg_attack_for_same_type1,avg_attack_for_same_type2
str,str,str,i64,f64,f64
"""Bulbasaur""","""grass""","""poison""",49,73.769231,67.617647
"""Ivysaur""","""grass""","""poison""",62,73.769231,67.617647
"""Venusaur""","""grass""","""poison""",100,73.769231,67.617647
"""Charmander""","""fire""",,52,81.5,74.231771
"""Charmeleon""","""fire""",,64,81.5,74.231771


In [24]:
# For each pokemon, show the top 3 pokemon by attack of the same type

pokemon.select(
    'name', 'type1', 'attack',
    top_3_attack_pokemon_same_type = 
        pl.col('name').sort_by('attack', descending=True).head(3)
        .over('type1', mapping_strategy='join')
).head(5)

name,type1,attack,top_3_attack_pokemon_same_type
str,str,i64,list[str]
"""Bulbasaur""","""grass""",49,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Ivysaur""","""grass""",62,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Venusaur""","""grass""",100,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Charmander""","""fire""",52,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"
"""Charmeleon""","""fire""",64,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"


In [25]:
# Show top 3 pokemon by attack and defense

pokemon.select(
    pl.col('type1').head(3).over('type1', mapping_strategy='explode'),
    top_3_attack = 
        pl.col('name').sort_by('attack', descending=True)
        .head(3).over('type1', mapping_strategy='explode'), 
    top_3_defense = 
        pl.col('name').sort_by('defense', descending=True)
        .head(3).over('type1', mapping_strategy='explode')
).head(6)

type1,top_3_attack,top_3_defense
str,str,str
"""grass""","""Kartana""","""Ferrothorn"""
"""grass""","""Abomasnow""","""Kartana"""
"""grass""","""Breloom""","""Leafeon"""
"""fire""","""Blaziken""","""Torkoal"""
"""fire""","""Flareon""","""Turtonator"""
"""fire""","""Ho-Oh""","""Magcargo"""


In [26]:
# Rank the pokemon by attack and defense by type

pokemon.select(
    'name', 'type1', 'attack', 'defense',
    attack_rank = 
        pl.col('attack').rank(method='ordinal', descending=True).over('type1'),
    defense_rank = 
        pl.col('defense').rank(method='ordinal', descending=True).over('type1')
).head(6)

name,type1,attack,defense,attack_rank,defense_rank
str,str,i64,i64,u32,u32
"""Bulbasaur""","""grass""",49,49,61,62
"""Ivysaur""","""grass""",62,63,45,45
"""Venusaur""","""grass""",100,123,16,5
"""Charmander""","""fire""",52,43,45,44
"""Charmeleon""","""fire""",64,58,37,29
"""Charizard""","""fire""",104,78,10,14


#### Range function

In [27]:
# Add an increasing index for the pokemon of each type

pokemon.select(
    'name', 'type1',
    type_index = pl.arange(1, pl.count()+1).over('type1')
).head(6)

name,type1,type_index
str,str,i64
"""Bulbasaur""","""grass""",1
"""Ivysaur""","""grass""",2
"""Venusaur""","""grass""",3
"""Charmander""","""fire""",1
"""Charmeleon""","""fire""",2
"""Charizard""","""fire""",3


In [28]:
# Take a random sample of 2 pokemons for each group

pokemon.select('name','type1').filter(
    pl.arange(1, pl.count()+1).shuffle().over("type1") <= 2
).sort(by='type1').head(4)

name,type1
str,str
"""Venomoth""","""bug"""
"""Pinsir""","""bug"""
"""Vullaby""","""dark"""
"""Mandibuzz""","""dark"""


In [29]:
# Take a random sample of 5% of pokemons for each group

pokemon.filter(
    pl.arange(1, pl.count() + 1).shuffle().over("type1") 
    <= pl.count().over("type1") * 0.05
).sort(by="type1").head(6)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Caterpie""","""bug""",,"[""Shield Dust"", ""Run Away""]",45,30,35,45,0
"""Kricketot""","""bug""",,"[""Shed Skin"", ""Run Away""]",37,25,41,25,0
"""Mothim""","""bug""","""flying""","[""Swarm"", ""Tinted Lens""]",70,94,50,66,0
"""Vullaby""","""dark""","""flying""","[""Big Pecks"", ""Overcoat"", ""Weak Armor""]",70,55,75,60,0
"""Reshiram""","""dragon""","""fire""","[""Turboblaze""]",100,120,100,90,1
"""Thundurus""","""electric""","""flying""","[""Prankster"", ""Defiant"", ""Volt Absorb""]",79,105,70,101,1


In [30]:
# Get the 5th pokemon for each group

pokemon.filter(
    pl.arange(1, pl.count() + 1).over("type1") == 5
).head(5)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Kakuna""","""bug""","""poison""","[""Shed Skin""]",45,25,50,35,0
"""Raticate""","""normal""","""dark""","[""Run Away"", ""Guts"", … ""Thick Fat""]",75,71,70,77,0
"""Nidoqueen""","""poison""","""ground""","[""Poison Point"", ""Rivalry"", ""Sheer Force""]",90,92,87,76,0
"""Ninetales""","""fire""","""ice""","[""Flash Fire"", ""Drought"", … ""Snow Warning""]",73,67,75,109,0
"""Gloom""","""grass""","""poison""","[""Chlorophyll"", ""Stench""]",60,65,70,40,0


#### Binning functions: Cut and Qcut

In [105]:
# Assigns the pokemon a category based on their attack stats:
# <= 50 - weak, <= 100 - medium, > 100 - strong

pokemon.select('name', 'type1', 'attack').hstack(
    pokemon.get_column('attack')
    .cut([50, 100], labels=['weak', 'medium', 'strong'], maintain_order=True)
    .select('break_point', 'category')
).head(6)

name,type1,attack,break_point,category
str,str,i64,f64,cat
"""Bulbasaur""","""grass""",49,50.0,"""weak"""
"""Ivysaur""","""grass""",62,100.0,"""medium"""
"""Venusaur""","""grass""",100,100.0,"""medium"""
"""Charmander""","""fire""",52,100.0,"""medium"""
"""Charmeleon""","""fire""",64,100.0,"""medium"""
"""Charizard""","""fire""",104,inf,"""strong"""


In [104]:
# Assigns the pokemon a category based on their attack stats using percentiles:
# bottom 40% - weak, top 20% - strong, rest - medium

pokemon.select('name', 'type1', 'attack').hstack(
    pokemon.get_column('attack')
    .qcut(quantiles=[0.4,0.8], labels=['weak', 'medium', 'strong'], maintain_order=True)
    .select('break_point', 'category')
).head(6)

name,type1,attack,break_point,category
str,str,i64,f64,cat
"""Bulbasaur""","""grass""",49,65.0,"""weak"""
"""Ivysaur""","""grass""",62,65.0,"""weak"""
"""Venusaur""","""grass""",100,104.0,"""medium"""
"""Charmander""","""fire""",52,65.0,"""weak"""
"""Charmeleon""","""fire""",64,65.0,"""weak"""
"""Charizard""","""fire""",104,104.0,"""medium"""


#### Repeat

In [110]:
# Use a different dataframe for this example
# Based on a dataframe with 3 fruits and their order quantities
# Repeat the fruit names based on their order quantities

df = pl.DataFrame(
    {
        "fruit": ["Apple", "Banana", "Cherry"],
        "order_quantity": [1, 3, 2],
    }
)

df

fruit,order_quantity
str,i64
"""Apple""",1
"""Banana""",3
"""Cherry""",2


In [112]:
df.with_columns(
    fruits_repeated = pl.col('fruit').repeat_by('order_quantity')
)

fruit,order_quantity,fruits_repeated
str,i64,list[str]
"""Apple""",1,"[""Apple""]"
"""Banana""",3,"[""Banana"", ""Banana"", ""Banana""]"
"""Cherry""",2,"[""Cherry"", ""Cherry""]"


#### Explode and Implode

In [116]:
# Explode the list of pokemon abilities: from a column of list to a column of values
# Point of attention when using explode: explode increases the number of rows
# All columns in a Polars dataframe should have the same number of rows

pokemon.head(3).select(
    pl.col('abilities')
)

abilities
list[str]
"[""Overgrow"", ""Chlorophyll""]"
"[""Overgrow"", ""Chlorophyll""]"
"[""Overgrow"", ""Chlorophyll""]"


In [117]:
pokemon.head(3).select(
    pl.col('abilities').explode()
)

abilities
str
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""


In [36]:
# Implode is the opposite of explode, and aggregates values into lists

pokemon.head(3).select(
    pl.all().implode(),
)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
list[str],list[str],list[str],list[list[str]],list[i64],list[i64],list[i64],list[i64],list[i64]
"[""Bulbasaur"", ""Ivysaur"", ""Venusaur""]","[""grass"", ""grass"", ""grass""]","[""poison"", ""poison"", ""poison""]","[[""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""]]","[45, 60, 80]","[49, 62, 100]","[49, 63, 123]","[45, 60, 80]","[0, 0, 0]"


#### Conversion between structs and lists

In [39]:
# Combine type 1 and type 2 in a new struct column

pokemon.select(
    'name',
    types = pl.struct('type1', 'type2')
).head(5)

name,types
str,struct[2]
"""Bulbasaur""","{""grass"",""poison""}"
"""Ivysaur""","{""grass"",""poison""}"
"""Venusaur""","{""grass"",""poison""}"
"""Charmander""","{""fire"",null}"
"""Charmeleon""","{""fire"",null}"


In [40]:
# The struct column retains the type information of each initial column:
# Struct are views on the initial columns so the data is not copied
# The table schema shows that the names of the initial columns are retained 

pokemon.select(
    'name',
    types = pl.struct('type1', 'type2')
).schema

{'name': Utf8, 'types': Struct([Field('type1', Utf8), Field('type2', Utf8)])}

In [41]:
# Structs are important because they allow us to do calculations on combined columns
# For example, we can count how many times each combination of types occurs

pokemon.select(
    types_count = pl.struct('type1', 'type2').value_counts()
).head(5)

types_count
struct[2]
"{{""grass"",""poison""},14}"
"{{""fire"",null},27}"
"{{""fire"",""flying""},6}"
"{{""water"",null},61}"
"{{""bug"",null},18}"


In [42]:
# Combine type 1 and type 2 in a new list column

pokemon.select(
    'name',
    types = pl.concat_list('type1', 'type2')
).head(5)

name,types
str,list[str]
"""Bulbasaur""","[""grass"", ""poison""]"
"""Ivysaur""","[""grass"", ""poison""]"
"""Venusaur""","[""grass"", ""poison""]"
"""Charmander""","[""fire"", null]"
"""Charmeleon""","[""fire"", null]"


In [43]:
# List columns do not retain the names of the initial columns
# Unlike Struct columns, where every row has the same number of fields,
# each row of a List column can have a different number of elements

pokemon.select(
    'name',
    types = pl.concat_list('type1', 'type2')
).schema

{'name': Utf8, 'types': List(Utf8)}

In [44]:
# List columns can be converted to a struct
# If number of elements are not the same, then the missing values are filled with null

pokemon.select(
    pl.col('abilities').list.to_struct(n_field_strategy = 'max_width')
).head(5)

abilities
struct[6]
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"


In [45]:
# By default, the fields inside of the struct are called field_0, field_1, etc.

pokemon.select(
    pl.col('abilities').list.to_struct(n_field_strategy = 'max_width')
).schema

{'abilities': Struct([Field('field_0', Utf8), Field('field_1', Utf8), Field('field_2', Utf8), Field('field_3', Utf8), Field('field_4', Utf8), Field('field_5', Utf8)])}

In [46]:
# It's possible to change the name with the argument fields, for example calling them ability_0, ability_1, etc.

pokemon.select(
    pl.col('abilities')
    .list.to_struct(n_field_strategy = 'max_width', fields = lambda i: f'ability_{i}')
).schema

{'abilities': Struct([Field('ability_0', Utf8), Field('ability_1', Utf8), Field('ability_2', Utf8), Field('ability_3', Utf8), Field('ability_4', Utf8), Field('ability_5', Utf8)])}

In [47]:
# We can un-nest struct columns and expand them into multiple columns

pokemon.select(
    pl.col('abilities')
    .list.to_struct(n_field_strategy = 'max_width')
).unnest('abilities').head(5)

field_0,field_1,field_2,field_3,field_4,field_5
str,str,str,str,str,str
"""Overgrow""","""Chlorophyll""",,,,
"""Overgrow""","""Chlorophyll""",,,,
"""Overgrow""","""Chlorophyll""",,,,
"""Blaze""","""Solar Power""",,,,
"""Blaze""","""Solar Power""",,,,


#### Pivot, Melt, Unstack and Transpose

In [48]:
# Pivot type 2 to the columns and count the number of pokemons for each type 1 and type 2 combination

pokemon.pivot(
    values = 'name',
    index = 'type1',
    columns = 'type2',
    aggregate_function=pl.element().count()
)


type1,poison,null,flying,dark,electric,ice,ground,fairy,grass,fighting,psychic,steel,fire,rock,water,dragon,ghost,bug,normal
str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""grass""",14.0,37,6.0,3.0,,2.0,1.0,5.0,1.0,3.0,2.0,3.0,,,,,1.0,,
"""fire""",,27,6.0,1.0,,2.0,2.0,,,6.0,1.0,1.0,1.0,1.0,1.0,1.0,,,2.0
"""water""",3.0,61,7.0,4.0,2.0,3.0,9.0,4.0,3.0,2.0,5.0,1.0,,4.0,,2.0,2.0,2.0,
"""bug""",11.0,18,13.0,,4.0,,1.0,2.0,6.0,3.0,,5.0,2.0,3.0,3.0,,1.0,,
"""normal""",,61,26.0,4.0,,,1.0,4.0,2.0,2.0,3.0,,,,1.0,1.0,,,
"""poison""",2.0,13,3.0,3.0,,,2.0,,,2.0,,,2.0,,3.0,1.0,,1.0,
"""electric""",,26,3.0,,1.0,,,2.0,,,,4.0,,,,,1.0,,2.0
"""ground""",,10,3.0,3.0,1.0,2.0,2.0,,,,2.0,1.0,1.0,3.0,,2.0,2.0,,
"""fairy""",,16,2.0,,,,,,,,,,,,,,,,
"""fighting""",,22,1.0,1.0,,1.0,,,,,2.0,1.0,,,,,,,


In [49]:
# Pivot does not exist in lazy mode, however it's possible to reproduce it with group by
# In this case, we need to specify the resulting columns in the code

pokemon.lazy().groupby('type1').agg(
    pl.col('name').filter(pl.col('type2') == pokemon_type).count().alias(pokemon_type)
      for pokemon_type in ['poison', 'flying', 'dark', 'electric']
).collect()

type1,poison,flying,dark,electric
str,u32,u32,u32,u32
"""fire""",0,6,1,0
"""grass""",14,6,3,0
"""psychic""",0,6,0,0
"""flying""",0,0,0,0
"""dark""",0,5,0,0
"""water""",3,7,4,2
"""ice""",0,2,0,0
"""poison""",2,3,3,0
"""ghost""",3,2,1,0
"""dragon""",0,4,0,1


In [50]:
# Melt is the opposite of pivot, it brings the header of multiple columns into one column
# and their value in another column

pokemon.melt(
    id_vars = ['name', 'type1', 'type2'],
    value_vars = ['hp','attack', 'defense']
).sort(by='name').head(6)

name,type1,type2,variable,value
str,str,str,str,i64
"""Abomasnow""","""grass""","""ice""","""hp""",90
"""Abomasnow""","""grass""","""ice""","""attack""",132
"""Abomasnow""","""grass""","""ice""","""defense""",105
"""Abra""","""psychic""",,"""hp""",25
"""Abra""","""psychic""",,"""attack""",20
"""Abra""","""psychic""",,"""defense""",15


In [51]:
# Unstack breaks the dataframe into multiple groups of the same size 
# and moves these groups to new columns
# Here we split the dataframe into groups of 3, and add a column with the level of the pokemon

(
    pokemon
    .head(9)
    .unstack(columns = ['name', 'attack', 'defense'], step=3, how="vertical")
    .with_columns(
        level = pl.Series(range(1,4))
        )
)

name_0,name_1,name_2,attack_0,attack_1,attack_2,defense_0,defense_1,defense_2,level
str,str,str,i64,i64,i64,i64,i64,i64,i64
"""Bulbasaur""","""Charmander""","""Squirtle""",49,52,48,49,43,65,1
"""Ivysaur""","""Charmeleon""","""Wartortle""",62,64,63,63,58,80,2
"""Venusaur""","""Charizard""","""Blastoise""",100,104,103,123,78,120,3


In [52]:
# Transpose inverses the rows and columns of a dataframe
# It's a computationally expensive operation, so it should be used only if no other option is available

pokemon.head(3).select(
    'name',
    'type1',
    'type2'
).transpose()

column_0,column_1,column_2
str,str,str
"""Bulbasaur""","""Ivysaur""","""Venusaur"""
"""grass""","""grass""","""grass"""
"""poison""","""poison""","""poison"""


#### Merge DataFrames: hstack, vstack, extend, concat, join, join_asof

In [53]:
# Horizontally stack 2 dataframes
# We have a new dataframe with the pokemon color

pokemon_color = pl.DataFrame({
    'color': ['green', 'green', 'green', 'red', 'red', 'red']
})

pokemon.head(6).hstack(pokemon_color)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""green"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""green"""
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,0,"""red"""
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,0,"""red"""
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0,"""red"""


In [54]:
# Vertically stack 2 dataframes
# We have a new dataframe with a new pokemon

new_pokemon = pl.DataFrame({
    'name': ['Polarizard'],
    'type1': ['ice'],
    'type2': ['flying'],
    'abilities': [['snow warning', 'blaze']]
})

pokemon.select('name','type1','type2','abilities').vstack(new_pokemon).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [55]:
# Extend is similar to vstack, but it copies the data instead of referencing it

pokemon.select('name','type1','type2','abilities').extend(new_pokemon).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [56]:
# concat can reproduce both hstack and vstack when rechunk is set to False
# when rechunk is set to True, all data is copied to a contiguous memory space which allows it to be faster

pl.concat(
    [pokemon.select('name','type1','type2','abilities'), new_pokemon],
    rechunk = True,
    how= 'vertical'
).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [57]:
# concat diagonal stacks dataframes diagonally
# this means that columns missing from one dataframe are filled with nulls

new_pokemon_color = pl.DataFrame({
    'name': ['Polarizard'],
    'abilities': [['snow warning', 'blaze']],
    'color': ['white']
})

pl.concat([
    pokemon.select('name','type1','type2','abilities'),
    new_pokemon_color
    ],
    how = 'diagonal'
).tail(5)

name,type1,type2,abilities,color
str,str,str,list[str],str
"""Kartana""","""grass""","""steel""","[""Beast Boost""]",
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]",
"""Necrozma""","""psychic""",,"[""Prism Armor""]",
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]",
"""Polarizard""",,,"[""snow warning"", ""blaze""]","""white"""


In [58]:
# concat has another strategy, which is align.
# align matches the rows of the dataframes on the columns they have in common (here 'name')
# and fills the values missing from either dataframe with nulls
# In the output below the rows are sorted by the common column

pokemon_new_order = pl.DataFrame({
    'name' : ['Bulbasaur', 'Charmander', 'Squirtle'],
    'color': ['green', 'red', 'blue']
})

pl.concat([
    pokemon.select('name','type1',).head(9),
    pokemon_new_order
], how='align'
)

name,type1,color
str,str,str
"""Blastoise""","""water""",
"""Bulbasaur""","""grass""","""green"""
"""Charizard""","""fire""",
"""Charmander""","""fire""","""red"""
"""Charmeleon""","""fire""",
"""Ivysaur""","""grass""",
"""Squirtle""","""water""","""blue"""
"""Venusaur""","""grass""",
"""Wartortle""","""water""",


In [59]:
# join reproduces SQL joins, such as inner, left, outer, semi, anti, cross
# inner join keeps only the rows that are present in both dataframes

pokemon_new = pl.DataFrame({
    'name' : ['Bulbasaur', 'Polarizard'],
    'color': ['green', 'white']
})

pokemon.head(3).join(pokemon_new, on = 'name', how = 'inner')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""


In [60]:
# left join keeps all rows from the left dataframe and fills the missing values with nulls
# to keep the rows from the right dataframe, we can inverse the order of the dataframes

pokemon.head(3).join(pokemon_new, on = 'name', how = 'left')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,


In [61]:
# outer join keeps all rows from both dataframes and fills the missing values with nulls

pokemon.head(3).join(pokemon_new, on = 'name', how = 'outer')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45.0,49.0,49.0,45.0,0.0,"""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60.0,62.0,63.0,60.0,0.0,
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80.0,100.0,123.0,80.0,0.0,
"""Polarizard""",,,,,,,,,"""white"""


In [62]:
# semi keeps the rows from the left dataframe that are present in the right dataframe
# it does not add any columns from the right dataframe (differently from inner)

pokemon.head(3).join(pokemon_new, on = 'name', how = 'semi')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0


In [63]:
# anti keeps the rows from the left dataframe that are NOT present in the right dataframe
# it is the opposite of semi: rows that only exist in the right dataframe are not returned
# and no columns from the right dataframe are added

pokemon.head(3).join(pokemon_new, on = 'name', how = 'anti')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0


In [64]:
# cross combines all rows of the first dataframe with all rows of the second dataframe

pokemon_trainers = pl.DataFrame({
    'trainer': ['trainer1', 'trainer2']
})

pokemon.head(3).join(pokemon_trainers, how = 'cross')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,trainer
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""trainer1"""
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""trainer2"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""trainer1"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""trainer2"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""trainer1"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""trainer2"""


#### Custom functions: apply, map

In [65]:
# Both apply and map will be slower than using the native polars functions. 
# It's recommended to avoid apply and map whenever possible
# a common use case for apply and map is passing data to a third-party library
# map applies a function to a full column (which is a Polars series)
# apply does the same, but one row at a time
# map is faster than apply

In [66]:
# increase the attack by 10% for pokemons with attack < 50
# in the first 2 functions, we receive a full column (a Polars Series) and process it using a Python list comprehension
# the first function returns a Python list, the second a Polars series
# the last function receives a single value and returns a single value

def simulated_attack_list(attack_column):
    """Return the simulated attacks as a plain Python list of floats.

    Attacks below 50 are increased by 10%; all other attacks keep their value.

    attack_column: an iterable of numeric attack values (a Polars Series when
        the function is passed to map).
    """
    # Convert the untouched values to float as well, so the list never mixes
    # ints and floats and its element type does not depend on which value
    # happens to come first in the column.
    return [attack * 1.1 if attack < 50 else float(attack) for attack in attack_column]

def simulated_attack_series(attack_column):
    """Return the simulated attacks as a Polars Series of floats.

    Attacks below 50 are increased by 10%; all other attacks keep their value.

    attack_column: an iterable of numeric attack values (a Polars Series when
        the function is passed to map).
    """
    # Every element is converted to float and the dtype is set explicitly, so
    # building the Series does not rely on dtype inference from a list that
    # mixes ints and floats (the original only worked reliably when the first
    # value was already a float).
    simulated = [attack * 1.1 if attack < 50 else float(attack) for attack in attack_column]
    return pl.Series(simulated, dtype=pl.Float64)

def simulated_attack_single_value(attack_number):
    """Return the simulated attack for a single value, always as a float.

    Attacks below 50 are increased by 10%; all other attacks keep their value.

    attack_number: one numeric attack value (apply calls this once per row).
    """
    if attack_number < 50:
        return attack_number * 1.1
    # Return a float here too: without return_dtype, Polars infers the output
    # dtype of apply from the first non-null value returned, so mixing int and
    # float results would make the dtype depend on the row order.
    return float(attack_number)

In [67]:
# map passes the whole attack column to simulated_attack_list
# the function returns a plain Python list, so (as the output below shows)
# the new column has dtype list[f64] and every row holds the whole list
# instead of one value per row
pokemon.with_columns(
    simulated_attack = pl.col('attack').map(simulated_attack_list)
).head(3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,simulated_attack
str,str,str,list[str],i64,i64,i64,i64,i64,list[f64]
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"[53.9, 62.0, … 95.0]"
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"[53.9, 62.0, … 95.0]"
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"[53.9, 62.0, … 95.0]"


In [68]:
# map passes the whole attack column to simulated_attack_series
# the function returns a Polars series, so the new column is a regular f64 column
# with one value per row
pokemon.with_columns(
    simulated_attack = pl.col('attack').map(simulated_attack_series)
).head(3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,simulated_attack
str,str,str,list[str],i64,i64,i64,i64,i64,f64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,53.9
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,62.0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,100.0


In [69]:
# apply calls simulated_attack_single_value once per value of the attack column
# the resulting f64 column matches the one produced by map with simulated_attack_series
pokemon.with_columns(
    simulated_attack = pl.col('attack').apply(simulated_attack_single_value)
).head(3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,simulated_attack
str,str,str,list[str],i64,i64,i64,i64,i64,f64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,53.9
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,62.0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,100.0


In [70]:
# let's check their speed
# we create a bigger dataframe by repeating the original one 100 times
# no need to test the first function, as the result is not what we want

pokemon_100 = pl.concat([pokemon] * 100, rechunk = True)

In [71]:
%%timeit
pokemon_100.with_columns(
    simulated_attack = pl.col('attack').map(simulated_attack_series)
)

15.4 ms ± 4.85 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [72]:
%%timeit
pokemon_100.with_columns(
    simulated_attack = pl.col('attack').apply(simulated_attack_single_value)
)

21.8 ms ± 3.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [73]:
%%timeit
pokemon_100.with_columns(
    simulated_attack = pl.when(pl.col('attack') < 50).then(pl.col('attack') * 1.1).otherwise(pl.col('attack'))
)

2.27 ms ± 826 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [74]:
# apply and map are considerably slower than the native Polars functions
# in this case by a factor of roughly 7 (map) to 10 (apply)
# map is faster than apply because we operate on the full column instead of one row at a time

In [75]:
# inside groupby, we can use apply to apply a function to each group

def simulated_attack_list_to_scalar(attack_column):
    """Return the sum of the simulated attacks of a group as a float.

    Attacks below 50 are increased by 10% before summing; all other attacks
    are summed unchanged.

    attack_column: an iterable of numeric attack values (the attack values of
        one group when used with apply inside groupby).
    """
    # Start the sum at 0.0 so the result is a float even when no attack is
    # below 50 (or the group is empty), matching the Float64 return_dtype
    # declared by the caller. A generator avoids building a throwaway list.
    return sum((attack * 1.1 if attack < 50 else attack for attack in attack_column), 0.0)

pokemon.head(6).groupby('type1').agg(
    pl.col('attack').apply(simulated_attack_list_to_scalar, return_dtype = pl.Float64)
)

type1,attack
str,f64
"""grass""",215.9
"""fire""",220.0


In [76]:
# if we want to apply a function to multiple columns, we can use pl.struct
# to create a struct, then use apply with it

pokemon.head(6).with_columns(
    attack_plus_defense = pl.struct('attack','defense').apply(lambda columns: columns['attack'] + columns['defense'])
)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,attack_plus_defense
str,str,str,list[str],i64,i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,98
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,125
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,223
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,0,95
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,0,122
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0,182


In [77]:
# we can also use lru_cache to cache the results of a function
# this is useful when we have a function that is called multiple times with the same arguments
# in this case, we have a function that returns a list of the characters of the type, repeated 10 times 

from functools import lru_cache

@lru_cache(maxsize=2048)
def modify_type_cached(name):
    """Return the characters of name as a list, repeated 10 times (memoized).

    lru_cache keeps the result for up to 2048 distinct arguments, so repeated
    calls with the same name return the stored list instead of rebuilding it.
    """
    characters = [*name]
    return characters * 10

def modify_type(name):
    """Return the characters of name as a list, repeated 10 times.

    Uncached counterpart of modify_type_cached: the list is rebuilt on every call.
    """
    characters = [*name]
    return characters * 10

In [78]:
%%timeit
pokemon_100.select(
    pl.col('type1').apply(modify_type)
)

1.64 s ± 183 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [79]:
%%timeit
pokemon_100.select(
    pl.col('type1').apply(modify_type_cached)
)

1.49 s ± 137 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
