### Advanced Data Transformation with Polars

The examples below use `.head()` to reduce the output to a few rows and take up less space.
If you want the full output, remove `.head()` from the code.

This notebook is divided into sections. If your code editor supports it, you can use the **Outline** functionality to jump directly to the section you are interested in.

For more details on Polars functions, check out the Polars API reference: https://pola-rs.github.io/polars/py-polars/html/reference/index.html

In [196]:
import numpy as np
import polars as pl

# Configure the number of characters to show for each string column
# Here string values are displayed with at most 30 characters
pl.Config.set_fmt_str_lengths(30)

polars.config.Config

In [197]:
# Load the simplified Pokemon dataset from its Parquet file into a Polars DataFrame
pokemon = pl.read_parquet(source='../datasets/pokemon_simplified.parquet')

In [198]:
# Preview the first 3 rows to check the columns and their types
pokemon.head(n=3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0


#### Conditions

In [199]:
# Identify the top 25% of Pokemon based on the attack stats
# quantile(0.75) gives the 75th percentile of attack; rows strictly above it are labelled
# Without an otherwise() branch, the rows that do not match are left as null
# The label is wrapped in pl.lit() so it is always read as a string value:
# recent Polars versions parse a bare string passed to then() as a column name

pokemon.with_columns(
    attack_category = pl.when(pl.col('attack') > pl.col('attack').quantile(0.75)).then(pl.lit('Top 25%'))
).head(8)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,attack_category
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,0,
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,0,
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0,"""Top 25%"""
"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44,48,65,43,0,
"""Wartortle""","""water""",,"[""Torrent"", ""Rain Dish""]",59,63,80,58,0,


In [200]:
# Identify the top 25% of Pokemon based on the attack stats
# Use otherwise to assign the remaining Pokemon to the 'Other' category
# Labels are wrapped in pl.lit() so they are always read as string values:
# recent Polars versions parse bare strings in then()/otherwise() as column names

pokemon.with_columns(
    attack_category = pl.when(pl.col('attack') > pl.col('attack').quantile(0.75))
        .then(pl.lit('Top 25%'))
        .otherwise(pl.lit('Other'))
).head(8)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,attack_category
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""Other"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""Other"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""Other"""
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,0,"""Other"""
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,0,"""Other"""
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0,"""Top 25%"""
"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44,48,65,43,0,"""Other"""
"""Wartortle""","""water""",,"[""Torrent"", ""Rain Dish""]",59,63,80,58,0,"""Other"""


In [201]:
# Identify the top 25% of Pokemon based on the attack stats AND the bottom 25%
# Chain several when/then branches; the first matching branch wins, otherwise() covers the rest
# Labels are wrapped in pl.lit() so they are always read as string values:
# recent Polars versions parse bare strings in then()/otherwise() as column names

pokemon.with_columns(
    attack_category = 
        pl.when(pl.col('attack') > pl.col('attack').quantile(0.75)).then(pl.lit('Top 25%'))
        .when(pl.col('attack') < pl.col('attack').quantile(0.25)).then(pl.lit('Bottom 25%'))
        .otherwise(pl.lit('Other'))
).head(8)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,attack_category
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""Bottom 25%"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""Other"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""Other"""
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,0,"""Bottom 25%"""
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,0,"""Other"""
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0,"""Top 25%"""
"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44,48,65,43,0,"""Bottom 25%"""
"""Wartortle""","""water""",,"[""Torrent"", ""Rain Dish""]",59,63,80,58,0,"""Other"""


#### Convert columns to dummies

In [202]:
# One-hot encode type1: each distinct value among the selected rows becomes its own 0/1 column

(
    pokemon
    .head(8)
    .to_dummies(columns=['type1'])
)

name,type1_fire,type1_grass,type1_water,type2,abilities,hp,attack,defense,speed,is_legendary
str,u8,u8,u8,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""",0,1,0,"""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0
"""Ivysaur""",0,1,0,"""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""",0,1,0,"""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0
"""Charmander""",1,0,0,,"[""Blaze"", ""Solar Power""]",39,52,43,65,0
"""Charmeleon""",1,0,0,,"[""Blaze"", ""Solar Power""]",58,64,58,80,0
"""Charizard""",1,0,0,"""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0
"""Squirtle""",0,0,1,,"[""Torrent"", ""Rain Dish""]",44,48,65,43,0
"""Wartortle""",0,0,1,,"[""Torrent"", ""Rain Dish""]",59,63,80,58,0


#### Grouping and aggregating data

In [203]:
# Median of each stat per primary type
# Group by runs in parallel, so the order of the resulting rows can change

(
    pokemon
    .groupby('type1')
    .agg(pl.col(['hp', 'attack', 'defense', 'speed']).median())
    .head(3)
)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""fairy""",76.0,58.5,67.5,47.0
"""grass""",65.0,68.0,66.0,55.0
"""poison""",65.0,68.5,66.0,62.5


In [204]:
# Median of each stat per primary type, keeping the groups in order of first appearance
# Maintaining the order is possible but it is slower

(
    pokemon
    .groupby('type1', maintain_order=True)
    .agg(pl.col(['hp', 'attack', 'defense', 'speed']).median())
    .head(3)
)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""grass""",65.0,68.0,66.0,55.0
"""fire""",65.0,79.5,61.5,72.5
"""water""",68.0,70.0,70.0,64.0


In [205]:
# Grouping on several columns works the same way: pass the list of key columns
# The medians are rounded to 0 decimals

(
    pokemon
    .groupby(['type1', 'is_legendary'])
    .agg(pl.col(['hp', 'attack', 'defense', 'speed']).median().round(0))
    .head(3)
)

type1,is_legendary,hp,attack,defense,speed
str,i64,f64,f64,f64,f64
"""grass""",0,63.0,66.0,65.0,54.0
"""fighting""",0,70.0,100.0,66.0,60.0
"""ghost""",0,59.0,66.0,78.0,50.0


In [206]:
# Many aggregation functions are available

pokemon.groupby('type1').agg([
    pl.col('attack').mean().alias('avg_attack'),
    pl.col('attack').median().alias('median_attack'),
    pl.col('attack').min().alias('min_attack'),
    pl.col('attack').max().alias('max_attack'),
    pl.col('attack').quantile(0.25).alias('quantile_25'),
    pl.col('attack').quantile(0.75).alias('quantile_75'),
    pl.count().alias('count'), # Number of pokemon in each group
    pl.first('name').alias('first_pokemon'), # First pokemon in each group
]).head(3)

type1,avg_attack,median_attack,min_attack,max_attack,quantile_25,quantile_75,count,first_pokemon
str,f64,f64,i64,i64,f64,f64,u32,str
"""fire""",81.5,79.5,30,160,63.0,100.0,52,"""Charmander"""
"""dark""",87.793103,88.0,50,150,65.0,101.0,29,"""Umbreon"""
"""bug""",70.125,65.0,10,185,45.0,90.0,72,"""Caterpie"""


In [207]:
# NumPy universal functions (ufuncs) can be applied to Polars expressions in aggregations
# numpy is imported as np in the import cell at the top of the notebook

pokemon.groupby('type1').agg(
    sqrt_attack_mean = np.sqrt(pl.col('attack')).mean() # mean of the square roots of attack, per type
).head(3)

type1,sqrt_attack_mean
str,f64
"""ground""",9.608588
"""ghost""",8.335305
"""fire""",8.905868


In [208]:
# If no aggregation function is specified, the values of each group are collected into a list

(
    pokemon
    .groupby('type1')
    .agg(pl.col('name'))
    .head(3)
)

type1,name
str,list[str]
"""ice""","[""Jynx"", ""Articuno"", … ""Avalugg""]"
"""flying""","[""Tornadus"", ""Noibat"", ""Noivern""]"
"""dragon""","[""Dratini"", ""Dragonair"", … ""Kommo-o""]"


In [209]:
# Join the names of the pokemon of each type into a single comma-separated string

(
    pokemon
    .groupby('type1')
    .agg(pl.col('name').str.concat(', '))
    .head(3)
)

type1,name
str,str
"""fire""","""Charmander, Charmeleon, Chari…"
"""flying""","""Tornadus, Noibat, Noivern"""
"""ground""","""Sandshrew, Sandslash, Diglett…"


In [210]:
# Conditional aggregation: filter the values inside each group before aggregating
# Average attack, by type, of the Pokemon with a speed above 100

pokemon.groupby('type1').agg(
    pl.col('attack')
    .filter(pl.col('speed') > 100)
    .mean()
    .alias('avg_attack_when_speed_above_100')
).head(3)


type1,avg_attack_when_speed_above_100
str,f64
"""electric""",79.25
"""water""",105.833333
"""fighting""",120.666667


In [211]:
# Expressions can be built by a helper function and reused in aggregations

def avg_attack_when_speed_above_threshold(speed):
    """Build an expression for the mean attack of the rows whose speed is above `speed`.

    The resulting column is named 'avg_attack_when_speed_above_<speed>'.
    """
    is_above_threshold = pl.col('speed') > speed
    column_name = f'avg_attack_when_speed_above_{speed}'
    return pl.col('attack').filter(is_above_threshold).mean().alias(column_name)

# One aggregated column per threshold: 20, 40, 60 and 80
pokemon.groupby('type1').agg(
    [avg_attack_when_speed_above_threshold(speed) for speed in range(20, 100, 20)]
).head(3)

type1,avg_attack_when_speed_above_20,avg_attack_when_speed_above_40,avg_attack_when_speed_above_60,avg_attack_when_speed_above_80
str,f64,f64,f64,f64
"""water""",73.756757,75.282609,78.372881,85.448276
"""ground""",94.645161,97.952381,98.733333,111.333333
"""fighting""",99.178571,105.086957,106.076923,116.0


In [212]:
# Top 3 pokemon by attack for each type
# The frame is sorted by attack (highest first) before grouping,
# so the first 3 names of each group are the 3 strongest attackers
(
    pokemon
    .sort('attack', descending=True)
    .groupby('type1')
    .agg(pl.col('name').head(3).alias('top_3_pokemon_by_attack')) # keep 3 pokemons per type
    .head(5) # keep 5 types
)

type1,top_3_pokemon_by_attack
str,list[str]
"""electric""","[""Electivire"", ""Luxray"", ""Eelektross""]"
"""rock""","[""Rampardos"", ""Tyranitar"", ""Diancie""]"
"""ice""","[""Mamoswine"", ""Beartic"", ""Glalie""]"
"""dragon""","[""Rayquaza"", ""Garchomp"", ""Zekrom""]"
"""water""","[""Gyarados"", ""Swampert"", ""Kyogre""]"


In [213]:
# Show the top 3 pokemon by attack and the top 3 pokemon by defense in each group
# sort_by is applied inside each group, so each column can use its own ordering
(
    pokemon
    .groupby('type1')
    .agg([
        pl.col('name').sort_by('attack', descending=True).head(3).alias('top_3_pokemon_by_attack'),
        pl.col('name').sort_by('defense', descending=True).head(3).alias('top_3_pokemon_by_defense'),
    ])
    .head(5)
)

type1,top_3_pokemon_by_attack,top_3_pokemon_by_defense
str,list[str],list[str]
"""psychic""","[""Gallade"", ""Hoopa"", ""Mewtwo""]","[""Cosmoem"", ""Lugia"", ""Uxie""]"
"""ground""","[""Groudon"", ""Landorus"", ""Rhyperior""]","[""Groudon"", ""Rhyperior"", ""Gliscor""]"
"""fighting""","[""Lucario"", ""Conkeldurr"", ""Crabominable""]","[""Hitmontop"", ""Conkeldurr"", ""Passimian""]"
"""grass""","[""Kartana"", ""Abomasnow"", ""Breloom""]","[""Ferrothorn"", ""Kartana"", ""Leafeon""]"
"""fairy""","[""Xerneas"", ""Granbull"", ""Snubbull""]","[""Togekiss"", ""Xerneas"", ""Comfey""]"


In [214]:
# Show the top 3 pokemon by attack and the top 3 pokemon by defense without grouping by type
# implode() gathers the 3 names of each column into a single list, so the result has one row

pokemon.select([
    pl.col('name').sort_by('attack', descending=True).head(3).implode().alias('top_3_pokemon_by_attack'),
    pl.col('name').sort_by('defense', descending=True).head(3).implode().alias('top_3_pokemon_by_defense'),
])

top_3_pokemon_by_attack,top_3_pokemon_by_defense
list[str],list[str]
"[""Heracross"", ""Kartana"", ""Groudon""]","[""Steelix"", ""Shuckle"", ""Aggron""]"


In [215]:
# A groupby object can be iterated: each step yields the group key and the rows of that group
# Any action can then be applied to each group; here the key and the first 2 rows are printed

for name, data in pokemon.groupby("is_legendary"):
    print(name, data.head(2), sep='\n')

1
shape: (2, 9)
┌──────────┬──────────┬────────┬─────────────────────┬───┬────────┬─────────┬───────┬──────────────┐
│ name     ┆ type1    ┆ type2  ┆ abilities           ┆ … ┆ attack ┆ defense ┆ speed ┆ is_legendary │
│ ---      ┆ ---      ┆ ---    ┆ ---                 ┆   ┆ ---    ┆ ---     ┆ ---   ┆ ---          │
│ str      ┆ str      ┆ str    ┆ list[str]           ┆   ┆ i64    ┆ i64     ┆ i64   ┆ i64          │
╞══════════╪══════════╪════════╪═════════════════════╪═══╪════════╪═════════╪═══════╪══════════════╡
│ Articuno ┆ ice      ┆ flying ┆ ["Pressure", "Snow  ┆ … ┆ 85     ┆ 100     ┆ 85    ┆ 1            │
│          ┆          ┆        ┆ Cloak"]             ┆   ┆        ┆         ┆       ┆              │
│ Zapdos   ┆ electric ┆ flying ┆ ["Pressure",        ┆ … ┆ 90     ┆ 85      ┆ 100   ┆ 1            │
│          ┆          ┆        ┆ "Static"]           ┆   ┆        ┆         ┆       ┆              │
└──────────┴──────────┴────────┴─────────────────────┴───┴────────┴────────

#### Window functions

In [216]:
# For each pokemon, show its attack next to the average attack of the pokemons sharing its type 1
# over() computes the aggregate per group and keeps one row per pokemon

pokemon.select([
    'name',
    'type1',
    'attack',
    pl.col('attack').mean().over('type1').alias('avg_attack_for_same_type1'),
]).head(5)

name,type1,attack,avg_attack_for_same_type1
str,str,i64,f64
"""Bulbasaur""","""grass""",49,73.769231
"""Ivysaur""","""grass""",62,73.769231
"""Venusaur""","""grass""",100,73.769231
"""Charmander""","""fire""",52,81.5
"""Charmeleon""","""fire""",64,81.5


In [217]:
# For each pokemon, show its attack next to the average attack of the pokemons
# sharing its type 1, and of the pokemons sharing its type 2

pokemon.select([
    'name',
    'type1',
    'type2',
    'attack',
    pl.col('attack').mean().over('type1').alias('avg_attack_for_same_type1'),
    pl.col('attack').mean().over('type2').alias('avg_attack_for_same_type2'),
]).head(5)

name,type1,type2,attack,avg_attack_for_same_type1,avg_attack_for_same_type2
str,str,str,i64,f64,f64
"""Bulbasaur""","""grass""","""poison""",49,73.769231,67.617647
"""Ivysaur""","""grass""","""poison""",62,73.769231,67.617647
"""Venusaur""","""grass""","""poison""",100,73.769231,67.617647
"""Charmander""","""fire""",,52,81.5,74.231771
"""Charmeleon""","""fire""",,64,81.5,74.231771


In [218]:
# For each pokemon, show the top 3 pokemon by attack of the same type
# mapping_strategy='join' attaches the group's 3 names as a list to every row of the group

pokemon.select([
    'name',
    'type1',
    'attack',
    pl.col('name')
    .sort_by('attack', descending=True)
    .head(3)
    .over('type1', mapping_strategy='join')
    .alias('top_3_attack_pokemon_same_type'),
]).head(5)

name,type1,attack,top_3_attack_pokemon_same_type
str,str,i64,list[str]
"""Bulbasaur""","""grass""",49,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Ivysaur""","""grass""",62,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Venusaur""","""grass""",100,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Charmander""","""fire""",52,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"
"""Charmeleon""","""fire""",64,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"


In [219]:
# Show the top 3 pokemon by attack and by defense for each type, one pokemon per row
# mapping_strategy='explode' puts each of the 3 values of a group on its own row
# instead of returning one list per group

pokemon.select([
    pl.col('type1').head(3).over('type1', mapping_strategy='explode'),
    pl.col('name')
    .sort_by('attack', descending=True)
    .head(3)
    .over('type1', mapping_strategy='explode')
    .alias('top_3_attack'),
    pl.col('name')
    .sort_by('defense', descending=True)
    .head(3)
    .over('type1', mapping_strategy='explode')
    .alias('top_3_defense'),
]).head(6)

type1,top_3_attack,top_3_defense
str,str,str
"""grass""","""Kartana""","""Ferrothorn"""
"""grass""","""Abomasnow""","""Kartana"""
"""grass""","""Breloom""","""Leafeon"""
"""fire""","""Blaziken""","""Torkoal"""
"""fire""","""Flareon""","""Turtonator"""
"""fire""","""Ho-Oh""","""Magcargo"""


In [220]:
# Rank each pokemon within its type by attack and by defense (rank 1 = highest value)
# method='ordinal' gives every pokemon a distinct rank, even when values are tied

pokemon.select([
    'name',
    'type1',
    'attack',
    'defense',
    pl.col('attack').rank(method='ordinal', descending=True).over('type1').alias('attack_rank'),
    pl.col('defense').rank(method='ordinal', descending=True).over('type1').alias('defense_rank'),
]).head(6)

name,type1,attack,defense,attack_rank,defense_rank
str,str,i64,i64,u32,u32
"""Bulbasaur""","""grass""",49,49,61,62
"""Ivysaur""","""grass""",62,63,45,45
"""Venusaur""","""grass""",100,123,16,5
"""Charmander""","""fire""",52,43,45,44
"""Charmeleon""","""fire""",64,58,37,29
"""Charizard""","""fire""",104,78,10,14


#### Range function

In [221]:
# Number the pokemon of each type: 1, 2, 3, ... restarting at 1 for every type
# pl.count() is the group size, so arange(1, count + 1) covers every row of the group

pokemon.select([
    'name',
    'type1',
    'attack',
    'defense',
    pl.arange(1, pl.count() + 1).over('type1').alias('type_index'),
]).head(6)

name,type1,attack,defense,type_index
str,str,i64,i64,i64
"""Bulbasaur""","""grass""",49,49,1
"""Ivysaur""","""grass""",62,63,2
"""Venusaur""","""grass""",100,123,3
"""Charmander""","""fire""",52,43,1
"""Charmeleon""","""fire""",64,58,2
"""Charizard""","""fire""",104,78,3


In [222]:
# Take a random sample of 3 pokemons for each group
# The 1..n index of each group is shuffled and only the rows that receive an index <= 3 are kept
# A fixed seed makes the sample reproducible when the notebook is re-run

pokemon.filter(
    pl.arange(1, pl.count()+1).shuffle(seed=42).over("type1") <= 3
).sort(by='type1').head(6)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Kricketot""","""bug""",,"[""Shed Skin"", ""Run Away""]",37,25,41,25,0
"""Swadloon""","""bug""","""grass""","[""Leaf Guard"", ""Chlorophyll"", ""Overcoat""]",55,63,90,42,0
"""Vivillon""","""bug""","""flying""","[""Shield Dust"", ""Compoundeyes"", ""Friend Guard""]",80,52,50,89,0
"""Zorua""","""dark""",,"[""Illusion""]",40,65,40,65,0
"""Pawniard""","""dark""","""steel""","[""Defiant"", ""Inner Focus"", ""Pressure""]",45,85,70,60,0
"""Inkay""","""dark""","""psychic""","[""Contrary"", ""Suction Cups"", ""Infiltrator""]",53,54,53,45,0


In [223]:
# Take a random sample of 5% of pokemons for each group
# The 1..n index of each group is shuffled and only the rows whose index is <= 5% of the group size are kept
# Groups with fewer than 20 pokemons contribute no rows, because the threshold is then below 1
# A fixed seed makes the sample reproducible when the notebook is re-run

pokemon.filter(
    pl.arange(1, pl.count() + 1).shuffle(seed=42).over("type1") <= pl.count().over("type1") * 0.05
).sort(by="type1").head(6)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Paras""","""bug""","""grass""","[""Effect Spore"", ""Dry Skin"", ""Damp""]",35,70,55,25,0
"""Vespiquen""","""bug""","""flying""","[""Pressure"", ""Unnerve""]",70,80,102,40,0
"""Whirlipede""","""bug""","""poison""","[""Poison Point"", ""Swarm"", ""Speed Boost""]",40,55,99,47,0
"""Murkrow""","""dark""","""flying""","[""Insomnia"", ""Super Luck"", ""Prankster""]",60,85,42,91,0
"""Dragonair""","""dragon""",,"[""Shed Skin"", ""Marvel Scale""]",61,84,65,70,0
"""Zapdos""","""electric""","""flying""","[""Pressure"", ""Static""]",90,90,85,100,1


In [224]:
# Keep only the 5th pokemon of each type
# Rows are numbered from 1 within their type, then filtered on that number

(
    pokemon
    .filter(pl.arange(1, pl.count() + 1).over("type1") == 5)
    .head(5)
)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Kakuna""","""bug""","""poison""","[""Shed Skin""]",45,25,50,35,0
"""Raticate""","""normal""","""dark""","[""Run Away"", ""Guts"", … ""Thick Fat""]",75,71,70,77,0
"""Nidoqueen""","""poison""","""ground""","[""Poison Point"", ""Rivalry"", ""Sheer Force""]",90,92,87,76,0
"""Ninetales""","""fire""","""ice""","[""Flash Fire"", ""Drought"", … ""Snow Warning""]",73,67,75,109,0
"""Gloom""","""grass""","""poison""","[""Chlorophyll"", ""Stench""]",60,65,70,40,0


#### Binning functions: Cut and Qcut

In [225]:
# Split the attack values into 3 bins using the break points 50 and 100
# <= 50 - weak, <= 100 - medium, > 100 - strong

(
    pokemon['attack']
    .cut([50, 100], labels=['weak', 'medium', 'strong'], maintain_order=True)
    .head(5)
)

attack,break_point,category
f64,f64,cat
49.0,50.0,"""weak"""
62.0,100.0,"""medium"""
100.0,100.0,"""medium"""
52.0,100.0,"""medium"""
64.0,100.0,"""medium"""


In [226]:
# Assign each pokemon a category based on its attack stat:
# <= 50 - weak, <= 100 - medium, > 100 - strong
# cut() is called on the attack Series; its 'category' column is added to the dataframe
# with_columns keeps every existing column, so no column names need to be listed

pokemon.with_columns(
    pokemon.get_column('attack').cut([50, 100], labels=['weak', 'medium', 'strong'], maintain_order=True).get_column('category')
).head(6)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,category
str,str,str,list[str],i64,i64,i64,i64,i64,cat
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""weak"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""medium"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""medium"""
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,0,"""medium"""
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,0,"""medium"""
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0,"""strong"""


In [227]:
# Assign each pokemon a category based on its attack stat using percentiles:
# bottom 40% - weak, top 20% - strong, rest - medium
# qcut() is called on the attack Series; its 'category' column is added to the dataframe
# with_columns keeps every existing column, so no column names need to be listed

pokemon.with_columns(
    pokemon.get_column('attack').qcut(quantiles=[0.4,0.8], labels=['weak', 'medium', 'strong'], maintain_order=True).get_column('category')
).head(6)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,category
str,str,str,list[str],i64,i64,i64,i64,i64,cat
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""weak"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""weak"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""medium"""
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,0,"""weak"""
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,0,"""weak"""
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0,"""medium"""


#### Repeat

In [228]:
# This example uses its own small dataframe: 3 fruits and their order quantities
# Each fruit name is repeated as many times as its order quantity,
# then the resulting lists are flattened to one fruit per row

df = pl.DataFrame({
    "Fruit": ["Apple", "Banana", "Cherry"],
    "Order_Quantity": [1, 3, 2],
})

df.select(pl.col('Fruit').repeat_by('Order_Quantity').flatten())

Fruit
str
"""Apple"""
"""Banana"""
"""Banana"""
"""Banana"""
"""Cherry"""
"""Cherry"""


#### Explode, Implode and Flatten

In [229]:
# Explode the list of pokemon abilities: a column of lists becomes a column of single values
# Point of attention: explode increases the number of rows,
# and all columns of a Polars dataframe must have the same number of rows

(
    pokemon
    .head(3)
    .select(pl.col('abilities').explode())
)

abilities
str
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""


In [230]:
# Implode is the opposite of explode: it gathers the values of a column into a single list

(
    pokemon
    .head(3)
    .select(pl.all().implode())
)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
list[str],list[str],list[str],list[list[str]],list[i64],list[i64],list[i64],list[i64],list[i64]
"[""Bulbasaur"", ""Ivysaur"", ""Venusaur""]","[""grass"", ""grass"", ""grass""]","[""poison"", ""poison"", ""poison""]","[[""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""]]","[45, 60, 80]","[49, 62, 100]","[49, 63, 123]","[45, 60, 80]","[0, 0, 0]"


In [231]:
# Flatten removes one level of nesting from a column of lists
# Applied right after implode, it gives back the original rows

(
    pokemon
    .head(3)
    .select(pl.all().implode().flatten())
)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0


In [232]:
# On a list column with a single level of nesting, flatten gives the same result as explode

(
    pokemon
    .head(3)
    .select(pl.col('abilities').flatten())
)

abilities
str
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""


#### Conversion between structs and lists

In [233]:
# Combine type 1 and type 2 into a single struct column named 'types'

pokemon.select([
    'name',
    pl.struct(['type1', 'type2']).alias('types'),
]).head(5)

name,types
str,struct[2]
"""Bulbasaur""","{""grass"",""poison""}"
"""Ivysaur""","{""grass"",""poison""}"
"""Venusaur""","{""grass"",""poison""}"
"""Charmander""","{""fire"",null}"
"""Charmeleon""","{""fire"",null}"


In [234]:
# The struct column retains the type information of each initial column:
# Structs are views on the initial columns so the data is not copied
# The schema shows that the names of the initial columns are kept as field names

pokemon.select([
    'name',
    pl.struct(['type1', 'type2']).alias('types'),
]).schema

{'name': Utf8, 'types': Struct([Field('type1', Utf8), Field('type2', Utf8)])}

In [235]:
# Structs are important because they allow calculations on combined columns
# Here we count how many times each combination of type 1 and type 2 occurs

pokemon.select(
    pl.struct(['type1', 'type2']).value_counts().alias('types_count')
).head(5)

types_count
struct[2]
"{{""grass"",""poison""},14}"
"{{""fire"",null},27}"
"{{""fire"",""flying""},6}"
"{{""water"",null},61}"
"{{""bug"",null},18}"


In [236]:
# Combine type 1 and type 2 into a single list column named 'types'

pokemon.select([
    'name',
    pl.concat_list(['type1', 'type2']).alias('types'),
]).head(5)

name,types
str,list[str]
"""Bulbasaur""","[""grass"", ""poison""]"
"""Ivysaur""","[""grass"", ""poison""]"
"""Venusaur""","[""grass"", ""poison""]"
"""Charmander""","[""fire"", null]"
"""Charmeleon""","[""fire"", null]"


In [237]:
# List columns do not retain the names of the initial columns
# Unlike Struct columns, where each row has the same number of elements,
# each row of a List column can have a different number of elements

pokemon.select([
    'name',
    pl.concat_list(['type1', 'type2']).alias('types'),
]).schema

{'name': Utf8, 'types': List(Utf8)}

In [238]:
# List columns can be converted to a struct
# If the lists do not all have the same number of elements, the missing values are filled with null

(
    pokemon
    .select(pl.col('abilities').list.to_struct(n_field_strategy='max_width'))
    .head(5)
)

abilities
struct[6]
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"


In [239]:
# By default, the fields of the resulting struct are named field_0, field_1, etc.

(
    pokemon
    .select(pl.col('abilities').list.to_struct(n_field_strategy='max_width'))
    .schema
)

{'abilities': Struct([Field('field_0', Utf8), Field('field_1', Utf8), Field('field_2', Utf8), Field('field_3', Utf8), Field('field_4', Utf8), Field('field_5', Utf8)])}

In [240]:
# The field names can be changed with the `fields` argument: a function that receives
# the position of the field and returns its name, here ability_0, ability_1, etc.

pokemon.select(
    pl.col('abilities').list.to_struct(
        n_field_strategy='max_width',
        fields=lambda idx: f'ability_{idx}',
    )
).schema

{'abilities': Struct([Field('ability_0', Utf8), Field('ability_1', Utf8), Field('ability_2', Utf8), Field('ability_3', Utf8), Field('ability_4', Utf8), Field('ability_5', Utf8)])}

In [241]:
# A struct column can be un-nested: each of its fields becomes a separate column

(
    pokemon
    .select(pl.col('abilities').list.to_struct(n_field_strategy='max_width'))
    .unnest('abilities')
    .head(5)
)

field_0,field_1,field_2,field_3,field_4,field_5
str,str,str,str,str,str
"""Overgrow""","""Chlorophyll""",,,,
"""Overgrow""","""Chlorophyll""",,,,
"""Overgrow""","""Chlorophyll""",,,,
"""Blaze""","""Solar Power""",,,,
"""Blaze""","""Solar Power""",,,,


#### Pivot, Melt, Unstack and Transpose

In [242]:
# Pivot: one row per type 1, one column per type 2
# Each cell counts the pokemons that have that combination of type 1 and type 2

pokemon.pivot(
    index = 'type1',
    columns = 'type2',
    values = 'name',
    aggregate_function = pl.element().count()
)


type1,poison,null,flying,dark,electric,ice,ground,fairy,grass,fighting,psychic,steel,fire,rock,water,dragon,ghost,bug,normal
str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""grass""",14.0,37,6.0,3.0,,2.0,1.0,5.0,1.0,3.0,2.0,3.0,,,,,1.0,,
"""fire""",,27,6.0,1.0,,2.0,2.0,,,6.0,1.0,1.0,1.0,1.0,1.0,1.0,,,2.0
"""water""",3.0,61,7.0,4.0,2.0,3.0,9.0,4.0,3.0,2.0,5.0,1.0,,4.0,,2.0,2.0,2.0,
"""bug""",11.0,18,13.0,,4.0,,1.0,2.0,6.0,3.0,,5.0,2.0,3.0,3.0,,1.0,,
"""normal""",,61,26.0,4.0,,,1.0,4.0,2.0,2.0,3.0,,,,1.0,1.0,,,
"""poison""",2.0,13,3.0,3.0,,,2.0,,,2.0,,,2.0,,3.0,1.0,,1.0,
"""electric""",,26,3.0,,1.0,,,2.0,,,,4.0,,,,,1.0,,2.0
"""ground""",,10,3.0,3.0,1.0,2.0,2.0,,,,2.0,1.0,1.0,3.0,,2.0,2.0,,
"""fairy""",,16,2.0,,,,,,,,,,,,,,,,
"""fighting""",,22,1.0,1.0,,1.0,,,,,2.0,1.0,,,,,,,


In [243]:
# Pivot does not exist in lazy mode, but it can be reproduced with a group by
# In this case the resulting columns have to be listed explicitly in the code:
# one filtered count per type 2 value

pokemon.lazy().groupby('type1').agg([
    pl.col('name').filter(pl.col('type2') == pokemon_type).count().alias(pokemon_type)
    for pokemon_type in ['poison', 'flying', 'dark', 'electric']
]).collect()

type1,poison,flying,dark,electric
str,u32,u32,u32,u32
"""ice""",0,2,0,0
"""dragon""",0,4,0,1
"""bug""",11,13,0,4
"""fighting""",0,1,1,0
"""dark""",0,5,0,0
"""ground""",0,3,3,1
"""psychic""",0,6,0,0
"""grass""",14,6,3,0
"""electric""",0,3,0,1
"""fire""",0,6,1,0


In [244]:
# Melt is the opposite of pivot: the headers of several columns go into one column ('variable')
# and their values into another column ('value')

(
    pokemon
    .melt(
        id_vars=['name', 'type1', 'type2'],
        value_vars=['hp', 'attack', 'defense'],
    )
    .sort('name')
    .head(6)
)

name,type1,type2,variable,value
str,str,str,str,i64
"""Abomasnow""","""grass""","""ice""","""hp""",90
"""Abomasnow""","""grass""","""ice""","""attack""",132
"""Abomasnow""","""grass""","""ice""","""defense""",105
"""Abra""","""psychic""",,"""hp""",25
"""Abra""","""psychic""",,"""attack""",20
"""Abra""","""psychic""",,"""defense""",15


In [245]:
# Unstack breaks the dataframe into multiple groups of the same size
# and moves these groups to new columns
# Here the first 9 pokemon are split into groups of 3, and a 'level' column (1 to 3) is added

(
    pokemon
    .head(9)
    .unstack(step=3, how="vertical", columns=['name', 'attack', 'defense'])
    .with_columns(pl.Series('level', range(1, 4)))
)

name_0,name_1,name_2,attack_0,attack_1,attack_2,defense_0,defense_1,defense_2,level
str,str,str,i64,i64,i64,i64,i64,i64,i64
"""Bulbasaur""","""Charmander""","""Squirtle""",49,52,48,49,43,65,1
"""Ivysaur""","""Charmeleon""","""Wartortle""",62,64,63,63,58,80,2
"""Venusaur""","""Charizard""","""Blastoise""",100,104,103,123,78,120,3


In [246]:
# Transpose swaps the rows and the columns of a dataframe
# It's a computationally expensive operation, so it should be used only if no other option is available

(
    pokemon
    .head(3)
    .select(['name', 'type1', 'type2'])
    .transpose()
)

column_0,column_1,column_2
str,str,str
"""Bulbasaur""","""Ivysaur""","""Venusaur"""
"""grass""","""grass""","""grass"""
"""poison""","""poison""","""poison"""


#### Merge DataFrames: hstack, vstack, extend, concat, join, join_asof

In [247]:
# Horizontally stack 2 dataframes: the columns of the second are appended to the first
# pokemon_color holds one color for each of the first 6 pokemon

pokemon_color = pl.DataFrame(
    {'color': ['green'] * 3 + ['red'] * 3}
)

pokemon.head(6).hstack(pokemon_color)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""green"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""green"""
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,0,"""red"""
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,0,"""red"""
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0,"""red"""


In [248]:
# Vertically stack 2 dataframes: the rows of the second are appended to the first
# new_pokemon holds a single new pokemon with the same 4 columns as the selection below

new_pokemon = pl.DataFrame(
    {
        'name': ['Polarizard'],
        'type1': ['ice'],
        'type2': ['flying'],
        'abilities': [['snow warning', 'blaze']],
    }
)

(
    pokemon
    .select(['name', 'type1', 'type2', 'abilities'])
    .vstack(new_pokemon)
    .tail(5)
)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [249]:
# Extend is similar to vstack, but it copies the new data into the memory of the
# existing dataframe instead of referencing it as a separate chunk
# NOTE(review): the Polars docs describe extend as modifying the dataframe it is called on;
# here it is called on the result of select - confirm that pokemon itself is left unchanged

pokemon.select('name','type1','type2','abilities').extend(new_pokemon).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [250]:
# concat can reproduce both hstack (how = 'horizontal') and vstack (how = 'vertical')
# when rechunk is set to False, the dataframes are only referenced, as with vstack
# when rechunk is set to True, all data is copied to a contiguous memory space,
# which allows later operations on the result to be faster

pl.concat(
    [pokemon.select('name','type1','type2','abilities'), new_pokemon],
    rechunk = True,
    how= 'vertical'
).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [251]:
# concat diagonal stacks dataframes diagonally
# this means that columns missing from one dataframe are filled with nulls
# here the new dataframe has no type1/type2 columns and adds a color column

new_pokemon_color = pl.DataFrame({
    'name': ['Polarizard'],
    'abilities': [['snow warning', 'blaze']],
    'color': ['white']
})

pl.concat([
    pokemon.select('name','type1','type2','abilities'),
    new_pokemon_color
    ],
    how = 'diagonal'
).tail(5)

name,type1,type2,abilities,color
str,str,str,list[str],str
"""Kartana""","""grass""","""steel""","[""Beast Boost""]",
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]",
"""Necrozma""","""psychic""",,"[""Prism Armor""]",
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]",
"""Polarizard""",,,"[""snow warning"", ""blaze""]","""white"""


In [252]:
# concat has another strategy, which is align.
# align finds the columns the dataframes have in common (here: name) and uses them as keys
# rows with the same key are combined into one row, similar to an outer join:
# keys missing from one dataframe get nulls, and the result is sorted by the key

pokemon_new_order = pl.DataFrame({
    'name' : ['Bulbasaur', 'Charmander', 'Squirtle'],
    'color': ['green', 'red', 'blue']
})

pl.concat([
    pokemon.select('name','type1',).head(9),
    pokemon_new_order
], how='align'
)

name,type1,color
str,str,str
"""Blastoise""","""water""",
"""Bulbasaur""","""grass""","""green"""
"""Charizard""","""fire""",
"""Charmander""","""fire""","""red"""
"""Charmeleon""","""fire""",
"""Ivysaur""","""grass""",
"""Squirtle""","""water""","""blue"""
"""Venusaur""","""grass""",
"""Wartortle""","""water""",


In [253]:
# join reproduces SQL joins, such as inner, left, outer, semi, anti, cross
# inner join keeps only the rows that are present in both dataframes
# here only Bulbasaur is in both: Polarizard exists only in pokemon_new

pokemon_new = pl.DataFrame({
    'name' : ['Bulbasaur', 'Polarizard'],
    'color': ['green', 'white']
})

pokemon.head(3).join(pokemon_new, on = 'name', how = 'inner')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""


In [254]:
# left join keeps all rows from the left dataframe and fills the missing values with nulls
# to keep all rows from the right dataframe instead, we can swap the order of the dataframes

pokemon.head(3).join(pokemon_new, on = 'name', how = 'left')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,


In [255]:
# outer join keeps all rows from both dataframes and fills the missing values with nulls
# Polarizard has no match in the left dataframe, so all its left-side columns are null

pokemon.head(3).join(pokemon_new, on = 'name', how = 'outer')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45.0,49.0,49.0,45.0,0.0,"""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60.0,62.0,63.0,60.0,0.0,
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80.0,100.0,123.0,80.0,0.0,
"""Polarizard""",,,,,,,,,"""white"""


In [256]:
# semi keeps the rows from the left dataframe that are present in the right dataframe
# unlike inner, it does not add any columns from the right dataframe

pokemon.head(3).join(pokemon_new, on = 'name', how = 'semi')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0


In [257]:
# anti keeps the rows from the left dataframe that have no match in the right dataframe
# it is the opposite of semi: rows that exist only in the right dataframe (Polarizard) are not returned
# like semi, it does not add any columns from the right dataframe

pokemon.head(3).join(pokemon_new, on = 'name', how = 'anti')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0


In [258]:
# cross combines all rows of the first dataframe with all rows of the second dataframe
# no join key is passed: 3 pokemon x 2 trainers gives 6 rows

pokemon_trainers = pl.DataFrame({
    'trainer': ['trainer1', 'trainer2']
})

pokemon.head(3).join(pokemon_trainers, how = 'cross')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,trainer
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""trainer1"""
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""trainer2"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""trainer1"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""trainer2"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""trainer1"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"""trainer2"""


#### Custom functions: apply, map

In [259]:
# Both apply and map will be slower than using the native polars functions. 
# It's recommended to avoid apply and map whenever possible
# a common use case for apply and map is passing data to a third-party library
# map applies a function to a full column (which is a Polars series)
# apply does the same, but one row at a time
# map is faster than apply

In [260]:
# increase the attack by 10% for pokemons with attack < 50
# in the first 2 functions, we receive a full column (a Polars series) and process it using a Python list comprehension
# the first function returns a Python list, the second a Polars series
# the last function receives a single value and returns a single value

def simulated_attack_list(attack_column):
    """Return a Python list with every attack below 50 increased by 10%.

    attack_column is any iterable of numbers; values of 50 or more are kept as they are.
    """
    boosted = []
    for value in attack_column:
        boosted.append(value * 1.1 if value < 50 else value)
    return boosted

def simulated_attack_series(attack_column):
    """Return a Polars series with every attack below 50 increased by 10%.

    attack_column is any iterable of numbers; values of 50 or more are kept as they are.
    """
    boosted = []
    for value in attack_column:
        if value < 50:
            boosted.append(value * 1.1)
        else:
            boosted.append(value)
    return pl.Series(boosted)

def simulated_attack_single_value(attack_number):
    """Return the attack increased by 10% when it is below 50, otherwise unchanged."""
    if attack_number < 50:
        return attack_number * 1.1
    return attack_number

In [261]:
# map with the function that returns a Python list
# the whole list ends up in every row (dtype list[f64]), which is not the result we want
pokemon.with_columns(
    simulated_attack = pl.col('attack').map(simulated_attack_list)
).head(3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,simulated_attack
str,str,str,list[str],i64,i64,i64,i64,i64,list[f64]
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"[53.9, 62.0, … 95.0]"
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"[53.9, 62.0, … 95.0]"
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,"[53.9, 62.0, … 95.0]"


In [262]:
# map with the function that returns a Polars series
# each row gets its own value (dtype f64)
pokemon.with_columns(
    simulated_attack = pl.col('attack').map(simulated_attack_series)
).head(3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,simulated_attack
str,str,str,list[str],i64,i64,i64,i64,i64,f64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,53.9
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,62.0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,100.0


In [263]:
# apply with the function that receives a single value and returns a single value
# the result is the same as with map and the series-returning function
pokemon.with_columns(
    simulated_attack = pl.col('attack').apply(simulated_attack_single_value)
).head(3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,simulated_attack
str,str,str,list[str],i64,i64,i64,i64,i64,f64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,53.9
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,62.0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,100.0


In [264]:
# let's check their speed
# we create a bigger dataframe by repeating the original one 100 times
# no need to test the first function (simulated_attack_list), as the result is not what we want

pokemon_100 = pl.concat([pokemon] * 100, rechunk = True)

In [265]:
%%timeit
# map: the Python function receives the full column
pokemon_100.with_columns(
    simulated_attack = pl.col('attack').map(simulated_attack_series)
)

15 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [266]:
%%timeit
# apply: the Python function receives one value at a time
pokemon_100.with_columns(
    simulated_attack = pl.col('attack').apply(simulated_attack_single_value)
)

20.1 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [267]:
%%timeit
# native Polars expression: same logic written with when/then/otherwise, no Python function involved
pokemon_100.with_columns(
    simulated_attack = pl.when(pl.col('attack') < 50).then(pl.col('attack') * 1.1).otherwise(pl.col('attack'))
)

2.04 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [268]:
# apply and map are considerably slower than the native Polars functions
# in this case by a factor of roughly 7 (map) to 10 (apply)
# map is faster than apply because we operate on the full column instead of one row at a time

In [269]:
# inside groupby, we can use apply to apply a function to each group

def simulated_attack_list_to_scalar(attack_column):
    """Return the sum of the attacks, with every attack below 50 increased by 10% first.

    attack_column is any iterable of numbers; an empty iterable gives 0.
    """
    adjusted = (value * 1.1 if value < 50 else value for value in attack_column)
    return sum(adjusted)

# the attack values of each type1 group are passed to the function, which returns one number per group
# return_dtype sets the type of the resulting column to Float64
pokemon.head(6).groupby('type1').agg(
    pl.col('attack').apply(simulated_attack_list_to_scalar, return_dtype = pl.Float64)
)

type1,attack
str,f64
"""fire""",220.0
"""grass""",215.9


In [270]:
# if we want to apply a function to multiple columns, we can use pl.struct
# to create a struct, then use apply with it
# inside the function, the value of each column is accessed by the column name

pokemon.head(6).with_columns(
    attack_plus_defense = pl.struct('attack','defense').apply(lambda columns: columns['attack'] + columns['defense'])
)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,attack_plus_defense
str,str,str,list[str],i64,i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,98
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,125
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,223
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,0,95
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,0,122
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,0,182


In [271]:
# we can also use lru_cache to cache the results of a function
# this is useful when we have a function that is called multiple times with the same arguments
# in this case, we have a function that returns a list of the characters of the type, repeated 10 times 

from functools import lru_cache

@lru_cache(maxsize = 2048)
def modify_type_cached(name):
    """Return the characters of name as a list, repeated 10 times (cached version).

    Results are cached per argument, so the same list object is returned
    for repeated calls with the same name.
    """
    characters = list(name)
    return characters * 10

def modify_type(name):
    """Return the characters of name as a list, repeated 10 times (uncached version)."""
    characters = [*name]
    return 10 * characters

In [272]:
%%timeit
# baseline: the uncached function
pokemon_100.select(
    pl.col('type1').apply(modify_type)
)

1.52 s ± 154 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [273]:
%%timeit
# cached version: repeated type1 values reuse the result stored by lru_cache
pokemon_100.select(
    pl.col('type1').apply(modify_type_cached)
)

1.36 s ± 79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
