### Advanced Data Transformation with Polars

The examples below use `.head()` to reduce the output to a few rows and take up less space.
If you want the full output, remove `.head()` from the code.

This notebook is divided into sections. If your code editor supports it, you can use the **Outline** functionality to jump directly to the section you are interested in.

For more details on Polars functions, check out the Polars API reference: https://pola-rs.github.io/polars/py-polars/html/reference/index.html

In [1]:
import numpy as np
import polars as pl

# Configure the number of characters to show for each string column:
# string values longer than 30 characters are truncated in the displayed tables
pl.Config.set_fmt_str_lengths(30)

polars.config.Config

In [2]:
# Load the simplified Pokemon dataset used by every example below
# (the path is relative to the working directory of the notebook)
pokemon = pl.read_parquet('../datasets/pokemon_simplified.parquet')

In [3]:
# Preview the first two rows to see the available columns and their dtypes
pokemon.head(2)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0


#### Conditions

In [4]:
# Identify the top 25% of Pokemon based on the attack stats
# Use the quantile function to get the 75th percentile, which is the threshold
# Without otherwise(), the rows that do not match the condition are left null
# pl.lit makes it explicit that 'Top 25%' is a string value and not a column name

pokemon.with_columns(
    attack_category = pl.when(pl.col('attack') > pl.col('attack').quantile(0.75)).then(pl.lit('Top 25%'))
).head(8)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,attack_category
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,str
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0,
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0,
4,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0,
5,"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,1,0,
6,"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,1,0,"""Top 25%"""
7,"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44,48,65,43,1,0,
8,"""Wartortle""","""water""",,"[""Torrent"", ""Rain Dish""]",59,63,80,58,1,0,


In [5]:
# Identify the top 25% of Pokemon based on the attack stats
# Use otherwise to assign the remaining 75% of Pokemon to the 'Other' category
# pl.lit makes it explicit that the labels are string values and not column names

pokemon.with_columns(
    attack_category = pl.when(pl.col('attack') > pl.col('attack').quantile(0.75))
        .then(pl.lit('Top 25%'))
        .otherwise(pl.lit('Other'))
).head(8)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,attack_category
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,str
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,"""Other"""
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0,"""Other"""
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0,"""Other"""
4,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0,"""Other"""
5,"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,1,0,"""Other"""
6,"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,1,0,"""Top 25%"""
7,"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44,48,65,43,1,0,"""Other"""
8,"""Wartortle""","""water""",,"[""Torrent"", ""Rain Dish""]",59,63,80,58,1,0,"""Other"""


In [6]:
# Identify the top 25% of Pokemon based on the attack stats AND the bottom 25%
# Use multiple when/then statements; the conditions are checked in the order they are written
# pl.lit makes it explicit that the labels are string values and not column names

pokemon.with_columns(
    attack_category = 
        pl.when(pl.col('attack') > pl.col('attack').quantile(0.75)).then(pl.lit('Top 25%'))
        .when(pl.col('attack') < pl.col('attack').quantile(0.25)).then(pl.lit('Bottom 25%'))
        .otherwise(pl.lit('Other'))
).head(8)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,attack_category
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,str
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,"""Bottom 25%"""
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0,"""Other"""
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0,"""Other"""
4,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0,"""Bottom 25%"""
5,"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,1,0,"""Other"""
6,"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,1,0,"""Top 25%"""
7,"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44,48,65,43,1,0,"""Bottom 25%"""
8,"""Wartortle""","""water""",,"[""Torrent"", ""Rain Dish""]",59,63,80,58,1,0,"""Other"""


#### Convert columns to dummies

In [7]:
# One-hot encode type1: it is replaced by one 0/1 column per distinct value
# Only the first 8 rows are used, so only the types found in them get a column

(
    pokemon
    .head(8)
    .to_dummies(columns=["type1"])
)

pokedex_number,name,type1_fire,type1_grass,type1_water,type2,abilities,hp,attack,defense,speed,generation,is_legendary
i64,str,u8,u8,u8,str,list[str],i64,i64,i64,i64,i64,i64
1,"""Bulbasaur""",0,1,0,"""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0
2,"""Ivysaur""",0,1,0,"""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0
3,"""Venusaur""",0,1,0,"""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0
4,"""Charmander""",1,0,0,,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0
5,"""Charmeleon""",1,0,0,,"[""Blaze"", ""Solar Power""]",58,64,58,80,1,0
6,"""Charizard""",1,0,0,"""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,1,0
7,"""Squirtle""",0,0,1,,"[""Torrent"", ""Rain Dish""]",44,48,65,43,1,0
8,"""Wartortle""",0,0,1,,"[""Torrent"", ""Rain Dish""]",59,63,80,58,1,0


#### Grouping and aggregating data

In [8]:
# Median of the main stats for each primary type
# Group by runs in parallel, so the order of the resulting rows can change between runs

(
    pokemon
    .groupby("type1")
    .agg(pl.col(["hp", "attack", "defense", "speed"]).median())
    .head(3)
)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""flying""",79.0,70.0,80.0,121.0
"""rock""",67.0,89.0,95.0,51.0
"""steel""",60.0,90.0,115.0,50.0


In [9]:
# Median of the main stats for each primary type, keeping the groups
# in their order of first appearance (possible, but slower)

(
    pokemon
    .groupby("type1", maintain_order=True)
    .agg(pl.col(["hp", "attack", "defense", "speed"]).median())
    .head(3)
)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""grass""",65.0,68.0,66.0,55.0
"""fire""",65.0,79.5,61.5,72.5
"""water""",68.0,70.0,70.0,64.0


In [10]:
# Grouping on several columns works the same way: pass a list of column names
# The medians are rounded to 0 decimals

(
    pokemon
    .groupby(["type1", "is_legendary"])
    .agg(pl.col(["hp", "attack", "defense", "speed"]).median().round(0))
    .head(3)
)

type1,is_legendary,hp,attack,defense,speed
str,i64,f64,f64,f64,f64
"""grass""",0,63.0,66.0,65.0,54.0
"""electric""",0,60.0,62.0,57.0,88.0
"""psychic""",0,65.0,48.0,55.0,63.0


In [11]:
# Many aggregation functions are available; each expression below yields one output column

(
    pokemon
    .groupby("type1")
    .agg(
        pl.col("attack").mean().alias("avg_attack"),
        pl.col("attack").median().alias("median_attack"),
        pl.col("attack").min().alias("min_attack"),
        pl.col("attack").max().alias("max_attack"),
        pl.col("attack").quantile(0.25).alias("quantile_25"),
        pl.col("attack").quantile(0.75).alias("quantile_75"),
        pl.count().alias("count"),  # number of pokemon in each group
        pl.col("name").first().alias("first_pokemon"),  # first pokemon of each group
    )
    .head(3)
)

type1,avg_attack,median_attack,min_attack,max_attack,quantile_25,quantile_75,count,first_pokemon
str,f64,f64,i64,i64,f64,f64,u32,str
"""ghost""",72.740741,66.0,30,165,50.0,92.0,27,"""Gastly"""
"""ice""",73.304348,65.0,30,130,50.0,95.0,23,"""Jynx"""
"""fairy""",62.111111,58.5,20,131,45.0,72.0,18,"""Clefairy"""


In [12]:
# Possible to use Numpy universal functions (ufuncs) in aggregations
# numpy is imported as np in the first cell, together with the other imports

pokemon.groupby('type1').agg(
    sqrt_attack_mean = np.sqrt(pl.col('attack')).mean() # mean of the square roots of attack
).head(3)

type1,sqrt_attack_mean
str,f64
"""water""",8.388571
"""dragon""",10.18751
"""ghost""",8.335305


In [13]:
# If no aggregation function is specified, the values of each group are collected in a list

(
    pokemon
    .groupby("type1")
    .agg(pl.col("name"))
    .head(3)
)

type1,name
str,list[str]
"""fairy""","[""Clefairy"", ""Clefable"", … ""Comfey""]"
"""dragon""","[""Dratini"", ""Dragonair"", … ""Kommo-o""]"
"""ground""","[""Sandshrew"", ""Sandslash"", … ""Mudsdale""]"


In [14]:
# Join the pokemon names of each group into a single comma-separated string

(
    pokemon
    .groupby("type1")
    .agg(pl.col("name").str.concat(", "))
    .head(3)
)

type1,name
str,str
"""fairy""","""Clefairy, Clefable, Cleffa, T…"
"""ground""","""Sandshrew, Sandslash, Diglett…"
"""dragon""","""Dratini, Dragonair, Dragonite…"


In [15]:
# Aggregation with a condition
# Average attack, by type, computed only on the Pokemon with a speed above 100

(
    pokemon
    .groupby("type1")
    .agg(
        pl.col("attack")
        .filter(pl.col("speed") > 100)
        .mean()
        .alias("avg_attack_when_speed_above_100")
    )
    .head(3)
)


type1,avg_attack_when_speed_above_100
str,f64
"""ghost""",62.5
"""ice""",65.0
"""fighting""",120.666667


In [16]:
# Expressions can be produced by a function and reused in aggregations

def avg_attack_when_speed_above_threshold(speed):
    """Build the expression for the mean attack of the rows whose speed is above `speed`.

    The resulting column is named avg_attack_when_speed_above_<speed>.
    """
    faster_than_threshold = pl.col('speed') > speed
    return (
        pl.col('attack')
        .filter(faster_than_threshold)
        .mean()
        .alias(f'avg_attack_when_speed_above_{speed}')
    )

# One output column per threshold: 20, 40, 60 and 80
pokemon.groupby('type1').agg(
    [avg_attack_when_speed_above_threshold(threshold) for threshold in range(20, 100, 20)]
).head(3)

type1,avg_attack_when_speed_above_20,avg_attack_when_speed_above_40,avg_attack_when_speed_above_60,avg_attack_when_speed_above_80
str,f64,f64,f64,f64
"""ground""",94.645161,97.952381,98.733333,111.333333
"""poison""",72.65625,74.592593,80.375,89.285714
"""normal""",76.66,78.333333,84.859649,92.228571


In [17]:
# Top 3 pokemon by attack for each type
# The whole table is sorted by attack first, so the first 3 names of each group are its strongest
(
    pokemon
    .sort("attack", descending=True)
    .groupby("type1")
    .agg(
        pl.col("name").head(3).alias("top_3_pokemon_by_attack")  # keeps 3 pokemon per type
    )
    .head(5)  # keeps 5 types
)

type1,top_3_pokemon_by_attack
str,list[str]
"""ghost""","[""Banette"", ""Dhelmise"", ""Giratina""]"
"""dragon""","[""Rayquaza"", ""Garchomp"", ""Zekrom""]"
"""normal""","[""Slaking"", ""Regigigas"", ""Lopunny""]"
"""flying""","[""Tornadus"", ""Noivern"", ""Noibat""]"
"""fighting""","[""Lucario"", ""Conkeldurr"", ""Crabominable""]"


In [18]:
# Show the top 3 pokemon by attack and the top 3 pokemon by defense in each group
# Sorting inside the aggregation lets each output column use its own ordering
(
    pokemon
    .groupby("type1")
    .agg(
        pl.col("name").sort_by("attack", descending=True).head(3).alias("top_3_pokemon_by_attack"),
        pl.col("name").sort_by("defense", descending=True).head(3).alias("top_3_pokemon_by_defense"),
    )
    .head(5)
)

type1,top_3_pokemon_by_attack,top_3_pokemon_by_defense
str,list[str],list[str]
"""fairy""","[""Xerneas"", ""Granbull"", ""Snubbull""]","[""Togekiss"", ""Xerneas"", ""Comfey""]"
"""flying""","[""Tornadus"", ""Noivern"", ""Noibat""]","[""Tornadus"", ""Noivern"", ""Noibat""]"
"""water""","[""Gyarados"", ""Swampert"", ""Kyogre""]","[""Slowbro"", ""Cloyster"", ""Carracosta""]"
"""normal""","[""Slaking"", ""Regigigas"", ""Lopunny""]","[""Audino"", ""Arceus"", ""Regigigas""]"
"""fighting""","[""Lucario"", ""Conkeldurr"", ""Crabominable""]","[""Hitmontop"", ""Conkeldurr"", ""Passimian""]"


In [19]:
# Show the top 3 pokemon by attack and the top 3 pokemon by defense without grouping by type
# implode gathers the 3 names of each ranking into a single list value

pokemon.select(
    pl.col("name").sort_by("attack", descending=True).head(3).implode().alias("top_3_pokemon_by_attack"),
    pl.col("name").sort_by("defense", descending=True).head(3).implode().alias("top_3_pokemon_by_defense"),
)

top_3_pokemon_by_attack,top_3_pokemon_by_defense
list[str],list[str]
"[""Heracross"", ""Kartana"", ""Groudon""]","[""Steelix"", ""Shuckle"", ""Aggron""]"


In [20]:
# A group by can be iterated: each step yields the group key and the DataFrame of that group,
# so any action can be applied to it (here: print the key and the first 2 rows)

for name, data in pokemon.groupby("is_legendary"):
    print(name, data.head(2), sep="\n")

0
shape: (2, 11)
┌────────────────┬───────────┬───────┬────────┬───┬─────────┬───────┬────────────┬──────────────┐
│ pokedex_number ┆ name      ┆ type1 ┆ type2  ┆ … ┆ defense ┆ speed ┆ generation ┆ is_legendary │
│ ---            ┆ ---       ┆ ---   ┆ ---    ┆   ┆ ---     ┆ ---   ┆ ---        ┆ ---          │
│ i64            ┆ str       ┆ str   ┆ str    ┆   ┆ i64     ┆ i64   ┆ i64        ┆ i64          │
╞════════════════╪═══════════╪═══════╪════════╪═══╪═════════╪═══════╪════════════╪══════════════╡
│ 1              ┆ Bulbasaur ┆ grass ┆ poison ┆ … ┆ 49      ┆ 45    ┆ 1          ┆ 0            │
│ 2              ┆ Ivysaur   ┆ grass ┆ poison ┆ … ┆ 63      ┆ 60    ┆ 1          ┆ 0            │
└────────────────┴───────────┴───────┴────────┴───┴─────────┴───────┴────────────┴──────────────┘
1
shape: (2, 11)
┌────────────────┬──────────┬──────────┬────────┬───┬─────────┬───────┬────────────┬──────────────┐
│ pokedex_number ┆ name     ┆ type1    ┆ type2  ┆ … ┆ defense ┆ speed ┆ generation

#### Window functions

In [21]:
# Window function: next to the attack of each pokemon, show the average attack
# of all the pokemon sharing its type 1 (the number of rows is unchanged)

pokemon.select(
    pl.col("name", "type1", "attack"),
    pl.col("attack").mean().over("type1").alias("avg_attack_for_same_type1"),
).head(5)

name,type1,attack,avg_attack_for_same_type1
str,str,i64,f64
"""Bulbasaur""","""grass""",49,73.769231
"""Ivysaur""","""grass""",62,73.769231
"""Venusaur""","""grass""",100,73.769231
"""Charmander""","""fire""",52,81.5
"""Charmeleon""","""fire""",64,81.5


In [22]:
# Several windows can be used in one select: average attack of the pokemon
# with the same type 1, and average attack of the pokemon with the same type 2

pokemon.select(
    pl.col("name", "type1", "type2", "attack"),
    pl.col("attack").mean().over("type1").alias("avg_attack_for_same_type1"),
    pl.col("attack").mean().over("type2").alias("avg_attack_for_same_type2"),
).head(5)

name,type1,type2,attack,avg_attack_for_same_type1,avg_attack_for_same_type2
str,str,str,i64,f64,f64
"""Bulbasaur""","""grass""","""poison""",49,73.769231,67.617647
"""Ivysaur""","""grass""","""poison""",62,73.769231,67.617647
"""Venusaur""","""grass""","""poison""",100,73.769231,67.617647
"""Charmander""","""fire""",,52,81.5,74.231771
"""Charmeleon""","""fire""",,64,81.5,74.231771


In [23]:
# For each pokemon, show the list of the top 3 pokemon by attack having the same type 1

pokemon.select(
    pl.col("name", "type1", "attack"),
    pl.col("name")
    .sort_by("attack", descending=True)
    .head(3)
    .implode()
    .over("type1")
    .alias("top_3_attack_pokemon_same_type"),
).head(5)

name,type1,attack,top_3_attack_pokemon_same_type
str,str,i64,list[str]
"""Bulbasaur""","""grass""",49,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Ivysaur""","""grass""",62,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Venusaur""","""grass""",100,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Charmander""","""fire""",52,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"
"""Charmeleon""","""fire""",64,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"


In [24]:
# Show the top 3 pokemon by attack and by defense for each type 1, one pokemon per row
# flatten changes 2D lists (each row has a list of 3 pokemons) to a 1D array (each pokemon in a separate row)

pokemon.select(
    pl.col("type1").head(3).implode().over("type1").flatten(),
    pl.col("name").sort_by("attack", descending=True).head(3).implode().over("type1").flatten().alias("top_3_attack"),
    pl.col("name").sort_by("defense", descending=True).head(3).implode().over("type1").flatten().alias("top_3_defense"),
).head(6)

type1,top_3_attack,top_3_defense
str,str,str
"""grass""","""Kartana""","""Ferrothorn"""
"""grass""","""Abomasnow""","""Kartana"""
"""grass""","""Breloom""","""Leafeon"""
"""fire""","""Blaziken""","""Torkoal"""
"""fire""","""Flareon""","""Turtonator"""
"""fire""","""Ho-Oh""","""Magcargo"""


In [25]:
# Rank each pokemon by attack and by defense among the pokemon of the same type 1
# descending=True gives rank 1 to the highest value

pokemon.select(
    pl.col("name", "type1", "attack", "defense"),
    pl.col("attack").rank(method="ordinal", descending=True).over("type1").alias("attack_rank"),
    pl.col("defense").rank(method="ordinal", descending=True).over("type1").alias("defense_rank"),
).head(6)

name,type1,attack,defense,attack_rank,defense_rank
str,str,i64,i64,u32,u32
"""Bulbasaur""","""grass""",49,49,61,62
"""Ivysaur""","""grass""",62,63,45,45
"""Venusaur""","""grass""",100,123,16,5
"""Charmander""","""fire""",52,43,45,44
"""Charmeleon""","""fire""",64,58,37,29
"""Charizard""","""fire""",104,78,10,14


#### Range function

In [26]:
# Number the pokemon of each type: the index starts at 1 and restarts for every type 1

pokemon.select(
    pl.col("name", "type1", "attack", "defense"),
    pl.arange(1, pl.count() + 1).over("type1").alias("type_index"),
).head(6)

name,type1,attack,defense,type_index
str,str,i64,i64,i64
"""Bulbasaur""","""grass""",49,49,1
"""Ivysaur""","""grass""",62,63,2
"""Venusaur""","""grass""",100,123,3
"""Charmander""","""fire""",52,43,1
"""Charmeleon""","""fire""",64,58,2
"""Charizard""","""fire""",104,78,3


In [27]:
# Take a random sample of 3 pokemons for each group
# Each group gets the numbers 1..n in shuffled order, and the rows holding 1, 2 and 3 are kept
# The seed makes the sample reproducible; remove it to draw a different sample on every run

pokemon.filter(
    pl.arange(1, pl.count()+1).shuffle(seed=42).over("type1") <= 3
).sort(by='type1').head(6)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64
401,"""Kricketot""","""bug""",,"[""Shed Skin"", ""Run Away""]",37,25,41,25,4,0
664,"""Scatterbug""","""bug""",,"[""Shield Dust"", ""Compoundeyes"", ""Friend Guard""]",38,35,40,35,6,0
742,"""Cutiefly""","""bug""","""fairy""","[""Honey Gather"", ""Shield Dust"", ""Sweet Veil""]",40,45,40,84,7,0
197,"""Umbreon""","""dark""",,"[""Synchronize"", ""Inner Focus""]",95,65,110,65,2,0
228,"""Houndour""","""dark""","""fire""","[""Early Bird"", ""Flash Fire"", ""Unnerve""]",45,60,30,65,2,0
634,"""Zweilous""","""dark""","""dragon""","[""Hustle""]",72,85,70,58,5,0


In [28]:
# Take a random sample of 5% of pokemons for each group
# Each group gets the numbers 1..n in shuffled order, and the rows whose number
# is at most 5% of the group size are kept
# The seed makes the sample reproducible; remove it to draw a different sample on every run

pokemon.filter(
    pl.arange(1, pl.count() + 1).shuffle(seed=42).over("type1") <= pl.count().over("type1") * 0.05
).sort(by="type1").head(6)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64
15,"""Beedrill""","""bug""","""poison""","[""Swarm"", ""Sniper""]",65,150,40,145,1,0
205,"""Forretress""","""bug""","""steel""","[""Sturdy"", ""Overcoat""]",75,90,140,40,2,0
541,"""Swadloon""","""bug""","""grass""","[""Leaf Guard"", ""Chlorophyll"", ""Overcoat""]",55,63,90,42,5,0
261,"""Poochyena""","""dark""",,"[""Run Away"", ""Quick Feet"", ""Rattled""]",35,55,35,35,3,0
372,"""Shelgon""","""dragon""",,"[""Rock Head"", ""Overcoat""]",65,95,100,50,3,0
181,"""Ampharos""","""electric""",,"[""Static"", ""Plus""]",90,95,105,45,2,0


In [29]:
# Keep the 5th pokemon of each type: number the rows of every group from 1
# and filter on the row number

(
    pokemon
    .filter(pl.arange(1, pl.count() + 1).over("type1") == 5)
    .head(5)
)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64
14,"""Kakuna""","""bug""","""poison""","[""Shed Skin""]",45,25,50,35,1,0
20,"""Raticate""","""normal""","""dark""","[""Run Away"", ""Guts"", … ""Thick Fat""]",75,71,70,77,1,0
31,"""Nidoqueen""","""poison""","""ground""","[""Poison Point"", ""Rivalry"", ""Sheer Force""]",90,92,87,76,1,0
38,"""Ninetales""","""fire""","""ice""","[""Flash Fire"", ""Drought"", … ""Snow Warning""]",73,67,75,109,1,0
44,"""Gloom""","""grass""","""poison""","[""Chlorophyll"", ""Stench""]",60,65,70,40,1,0


#### Binning functions: Cut and Qcut

In [30]:
# Bin the attack column into 3 bins using the break points 50 and 100
# <= 50 - weak, <= 100 - medium, > 100 - strong

(
    pokemon["attack"]
    .cut([50, 100], labels=["weak", "medium", "strong"], maintain_order=True)
    .head(5)
)

attack,break_point,category
f64,f64,cat
49.0,50.0,"""weak"""
62.0,100.0,"""medium"""
100.0,100.0,"""medium"""
52.0,100.0,"""medium"""
64.0,100.0,"""medium"""


In [31]:
# Assign each pokemon a category based on its attack stat:
# <= 50 - weak, <= 100 - medium, > 100 - strong
# cut is applied to the attack Series; only its 'category' column is appended to the dataframe
# (with_columns keeps every existing column, so there is no need to list them)

pokemon.with_columns(
    pokemon.get_column('attack').cut([50, 100], labels=['weak', 'medium', 'strong'], maintain_order=True).get_column('category')
).head(6)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,category
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,cat
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,"""weak"""
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0,"""medium"""
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0,"""medium"""
4,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0,"""medium"""
5,"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,1,0,"""medium"""
6,"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,1,0,"""strong"""


In [32]:
# Assign each pokemon a category based on its attack stat using percentiles:
# bottom 40% - weak, top 20% - strong, rest - medium
# qcut is applied to the attack Series; only its 'category' column is appended to the dataframe
# (with_columns keeps every existing column, so there is no need to list them)

pokemon.with_columns(
    pokemon.get_column('attack').qcut(quantiles=[0.4,0.8], labels=['weak', 'medium', 'strong'], maintain_order=True).get_column('category')
).head(6)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,category
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,cat
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,"""weak"""
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0,"""weak"""
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0,"""medium"""
4,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0,"""weak"""
5,"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,1,0,"""weak"""
6,"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,1,0,"""medium"""


#### Repeat

In [33]:
# This example uses its own small dataframe: 3 fruits and their order quantities
# Each fruit name is repeated as many times as its order quantity;
# repeat_by builds one list per row and flatten turns the lists into rows

df = pl.DataFrame({
    "Fruit": ["Apple", "Banana", "Cherry"],
    "Order_Quantity": [1, 3, 2],
})

df.select(pl.col("Fruit").repeat_by("Order_Quantity").flatten())

Fruit
str
"""Apple"""
"""Banana"""
"""Banana"""
"""Banana"""
"""Cherry"""
"""Cherry"""


#### Explode, Implode and Flatten

In [34]:
# Explode the list of pokemon abilities: from a column of lists to a column of values
# Point of attention: explode increases the number of rows, and all the columns
# of a Polars dataframe must have the same number of rows

(
    pokemon
    .head(3)
    .select(pl.col("abilities").explode())
)

abilities
str
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""


In [35]:
# Implode is the opposite of explode: the values of each column are gathered into one list

(
    pokemon
    .head(3)
    .select(pl.all().implode())
)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary
list[i64],list[str],list[str],list[str],list[list[str]],list[i64],list[i64],list[i64],list[i64],list[i64],list[i64]
"[1, 2, 3]","[""Bulbasaur"", ""Ivysaur"", ""Venusaur""]","[""grass"", ""grass"", ""grass""]","[""poison"", ""poison"", ""poison""]","[[""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""]]","[45, 60, 80]","[49, 62, 100]","[49, 63, 123]","[45, 60, 80]","[1, 1, 1]","[0, 0, 0]"


In [36]:
# Flatten removes one level of nesting from a column of lists
# If the lists are not nested, it acts like explode
# Here it undoes the implode and gives back the original 3 rows

(
    pokemon
    .head(3)
    .select(pl.all().implode().flatten())
)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0


In [37]:
# On a column of non-nested lists, flatten acts like explode: one row per ability

(
    pokemon
    .head(3)
    .select(pl.col("abilities").flatten())
)

abilities
str
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""


#### Conversion between structs and lists

In [38]:
# Combine type 1 and type 2 into a single new struct column called types

pokemon.select(
    pl.col("name"),
    pl.struct(["type1", "type2"]).alias("types"),
).head(5)

name,types
str,struct[2]
"""Bulbasaur""","{""grass"",""poison""}"
"""Ivysaur""","{""grass"",""poison""}"
"""Venusaur""","{""grass"",""poison""}"
"""Charmander""","{""fire"",null}"
"""Charmeleon""","{""fire"",null}"


In [39]:
# The struct column retains the type information of each initial column:
# Structs are views on the initial columns so the data is not copied
# The table schema shows that the names of the initial columns are retained as field names

pokemon.select(
    pl.col("name"),
    pl.struct(["type1", "type2"]).alias("types"),
).schema

{'name': Utf8, 'types': Struct([Field('type1', Utf8), Field('type2', Utf8)])}

In [40]:
# Structs are important because they allow us to do calculations on combined columns
# Here we count how many times each combination of type 1 and type 2 occurs

pokemon.select(
    pl.struct(["type1", "type2"]).value_counts().alias("types_count")
).head(5)

types_count
struct[2]
"{{""grass"",""poison""},14}"
"{{""fire"",null},27}"
"{{""fire"",""flying""},6}"
"{{""water"",null},61}"
"{{""bug"",null},18}"


In [41]:
# Combine type 1 and type 2 into a single new list column called types

pokemon.select(
    pl.col("name"),
    pl.concat_list(["type1", "type2"]).alias("types"),
).head(5)

name,types
str,list[str]
"""Bulbasaur""","[""grass"", ""poison""]"
"""Ivysaur""","[""grass"", ""poison""]"
"""Venusaur""","[""grass"", ""poison""]"
"""Charmander""","[""fire"", null]"
"""Charmeleon""","[""fire"", null]"


In [42]:
# List columns do not retain the names of the initial columns
# Unlike struct columns, where every row has the same number of elements,
# each row of a list column can hold a different number of elements

pokemon.select(
    pl.col("name"),
    pl.concat_list(["type1", "type2"]).alias("types"),
).schema

{'name': Utf8, 'types': List(Utf8)}

In [43]:
# A list column can be converted to a struct column
# With 'max_width' the struct gets as many fields as the longest list,
# and the missing values of the shorter lists are filled with null

(
    pokemon
    .select(pl.col("abilities").arr.to_struct(n_field_strategy="max_width"))
    .head(5)
)

abilities
struct[6]
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"


In [44]:
# The schema shows the default names of the struct fields: field_0, field_1, etc.

(
    pokemon
    .select(pl.col("abilities").arr.to_struct(n_field_strategy="max_width"))
    .schema
)

{'abilities': Struct([Field('field_0', Utf8), Field('field_1', Utf8), Field('field_2', Utf8), Field('field_3', Utf8), Field('field_4', Utf8), Field('field_5', Utf8)])}

In [45]:
# The field names can be changed with the argument fields, which receives the position
# of each field; here they are called ability_0, ability_1, etc.

(
    pokemon
    .select(
        pl.col("abilities").arr.to_struct(
            n_field_strategy="max_width",
            fields=lambda i: f'ability_{i}',
        )
    )
    .schema
)

{'abilities': Struct([Field('ability_0', Utf8), Field('ability_1', Utf8), Field('ability_2', Utf8), Field('ability_3', Utf8), Field('ability_4', Utf8), Field('ability_5', Utf8)])}

In [46]:
# A struct column can be un-nested: every field becomes a separate column

(
    pokemon
    .select(pl.col("abilities").arr.to_struct(n_field_strategy="max_width"))
    .unnest("abilities")
    .head(5)
)

field_0,field_1,field_2,field_3,field_4,field_5
str,str,str,str,str,str
"""Overgrow""","""Chlorophyll""",,,,
"""Overgrow""","""Chlorophyll""",,,,
"""Overgrow""","""Chlorophyll""",,,,
"""Blaze""","""Solar Power""",,,,
"""Blaze""","""Solar Power""",,,,


#### Pivot, Melt, Unstack and Transpose

In [47]:
# Pivot: one row per type 1, one column per type 2
# Each cell counts the pokemons (names) having that combination of type 1 and type 2

pokemon.pivot(
    index="type1",
    columns="type2",
    values="name",
    aggregate_function=pl.element().count(),
)


type1,poison,null,flying,dark,electric,ice,ground,fairy,grass,fighting,psychic,steel,fire,rock,water,dragon,ghost,bug,normal
str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""grass""",14.0,37,6.0,3.0,,2.0,1.0,5.0,1.0,3.0,2.0,3.0,,,,,1.0,,
"""fire""",,27,6.0,1.0,,2.0,2.0,,,6.0,1.0,1.0,1.0,1.0,1.0,1.0,,,2.0
"""water""",3.0,61,7.0,4.0,2.0,3.0,9.0,4.0,3.0,2.0,5.0,1.0,,4.0,,2.0,2.0,2.0,
"""bug""",11.0,18,13.0,,4.0,,1.0,2.0,6.0,3.0,,5.0,2.0,3.0,3.0,,1.0,,
"""normal""",,61,26.0,4.0,,,1.0,4.0,2.0,2.0,3.0,,,,1.0,1.0,,,
"""poison""",2.0,13,3.0,3.0,,,2.0,,,2.0,,,2.0,,3.0,1.0,,1.0,
"""electric""",,26,3.0,,1.0,,,2.0,,,,4.0,,,,,1.0,,2.0
"""ground""",,10,3.0,3.0,1.0,2.0,2.0,,,,2.0,1.0,1.0,3.0,,2.0,2.0,,
"""fairy""",,16,2.0,,,,,,,,,,,,,,,,
"""fighting""",,22,1.0,1.0,,1.0,,,,,2.0,1.0,,,,,,,


In [48]:
# Pivot does not exist in lazy mode, however it can be reproduced with a group by
# In this case the resulting columns must be listed in the code: one count expression per type 2

(
    pokemon
    .lazy()
    .groupby("type1")
    .agg([
        pl.col("name").filter(pl.col("type2") == type2_value).count().alias(type2_value)
        for type2_value in ["poison", "flying", "dark", "electric"]
    ])
    .collect()
)

type1,poison,flying,dark,electric
str,u32,u32,u32,u32
"""psychic""",0,6,0,0
"""fire""",0,6,1,0
"""fighting""",0,1,1,0
"""electric""",0,3,0,1
"""water""",3,7,4,2
"""ground""",0,3,3,1
"""flying""",0,0,0,0
"""bug""",11,13,0,4
"""fairy""",0,2,0,0
"""dark""",0,5,0,0


In [49]:
# Melt is the opposite of pivot: the headers of several columns go into one column (variable)
# and their values into another column (value)

(
    pokemon
    .melt(
        id_vars=["name", "type1", "type2"],
        value_vars=["hp", "attack", "defense"],
    )
    .sort("name")
    .head(6)
)

name,type1,type2,variable,value
str,str,str,str,i64
"""Abomasnow""","""grass""","""ice""","""hp""",90
"""Abomasnow""","""grass""","""ice""","""attack""",132
"""Abomasnow""","""grass""","""ice""","""defense""",105
"""Abra""","""psychic""",,"""hp""",25
"""Abra""","""psychic""",,"""attack""",20
"""Abra""","""psychic""",,"""defense""",15


In [50]:
# Unstack breaks the dataframe into multiple groups of the same size
# and moves these groups to new columns
# Here the first 9 rows are split into groups of 3, then a column with the level
# of the pokemon (1, 2, 3) is added

pokemon.head(9).unstack(
    step=3,
    how="vertical",
    columns=["name", "attack", "defense"],
).with_columns(
    pl.Series("level", range(1, 4))
)

name_0,name_1,name_2,attack_0,attack_1,attack_2,defense_0,defense_1,defense_2,level
str,str,str,i64,i64,i64,i64,i64,i64,i64
"""Bulbasaur""","""Charmander""","""Squirtle""",49,52,48,49,43,65,1
"""Ivysaur""","""Charmeleon""","""Wartortle""",62,64,63,63,58,80,2
"""Venusaur""","""Charizard""","""Blastoise""",100,104,103,123,78,120,3


In [51]:
# Transpose swaps the rows and the columns of a dataframe
# It's a computationally expensive operation, so it should be used only if no other option is available

(
    pokemon
    .head(3)
    .select(["name", "type1", "type2"])
    .transpose()
)

column_0,column_1,column_2
str,str,str
"""Bulbasaur""","""Ivysaur""","""Venusaur"""
"""grass""","""grass""","""grass"""
"""poison""","""poison""","""poison"""


#### Merge DataFrames: hstack, vstack, extend, concat, join

In [52]:
# Horizontally stack 2 dataframes: the columns of the second one are appended to the first one
# The new dataframe holds the color of the first 6 pokemon, so both have 6 rows

pokemon_color = pl.DataFrame(
    {"color": ["green"] * 3 + ["red"] * 3}
)

pokemon.head(6).hstack(pokemon_color)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,color
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,str
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,"""green"""
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0,"""green"""
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0,"""green"""
4,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0,"""red"""
5,"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,1,0,"""red"""
6,"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,1,0,"""red"""


In [53]:
# vstack() appends the rows of another dataframe underneath this one.
# We build a one-row dataframe for a brand-new pokemon and select the same
# four columns on the left side so that both schemas line up.

new_pokemon = pl.DataFrame(
    {
        'name': ['Polarizard'],
        'type1': ['ice'],
        'type2': ['flying'],
        'abilities': [['snow warning', 'blaze']],
    }
)

(
    pokemon
    .select('name', 'type1', 'type2', 'abilities')
    .vstack(new_pokemon)
    .tail(5)
)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [54]:
# extend() also appends rows, but unlike vstack (which only references the new
# data as an extra chunk) it copies the rows into this dataframe's own memory.

(
    pokemon
    .select('name', 'type1', 'type2', 'abilities')
    .extend(new_pokemon)
    .tail(5)
)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [55]:
# pl.concat() takes a list of dataframes; how='vertical' stacks them like vstack
# (how='horizontal' would mirror hstack instead).
# rechunk=True copies the combined data into one contiguous memory block,
# which makes later operations on the result faster.

pl.concat(
    [
        pokemon.select('name', 'type1', 'type2', 'abilities'),
        new_pokemon,
    ],
    how='vertical',
    rechunk=True,
).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [56]:
# how='diagonal' stacks dataframes whose columns only partly overlap:
# the result has the union of all columns, and any column missing
# from one of the inputs is filled with nulls for that input's rows.

new_pokemon_color = pl.DataFrame(
    {
        'name': ['Polarizard'],
        'abilities': [['snow warning', 'blaze']],
        'color': ['white'],
    }
)

pl.concat(
    [pokemon.select('name', 'type1', 'type2', 'abilities'), new_pokemon_color],
    how='diagonal',
).tail(5)

name,type1,type2,abilities,color
str,str,str,list[str],str
"""Kartana""","""grass""","""steel""","[""Beast Boost""]",
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]",
"""Necrozma""","""psychic""",,"[""Prism Armor""]",
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]",
"""Polarizard""",,,"[""snow warning"", ""blaze""]","""white"""


In [57]:
# concat has one more strategy: how='align'.
# It matches the rows of the dataframes on the column(s) they share (here `name`),
# much like an outer join: the result is sorted by that key and columns that
# have no matching row (here `color`) are filled with nulls.

pokemon_new_order = pl.DataFrame(
    {
        'name': ['Squirtle', 'Charmander', 'Ivysaur'],
        'color': ['blue', 'red', 'green'],
    }
)

pl.concat(
    [pokemon.select('name', 'type1').head(9), pokemon_new_order],
    how='align',
)

name,type1,color
str,str,str
"""Blastoise""","""water""",
"""Bulbasaur""","""grass""",
"""Charizard""","""fire""",
"""Charmander""","""fire""","""red"""
"""Charmeleon""","""fire""",
"""Ivysaur""","""grass""","""green"""
"""Squirtle""","""water""","""blue"""
"""Venusaur""","""grass""",
"""Wartortle""","""water""",


In [68]:
# join() reproduces the SQL joins: inner, left, outer, semi, anti and cross.
# An inner join keeps only the rows whose key is present in both dataframes.

pokemon_names_colors = pl.DataFrame(
    {
        'name': ['Squirtle', 'Charmander', 'Bulbasaur', 'Polarizard'],
        'color': ['blue', 'red', 'green', 'white'],
    }
)

(
    pokemon
    .head(9)
    .join(pokemon_names_colors, how='inner', on='name')
)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,color
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,str
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,"""green"""
4,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0,"""red"""
7,"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44,48,65,43,1,0,"""blue"""


In [69]:
# A left join keeps every row of the left dataframe and adds the matching
# information from the right dataframe (null where there is no match).
# Notice that Polarizard, which only exists on the right, is not included.
# FYI: at the time of writing Polars has no 'right' join; a right join is
# simply a left join with the two dataframes switched.

(
    pokemon
    .head(9)
    .join(pokemon_names_colors, how='left', on='name')
)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,color
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,str
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,"""green"""
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0,
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0,
4,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0,"""red"""
5,"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,1,0,
6,"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,1,0,
7,"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44,48,65,43,1,0,"""blue"""
8,"""Wartortle""","""water""",,"[""Torrent"", ""Rain Dish""]",59,63,80,58,1,0,
9,"""Blastoise""","""water""",,"[""Torrent"", ""Rain Dish""]",79,103,120,78,1,0,


In [70]:
# An outer join keeps the rows of both dataframes;
# values missing on either side are filled with nulls.

(
    pokemon
    .head(9)
    .join(pokemon_names_colors, how='outer', on='name')
)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,color
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,str
1.0,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45.0,49.0,49.0,45.0,1.0,0.0,"""green"""
2.0,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60.0,62.0,63.0,60.0,1.0,0.0,
3.0,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80.0,100.0,123.0,80.0,1.0,0.0,
4.0,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39.0,52.0,43.0,65.0,1.0,0.0,"""red"""
5.0,"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58.0,64.0,58.0,80.0,1.0,0.0,
6.0,"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78.0,104.0,78.0,100.0,1.0,0.0,
7.0,"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44.0,48.0,65.0,43.0,1.0,0.0,"""blue"""
8.0,"""Wartortle""","""water""",,"[""Torrent"", ""Rain Dish""]",59.0,63.0,80.0,58.0,1.0,0.0,
9.0,"""Blastoise""","""water""",,"[""Torrent"", ""Rain Dish""]",79.0,103.0,120.0,78.0,1.0,0.0,
,"""Polarizard""",,,,,,,,,,"""white"""


In [71]:
# A semi join uses the right table only as a filter: it decides which rows
# of the left table are kept. It resembles an inner join, except that
# no columns from the right table are added to the result.

(
    pokemon
    .head(9)
    .join(pokemon_names_colors, how='semi', on='name')
)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0
4,"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]",39,52,43,65,1,0
7,"""Squirtle""","""water""",,"[""Torrent"", ""Rain Dish""]",44,48,65,43,1,0


In [73]:
# An anti join keeps the rows of the left table that have no match in the right table.

(
    pokemon
    .head(9)
    .join(pokemon_names_colors, how='anti', on='name')
)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0
5,"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]",58,64,58,80,1,0
6,"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]",78,104,78,100,1,0
8,"""Wartortle""","""water""",,"[""Torrent"", ""Rain Dish""]",59,63,80,58,1,0
9,"""Blastoise""","""water""",,"[""Torrent"", ""Rain Dish""]",79,103,120,78,1,0


In [77]:
# A cross join pairs every row of the left table with every row of the right
# table, producing all possible combinations (no join key is needed).

pokemon_trainer = pl.DataFrame(
    {'trainer': ['trainer1', 'trainer2']}
)

(
    pokemon
    .head(3)
    .join(pokemon_trainer, how='cross')
)

pokedex_number,name,type1,type2,abilities,hp,attack,defense,speed,generation,is_legendary,trainer
i64,str,str,str,list[str],i64,i64,i64,i64,i64,i64,str
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,"""trainer1"""
1,"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,1,0,"""trainer2"""
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0,"""trainer1"""
2,"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,1,0,"""trainer2"""
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0,"""trainer1"""
3,"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,1,0,"""trainer2"""


In [81]:
# A dataframe can also be joined with itself.
# Here every pokemon is listed next to the pokemon it evolves into
# (None for the last stage of an evolution line).
# The same approach can be used in supply chain to find the supplier of each product.

pokemon_with_evolution = pl.DataFrame(
    {
        'name': [
            'Bulbasaur', 'Ivysaur', 'Venusaur',
            'Charmander', 'Charmeleon', 'Charizard',
            'Squirtle', 'Wartortle', 'Blastoise',
        ],
        'evolution': [
            'Ivysaur', 'Venusaur', None,
            'Charmeleon', 'Charizard', None,
            'Wartortle', 'Blastoise', None,
        ],
    }
)

pokemon_with_evolution

name,evolution
str,str
"""Bulbasaur""","""Ivysaur"""
"""Ivysaur""","""Venusaur"""
"""Venusaur""",
"""Charmander""","""Charmeleon"""
"""Charmeleon""","""Charizard"""
"""Charizard""",
"""Squirtle""","""Wartortle"""
"""Wartortle""","""Blastoise"""
"""Blastoise""",


In [85]:
# Self join: look up each pokemon's evolution in the same table (matching
# `evolution` on the left with `name` on the right) to fetch the evolution's
# own evolution. The clashing right-hand column receives the '_2' suffix.

pokemon_with_evolution.join(
    pokemon_with_evolution,
    how='left',
    left_on='evolution',
    right_on='name',
    suffix='_2',
)

name,evolution,evolution_2
str,str,str
"""Bulbasaur""","""Ivysaur""","""Venusaur"""
"""Ivysaur""","""Venusaur""",
"""Venusaur""",,
"""Charmander""","""Charmeleon""","""Charizard"""
"""Charmeleon""","""Charizard""",
"""Charizard""",,
"""Squirtle""","""Wartortle""","""Blastoise"""
"""Wartortle""","""Blastoise""",
"""Blastoise""",,


#### Improve readability: use pipes

In [101]:
# We can define functions to be applied to a DataFrame
# then apply them to DataFrames in an easy way
# and avoid repetition of code
# pipe also works for expressions

def keep_legendary(pokemon):
    """Return only the rows of `pokemon` whose `is_legendary` flag equals 1."""
    is_legendary = pl.col('is_legendary') == 1
    return pokemon.filter(is_legendary)

def select_main_columns(pokemon):
    """Reduce `pokemon` to its name, both types and the hp/attack/defense stats."""
    main_columns = ('name', 'type1', 'type2', 'hp', 'attack', 'defense')
    return pokemon.select(*main_columns)

def add_attack_rank_and_filter_top_10(pokemon):
    """Rank the pokemon by attack and keep the 10 strongest.

    Adds an `attack_rank` column where 1 is the highest attack. Because the
    rank uses method='ordinal', every row gets a distinct rank and ties are
    broken by row order. Only rows ranked 1 to 10 are returned, in their
    original order (the result is not sorted by rank).
    """
    # with_columns (plural) accepts the new column as a keyword argument;
    # the singular with_column('name', expr) form is not a valid Polars call.
    pokemon_with_attack_rank = pokemon.with_columns(
        attack_rank=pl.col('attack').rank(descending=True, method='ordinal')
    )
    pokemon_top_10_attack = pokemon_with_attack_rank.filter(pl.col('attack_rank') <= 10)

    return pokemon_top_10_attack

In [102]:
# Chain the helper functions with pipe(): each step receives the dataframe
# returned by the previous step.
(
    pokemon
    .pipe(keep_legendary)  # keep only the legendary pokemon
    .pipe(select_main_columns)  # reduce to name, types and main stats
    .pipe(add_attack_rank_and_filter_top_10)  # rank by attack, keep ranks 1-10
)

name,type1,type2,hp,attack,defense,attack_rank
str,str,str,i64,i64,i64,u32
"""Mewtwo""","""psychic""",,106,150,70,7
"""Kyogre""","""water""",,100,150,90,8
"""Groudon""","""ground""",,100,180,160,2
"""Rayquaza""","""dragon""","""flying""",105,180,100,3
"""Regigigas""","""normal""",,110,160,110,4
"""Zekrom""","""dragon""","""electric""",100,150,120,9
"""Landorus""","""ground""","""flying""",89,145,90,10
"""Diancie""","""rock""","""fairy""",50,160,110,5
"""Hoopa""","""psychic""","""ghost""",80,160,60,6
"""Kartana""","""grass""","""steel""",59,181,131,1


#### Use custom functions: map and apply

In [None]:
# It's possible to run custom Python functions on rows or columns of a Polars DataFrame.
# The map function applies a function to a column.
# The apply function applies a function to a row.

# TODO: look at the Polars user guide; talk about pl.struct and functools.lru_cache