### Advanced Data Transformation with Polars

The examples below use `.head()` to reduce the output to a few rows and take up less space.
If you want the full output, remove `.head()` from the code.

This notebook is divided into sections. If your code editor supports it, you can use the **Outline** functionality to easily go to the code section you are interested in.

For more details on Polars functions, check out the Polars API reference: https://pola-rs.github.io/polars/py-polars/html/reference/index.html

In [2]:
import polars as pl

# Configure the number of characters to show for each string column
pl.Config.set_fmt_str_lengths(30)

polars.config.Config

In [3]:
pokemon = pl.read_parquet('../datasets/pokemon_simplified.parquet')

In [4]:
pokemon.head(3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0


#### Conditions

In [5]:
# Identify the top 25% of Pokemon based on the attack stats
# Use the quantile function to get the 75th percentile

pokemon.select(
    'name', 'attack',
    attack_category = 
    pl.when(pl.col('attack') > pl.col('attack').quantile(0.75))
    .then('Top 25%')
).head(6)

name,attack,attack_category
str,i64,str
"""Bulbasaur""",49,
"""Ivysaur""",62,
"""Venusaur""",100,
"""Charmander""",52,
"""Charmeleon""",64,
"""Charizard""",104,"""Top 25%"""


In [6]:
# Identify the top 25% of Pokemon based on the attack stats
# Use otherwise to assign the remaining 75% of Pokemon to the 'Other' category

pokemon.select(
    'name', 'attack',
    attack_category = 
        pl.when(pl.col('attack') > pl.col('attack').quantile(0.75))
        .then('Top 25%')
        .otherwise('Other')
).head(6)

name,attack,attack_category
str,i64,str
"""Bulbasaur""",49,"""Other"""
"""Ivysaur""",62,"""Other"""
"""Venusaur""",100,"""Other"""
"""Charmander""",52,"""Other"""
"""Charmeleon""",64,"""Other"""
"""Charizard""",104,"""Top 25%"""


In [7]:
# Identify the top 25% of Pokemon based on the attack stats AND the bottom 25%
# Use multiple when then statements

pokemon.select(
    'name', 'attack',
    attack_category = 
        pl.when(pl.col('attack') > pl.col('attack').quantile(0.75))
        .then('Top 25%')
        .when(pl.col('attack') < pl.col('attack').quantile(0.25))
        .then('Bottom 25%')
        .otherwise('Other')
).head(6)

name,attack,attack_category
str,i64,str
"""Bulbasaur""",49,"""Bottom 25%"""
"""Ivysaur""",62,"""Other"""
"""Venusaur""",100,"""Other"""
"""Charmander""",52,"""Bottom 25%"""
"""Charmeleon""",64,"""Other"""
"""Charizard""",104,"""Top 25%"""


In [8]:
# We can use map_dict instead of pl.when().then() to map colors to Pokemon types

type_colors = {'grass': 'green', 'fire': 'red', 'water': 'blue'}

pokemon.select(
    'name', 'type1',
    color = pl.col('type1').map_dict(type_colors)
).head(6)

name,type1,color
str,str,str
"""Bulbasaur""","""grass""","""green"""
"""Ivysaur""","""grass""","""green"""
"""Venusaur""","""grass""","""green"""
"""Charmander""","""fire""","""red"""
"""Charmeleon""","""fire""","""red"""
"""Charizard""","""fire""","""red"""


#### One hot encoding

In [9]:
# Convert the column type1 to dummies

pokemon.head(6).select('name','type1').to_dummies(columns=['type1'])

name,type1_fire,type1_grass
str,u8,u8
"""Bulbasaur""",0,1
"""Ivysaur""",0,1
"""Venusaur""",0,1
"""Charmander""",1,0
"""Charmeleon""",1,0
"""Charizard""",1,0


#### Aggregating data

In [10]:
# Group by type and calculate median stats
# Group by is run in parallel so the order of the rows can change

pokemon.groupby('type1').agg(
    pl.col('hp', 'attack', 'defense', 'speed').median()
).head(3)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""ice""",70.0,65.0,70.0,59.0
"""poison""",65.0,68.5,66.0,62.5
"""fire""",65.0,79.5,61.5,72.5


In [11]:
# Group by type and calculate median stats
# Maintaining the order is possible but it is slower

pokemon.groupby('type1', maintain_order=True).agg(
    pl.col('hp', 'attack', 'defense', 'speed').median()
).head(3)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""grass""",65.0,68.0,66.0,55.0
"""fire""",65.0,79.5,61.5,72.5
"""water""",68.0,70.0,70.0,64.0


In [12]:
# Group on multiple columns works the same way

pokemon.groupby(['type1', 'type2']).agg(
    pl.col('hp', 'attack', 'defense', 'speed').median()
).head(3)

type1,type2,hp,attack,defense,speed
str,str,f64,f64,f64,f64
"""electric""",,60.0,64.0,56.0,86.5
"""fairy""",,76.0,65.0,66.5,47.0
"""ground""","""ground""",22.5,77.5,45.0,100.0


In [13]:
# Lots of aggregation functions are available

pokemon.groupby('type1').agg(
    avg_attack = pl.col('attack').mean(),
    max_attack = pl.col('attack').max(),
    quantile_75 = pl.col('attack').quantile(0.75),
    count = pl.count(), 
    first_pokemon = pl.first('name'), 
    strongest_pokemon = pl.col('name').sort_by('attack', descending=True).first() # 
).head(3)

type1,avg_attack,max_attack,quantile_75,count,first_pokemon,strongest_pokemon
str,f64,i64,f64,u32,str,str
"""normal""",75.161905,160,91.0,105,"""Pidgey""","""Slaking"""
"""dark""",87.793103,150,101.0,29,"""Umbreon""","""Absol"""
"""steel""",93.083333,150,110.0,24,"""Steelix""","""Aegislash"""


In [14]:
# Possible to use Numpy universal functions (ufuncs) in aggregations

import numpy as np

pokemon.groupby('type1').agg(
    sqrt_attack_mean = np.sqrt(pl.col('attack')).mean() # sqrt is the square root function
).head(3)

type1,sqrt_attack_mean
str,f64
"""ice""",8.404404
"""bug""",8.086296
"""ghost""",8.335305


In [15]:
# If aggregation functions are not specified, the result is aggregated in a list

pokemon.groupby('type1').agg(
    pl.col('name').head(3)
).head(3)

type1,name
str,list[str]
"""psychic""","[""Abra"", ""Kadabra"", ""Alakazam""]"
"""fire""","[""Charmander"", ""Charmeleon"", ""Charizard""]"
"""ghost""","[""Gastly"", ""Haunter"", ""Gengar""]"


In [16]:
# Concatenate the names of the pokemon into a string

pokemon.groupby('type1').agg(
    pl.col('name').head(3).str.concat(', ')
).head(3)

type1,name
str,str
"""fire""","""Charmander, Charmeleon, Chari…"
"""water""","""Squirtle, Wartortle, Blastois…"
"""normal""","""Pidgey, Pidgeotto, Pidgeot"""


In [17]:
# Aggregation with conditions
# Calculate the average attack for Pokemon with a speed above 100 and with a speed below 100, by type

pokemon.groupby('type1').agg(
    avg_attack_when_speed_above_100 = 
        pl.col('attack').filter(pl.col('speed') > 100).mean(),
    avg_attack_when_speed_below_100 = 
        pl.col('attack').filter(pl.col('speed') < 100).mean()
).head(3)


type1,avg_attack_when_speed_above_100,avg_attack_when_speed_below_100
str,f64,f64
"""dark""",105.428571,82.181818
"""steel""",117.5,90.428571
"""ice""",65.0,71.8


In [18]:
# We can also create functions to use in aggregations

def avg_attack_when_speed_above_threshold(speed):
    """Build an expression for the mean attack of Pokemon faster than `speed`.

    The returned Polars expression keeps the 'attack' values of the rows
    whose 'speed' is strictly greater than `speed`, averages them, and names
    the result 'avg_attack_when_speed_above_<speed>'.
    """
    is_faster = pl.col('speed') > speed
    column_name = f'avg_attack_when_speed_above_{speed}'
    return pl.col('attack').filter(is_faster).mean().alias(column_name)

pokemon.groupby('type1').agg(
    avg_attack_when_speed_above_threshold(speed) for speed in range(30,91,30)
).head(3)

type1,avg_attack_when_speed_above_30,avg_attack_when_speed_above_60,avg_attack_when_speed_above_90
str,f64,f64,f64
"""ghost""",80.428571,75.454545,66.25
"""electric""",70.820513,72.214286,74.5
"""fire""",82.145833,85.243243,94.642857


In [19]:
# Show the top 3 pokemon by attack and the top 3 pokemon by defense in each group
(
    pokemon.groupby('type1').agg(
        top_3_pokemon_by_attack = 
            pl.col('name').sort_by('attack', descending=True).head(3),
        top_3_pokemon_by_defense = 
            pl.col('name').sort_by('defense', descending=True).head(3) 
    )
).head(5)

type1,top_3_pokemon_by_attack,top_3_pokemon_by_defense
str,list[str],list[str]
"""rock""","[""Rampardos"", ""Tyranitar"", ""Diancie""]","[""Regirock"", ""Bastiodon"", ""Onix""]"
"""poison""","[""Toxicroak"", ""Muk"", ""Nidoking""]","[""Toxapex"", ""Weezing"", ""Drapion""]"
"""ghost""","[""Banette"", ""Dhelmise"", ""Giratina""]","[""Cofagrigus"", ""Dusknoir"", ""Dusclops""]"
"""normal""","[""Slaking"", ""Regigigas"", ""Lopunny""]","[""Audino"", ""Arceus"", ""Regigigas""]"
"""fire""","[""Blaziken"", ""Flareon"", ""Ho-Oh""]","[""Torkoal"", ""Turtonator"", ""Magcargo""]"


In [20]:
# Show the top 3 pokemon by attack and the top 3 pokemon by defense without grouping by type

pokemon.select(
    top_3_pokemon_by_attack = 
        pl.col('name').sort_by('attack', descending=True).head(3).implode(),
    top_3_pokemon_by_defense = 
        pl.col('name').sort_by('defense', descending=True).head(3).implode()
)

top_3_pokemon_by_attack,top_3_pokemon_by_defense
list[str],list[str]
"[""Heracross"", ""Kartana"", ""Groudon""]","[""Steelix"", ""Shuckle"", ""Aggron""]"


In [21]:
# I can iterate over the groups and do any action on them

for name, data in pokemon.groupby("is_legendary"):
    print(name)
    print(data.head(2))

0
shape: (2, 9)
┌───────────┬───────┬────────┬────────────────┬───┬────────┬─────────┬───────┬──────────────┐
│ name      ┆ type1 ┆ type2  ┆ abilities      ┆ … ┆ attack ┆ defense ┆ speed ┆ is_legendary │
│ ---       ┆ ---   ┆ ---    ┆ ---            ┆   ┆ ---    ┆ ---     ┆ ---   ┆ ---          │
│ str       ┆ str   ┆ str    ┆ list[str]      ┆   ┆ i64    ┆ i64     ┆ i64   ┆ i64          │
╞═══════════╪═══════╪════════╪════════════════╪═══╪════════╪═════════╪═══════╪══════════════╡
│ Bulbasaur ┆ grass ┆ poison ┆ ["Overgrow",   ┆ … ┆ 49     ┆ 49      ┆ 45    ┆ 0            │
│           ┆       ┆        ┆ "Chlorophyll"] ┆   ┆        ┆         ┆       ┆              │
│ Ivysaur   ┆ grass ┆ poison ┆ ["Overgrow",   ┆ … ┆ 62     ┆ 63      ┆ 60    ┆ 0            │
│           ┆       ┆        ┆ "Chlorophyll"] ┆   ┆        ┆         ┆       ┆              │
└───────────┴───────┴────────┴────────────────┴───┴────────┴─────────┴───────┴──────────────┘
1
shape: (2, 9)
┌──────────┬──────────┬─────

#### Window functions

In [22]:
# For each pokemon show its attack, the average attack for pokemons of the same type 1
# and the percentage difference between the two

pokemon.select(
    'name', 'type1', 'attack',
    avg_attack_for_same_type1 = pl.mean('attack').over('type1').round(1),
    pct_difference_vs_avg = 
        (((pl.col('attack') / pl.mean('attack')).over('type1') -1) * 100).round(1)
).head(5)

name,type1,attack,avg_attack_for_same_type1,pct_difference_vs_avg
str,str,i64,f64,f64
"""Bulbasaur""","""grass""",49,73.8,-33.6
"""Ivysaur""","""grass""",62,73.8,-16.0
"""Venusaur""","""grass""",100,73.8,35.6
"""Charmander""","""fire""",52,81.5,-36.2
"""Charmeleon""","""fire""",64,81.5,-21.5


In [23]:
# For each pokemon show its attack and the average attack for pokemons of the same type 1 and pokemons of the same type 2

pokemon.select(
    'name', 'type1', 'type2', 'attack',
    avg_attack_for_same_type1 = pl.mean('attack').over('type1'),
    avg_attack_for_same_type2 = pl.mean('attack').over('type2')
).head(5)

name,type1,type2,attack,avg_attack_for_same_type1,avg_attack_for_same_type2
str,str,str,i64,f64,f64
"""Bulbasaur""","""grass""","""poison""",49,73.769231,67.617647
"""Ivysaur""","""grass""","""poison""",62,73.769231,67.617647
"""Venusaur""","""grass""","""poison""",100,73.769231,67.617647
"""Charmander""","""fire""",,52,81.5,74.231771
"""Charmeleon""","""fire""",,64,81.5,74.231771


In [24]:
# For each pokemon, show the top 3 pokemon by attack of the same type

pokemon.select(
    'name', 'type1', 'attack',
    top_3_attack_pokemon_same_type = 
        pl.col('name').sort_by('attack', descending=True).head(3)
        .over('type1', mapping_strategy='join')
).head(5)

name,type1,attack,top_3_attack_pokemon_same_type
str,str,i64,list[str]
"""Bulbasaur""","""grass""",49,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Ivysaur""","""grass""",62,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Venusaur""","""grass""",100,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Charmander""","""fire""",52,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"
"""Charmeleon""","""fire""",64,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"


In [25]:
# Show top 3 pokemon by attack and defense

pokemon.select(
    pl.col('type1').head(3).over('type1', mapping_strategy='explode'),
    top_3_attack = 
        pl.col('name').sort_by('attack', descending=True)
        .head(3).over('type1', mapping_strategy='explode'), 
    top_3_defense = 
        pl.col('name').sort_by('defense', descending=True)
        .head(3).over('type1', mapping_strategy='explode')
).head(6)

type1,top_3_attack,top_3_defense
str,str,str
"""grass""","""Kartana""","""Ferrothorn"""
"""grass""","""Abomasnow""","""Kartana"""
"""grass""","""Breloom""","""Leafeon"""
"""fire""","""Blaziken""","""Torkoal"""
"""fire""","""Flareon""","""Turtonator"""
"""fire""","""Ho-Oh""","""Magcargo"""


In [26]:
# Rank the pokemon by attack and defense by type

pokemon.select(
    'name', 'type1', 'attack', 'defense',
    attack_rank = 
        pl.col('attack').rank(method='ordinal', descending=True).over('type1'),
    defense_rank = 
        pl.col('defense').rank(method='ordinal', descending=True).over('type1')
).head(6)

name,type1,attack,defense,attack_rank,defense_rank
str,str,i64,i64,u32,u32
"""Bulbasaur""","""grass""",49,49,61,62
"""Ivysaur""","""grass""",62,63,45,45
"""Venusaur""","""grass""",100,123,16,5
"""Charmander""","""fire""",52,43,45,44
"""Charmeleon""","""fire""",64,58,37,29
"""Charizard""","""fire""",104,78,10,14


#### Range function

In [27]:
# Add an increasing index for the pokemon of each type

pokemon.select(
    'name', 'type1',
    type_index = pl.arange(1, pl.count()+1).over('type1')
).head(6)

name,type1,type_index
str,str,i64
"""Bulbasaur""","""grass""",1
"""Ivysaur""","""grass""",2
"""Venusaur""","""grass""",3
"""Charmander""","""fire""",1
"""Charmeleon""","""fire""",2
"""Charizard""","""fire""",3


In [28]:
# Take a random sample of 2 pokemons for each group

pokemon.select('name','type1').filter(
    pl.arange(1, pl.count()+1).shuffle().over("type1") <= 2
).sort(by='type1').head(4)

name,type1
str,str
"""Wurmple""","""bug"""
"""Charjabug""","""bug"""
"""Liepard""","""dark"""
"""Deino""","""dark"""


In [29]:
# Take a random sample of 5% of pokemons for each group

pokemon.filter(
    pl.arange(1, pl.count() + 1).shuffle().over("type1") 
    <= pl.count().over("type1") * 0.05
).sort(by="type1").head(6)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Venonat""","""bug""","""poison""","[""Compoundeyes"", ""Tinted Lens"", ""Run Away""]",60,55,50,45,0
"""Cascoon""","""bug""",,"[""Shed Skin""]",50,35,55,15,0
"""Buzzwole""","""bug""","""fighting""","[""Beast Boost""]",107,139,139,79,1
"""Malamar""","""dark""","""psychic""","[""Contrary"", ""Suction Cups"", ""Infiltrator""]",86,92,88,73,0
"""Jangmo-o""","""dragon""",,"[""Bulletproof"", ""Soundproof"", ""Overcoat""]",45,55,65,45,0
"""Tapu Koko""","""electric""","""fairy""","[""Electric Surge"", ""Telepathy""]",70,115,85,130,1


In [30]:
# Get the 5th pokemon for each group

pokemon.filter(
    pl.arange(1, pl.count() + 1).over("type1") == 5
).head(5)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Kakuna""","""bug""","""poison""","[""Shed Skin""]",45,25,50,35,0
"""Raticate""","""normal""","""dark""","[""Run Away"", ""Guts"", … ""Thick Fat""]",75,71,70,77,0
"""Nidoqueen""","""poison""","""ground""","[""Poison Point"", ""Rivalry"", ""Sheer Force""]",90,92,87,76,0
"""Ninetales""","""fire""","""ice""","[""Flash Fire"", ""Drought"", … ""Snow Warning""]",73,67,75,109,0
"""Gloom""","""grass""","""poison""","[""Chlorophyll"", ""Stench""]",60,65,70,40,0


#### Binning functions: Cut and Qcut

In [31]:
# Assigns the pokemon a category based on their attack stats:
# <= 50 - weak, <= 100 - medium, > 100 - strong

pokemon.select('name', 'type1', 'attack').hstack(
    pokemon.get_column('attack')
    .cut([50, 100], labels=['weak', 'medium', 'strong'], maintain_order=True)
    .select('break_point', 'category')
).head(6)

name,type1,attack,break_point,category
str,str,i64,f64,cat
"""Bulbasaur""","""grass""",49,50.0,"""weak"""
"""Ivysaur""","""grass""",62,100.0,"""medium"""
"""Venusaur""","""grass""",100,100.0,"""medium"""
"""Charmander""","""fire""",52,100.0,"""medium"""
"""Charmeleon""","""fire""",64,100.0,"""medium"""
"""Charizard""","""fire""",104,inf,"""strong"""


In [32]:
# Assigns the pokemon a category based on their attack stats using percentiles:
# bottom 40% - weak, top 20% - strong, rest - medium

pokemon.select('name', 'type1', 'attack').hstack(
    pokemon.get_column('attack')
    .qcut(quantiles=[0.4,0.8], labels=['weak', 'medium', 'strong'], maintain_order=True)
    .select('break_point', 'category')
).head(6)

name,type1,attack,break_point,category
str,str,i64,f64,cat
"""Bulbasaur""","""grass""",49,65.0,"""weak"""
"""Ivysaur""","""grass""",62,65.0,"""weak"""
"""Venusaur""","""grass""",100,104.0,"""medium"""
"""Charmander""","""fire""",52,65.0,"""weak"""
"""Charmeleon""","""fire""",64,65.0,"""weak"""
"""Charizard""","""fire""",104,104.0,"""medium"""


#### Repeat

In [33]:
# Use a different dataframe for this example
# Based on a dataframe with 3 fruits and their order quantities
# Repeat the fruit names based on their order quantities

df = pl.DataFrame(
    {
        "fruit": ["Apple", "Banana", "Cherry"],
        "order_quantity": [1, 3, 2],
    }
)

df

fruit,order_quantity
str,i64
"""Apple""",1
"""Banana""",3
"""Cherry""",2


In [34]:
df.with_columns(
    fruits_repeated = pl.col('fruit').repeat_by('order_quantity')
)

fruit,order_quantity,fruits_repeated
str,i64,list[str]
"""Apple""",1,"[""Apple""]"
"""Banana""",3,"[""Banana"", ""Banana"", ""Banana""]"
"""Cherry""",2,"[""Cherry"", ""Cherry""]"


#### Explode and Implode

In [35]:
# Explode the list of pokemon abilities: from a column of list to a column of values
# Point of attention when using explode: explode increases the number of rows
# All columns in a Polars dataframe should have the same number of rows

pokemon.head(3).select(
    pl.col('abilities')
)

abilities
list[str]
"[""Overgrow"", ""Chlorophyll""]"
"[""Overgrow"", ""Chlorophyll""]"
"[""Overgrow"", ""Chlorophyll""]"


In [36]:
pokemon.head(3).select(
    pl.col('abilities').explode()
)

abilities
str
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""


In [37]:
# Implode is the opposite of explode, and aggregates values into lists

pokemon.head(3).select(
    pl.all().implode(),
)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
list[str],list[str],list[str],list[list[str]],list[i64],list[i64],list[i64],list[i64],list[i64]
"[""Bulbasaur"", ""Ivysaur"", ""Venusaur""]","[""grass"", ""grass"", ""grass""]","[""poison"", ""poison"", ""poison""]","[[""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""]]","[45, 60, 80]","[49, 62, 100]","[49, 63, 123]","[45, 60, 80]","[0, 0, 0]"


#### Conversion between structs and lists

In [38]:
# Combine type 1 and is_legendary in a new struct column

pokemon.select(
    'name',
    type_is_legendary = pl.struct('type1', 'is_legendary')
).head(5)

name,type_is_legendary
str,struct[2]
"""Bulbasaur""","{""grass"",0}"
"""Ivysaur""","{""grass"",0}"
"""Venusaur""","{""grass"",0}"
"""Charmander""","{""fire"",0}"
"""Charmeleon""","{""fire"",0}"


In [39]:
# The struct column retains the type information of each initial column:
# Struct are views on the initial columns so the data is not copied
# The table schema shows that the names of the initial columns are retained 

pokemon.select(
    'name',
    type_is_legendary = pl.struct('type1', 'is_legendary')
).schema

{'name': Utf8,
 'type_is_legendary': Struct([Field('type1', Utf8), Field('is_legendary', Int64)])}

In [40]:
# Combine type 1 and type 2 in a new list column

pokemon.select(
    'name',
    types = pl.concat_list('type1', 'type2')
).head(5)

name,types
str,list[str]
"""Bulbasaur""","[""grass"", ""poison""]"
"""Ivysaur""","[""grass"", ""poison""]"
"""Venusaur""","[""grass"", ""poison""]"
"""Charmander""","[""fire"", null]"
"""Charmeleon""","[""fire"", null]"


In [41]:
# List columns do not retain the name of the initial columns
# In List columns, each row can have a different number of elements, unlike Struct columns
# where each row has the same number of fields

pokemon.select(
    'name',
    types = pl.concat_list('type1', 'type2')
).schema

{'name': Utf8, 'types': List(Utf8)}

In [42]:
# List columns can be converted to a struct
# If number of elements are not the same, then the missing values are filled with null

pokemon.select(
    pl.col('abilities').list.to_struct(n_field_strategy = 'max_width')
).head(5)

abilities
struct[6]
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"


In [43]:
# By default, the fields inside of the struct are called field_0, field_1, etc.

pokemon.select(
    pl.col('abilities').list.to_struct(n_field_strategy = 'max_width')
).schema

{'abilities': Struct([Field('field_0', Utf8), Field('field_1', Utf8), Field('field_2', Utf8), Field('field_3', Utf8), Field('field_4', Utf8), Field('field_5', Utf8)])}

In [44]:
# It's possible to change the name with the argument fields, for example calling them ability_0, ability_1, etc.

pokemon.select(
    pl.col('abilities')
    .list.to_struct(n_field_strategy = 'max_width', fields = lambda i: f'ability_{i}')
).schema

{'abilities': Struct([Field('ability_0', Utf8), Field('ability_1', Utf8), Field('ability_2', Utf8), Field('ability_3', Utf8), Field('ability_4', Utf8), Field('ability_5', Utf8)])}

In [45]:
# We can un-nest struct columns and expand them into multiple columns

pokemon.select(
    pl.col('abilities')
    .list.to_struct(n_field_strategy = 'max_width')
).unnest('abilities').head(5)

field_0,field_1,field_2,field_3,field_4,field_5
str,str,str,str,str,str
"""Overgrow""","""Chlorophyll""",,,,
"""Overgrow""","""Chlorophyll""",,,,
"""Overgrow""","""Chlorophyll""",,,,
"""Blaze""","""Solar Power""",,,,
"""Blaze""","""Solar Power""",,,,


#### Advanced reshaping: Pivot, Melt, Unstack and Transpose

In [46]:
# Pivot type 2 to the columns and count the number of pokemons for each type 1 and type 2 combination

pokemon.head(20).pivot(
    values = 'name',
    index = 'type1',
    columns = 'type2',
    aggregate_function=pl.element().count()
)


type1,poison,null,flying,dark
str,u32,u32,u32,u32
"""grass""",3.0,,,
"""fire""",,2.0,1.0,
"""water""",,3.0,,
"""bug""",3.0,2.0,1.0,
"""normal""",,,3.0,2.0


In [47]:
# Melt is the opposite of pivot, it brings the header of multiple columns into one column
# and their value in another column

pokemon.melt(
    id_vars = ['name', 'type1', 'type2'],
    value_vars = ['hp','attack', 'defense'],
    variable_name = 'stat',
).sort(by='name').head(6)

name,type1,type2,stat,value
str,str,str,str,i64
"""Abomasnow""","""grass""","""ice""","""hp""",90
"""Abomasnow""","""grass""","""ice""","""attack""",132
"""Abomasnow""","""grass""","""ice""","""defense""",105
"""Abra""","""psychic""",,"""hp""",25
"""Abra""","""psychic""",,"""attack""",20
"""Abra""","""psychic""",,"""defense""",15


In [48]:
# Unstack breaks the dataframe into multiple groups of the same size 
# and moves these groups to new columns
# Here we split the dataframe into groups of 3, and add a column with the level of the pokemon

(
    pokemon.select('name', 'attack').head(9)
    .unstack(columns = ['name', 'attack'], step=3, how="vertical")
    .with_columns(
        level = pl.arange(1,4)
        )
)

name_0,name_1,name_2,attack_0,attack_1,attack_2,level
str,str,str,i64,i64,i64,i64
"""Bulbasaur""","""Charmander""","""Squirtle""",49,52,48,1
"""Ivysaur""","""Charmeleon""","""Wartortle""",62,64,63,2
"""Venusaur""","""Charizard""","""Blastoise""",100,104,103,3


In [49]:
# Transpose inverses the rows and columns of a dataframe
# It's a computationally expensive operation, so it should be used only if no other option is available

pokemon.head(3).select(
    'name', 'type1', 'type2'
    ).transpose(include_header= True)

column,column_0,column_1,column_2
str,str,str,str
"""name""","""Bulbasaur""","""Ivysaur""","""Venusaur"""
"""type1""","""grass""","""grass""","""grass"""
"""type2""","""poison""","""poison""","""poison"""


#### Merge DataFrames: hstack, vstack, extend, concat, join

In [77]:
# Horizontally stack 2 dataframes
# We have a new dataframe with the pokemon color

pokemon_color = pl.DataFrame({
    'color': ['green', 'green', 'green', 'red', 'red', 'red']
})

pokemon.select('name','type1','type2','abilities').head(6).hstack(pokemon_color)

name,type1,type2,abilities,color
str,str,str,list[str],str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]","""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]","""green"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]","""green"""
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]","""red"""
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]","""red"""
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]","""red"""


In [51]:
# Vertically stack 2 dataframes
# We have a new dataframe with a new pokemon

new_pokemon = pl.DataFrame({
    'name': ['Polarizard'],
    'type1': ['ice'],
    'type2': ['flying'],
    'abilities': [['snow warning', 'blaze']]
})

pokemon.select('name','type1','type2','abilities').vstack(new_pokemon).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [52]:
# Extend is similar to vstack, but it copies the data instead of referencing it

pokemon.select('name','type1','type2','abilities').extend(new_pokemon).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [53]:
# concat can reproduce both hstack and vstack when rechunk is set to False
# when rechunk is set to True, all data is copied to a contiguous memory space which allows it to be faster

pl.concat(
    [pokemon.select('name','type1','type2','abilities'), new_pokemon],
    rechunk = True,
    how= 'vertical'
).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [78]:
# concat diagonal stacks dataframes diagonally
# this means that columns missing from one dataframe are filled with nulls

new_pokemon_color = pl.DataFrame({
    'name': ['Polarizard'],
    'abilities': [['Snow warning', 'Blaze']],
    'color': ['white']
})

pl.concat([
    pokemon.select('name','type1','type2','abilities'),
    new_pokemon_color
    ],
    how = 'diagonal'
).tail(5)

name,type1,type2,abilities,color
str,str,str,list[str],str
"""Kartana""","""grass""","""steel""","[""Beast Boost""]",
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]",
"""Necrozma""","""psychic""",,"[""Prism Armor""]",
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]",
"""Polarizard""",,,"[""Snow warning"", ""Blaze""]","""white"""


In [79]:
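# concat has another strategy, which is align.
# the align strategy finds the columns shared by the dataframes (here 'name'), aligns the rows on them
# like a full outer join (sorted by the key), and fills the missing values with nulls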

pokemon_new_order = pl.DataFrame({
    'name' : ['Bulbasaur', 'Charmander', 'Squirtle'],
    'color': ['green', 'red', 'blue']
})

pl.concat([
    pokemon.select('name','type1',).head(4),
    pokemon_new_order
], how='align'
)

name,type1,color
str,str,str
"""Bulbasaur""","""grass""","""green"""
"""Charmander""","""fire""","""red"""
"""Ivysaur""","""grass""",
"""Squirtle""",,"""blue"""
"""Venusaur""","""grass""",


In [56]:
# join reproduces SQL joins, such as inner, left, outer, semi, anti, cross
# inner join keeps only the rows that are present in both dataframes

pokemon_new = pl.DataFrame({
    'name' : ['Bulbasaur', 'Polarizard'],
    'color': ['green', 'white']
})

pokemon.head(3).join(pokemon_new, on = 'name', how = 'inner')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""


In [57]:
# left join keeps all rows from the left dataframe and fills the missing values with nulls
# to keep the rows from the right dataframe, we can inverse the order of the dataframes

pokemon.head(3).join(pokemon_new, on = 'name', how = 'left')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,


In [58]:
# outer join keeps all rows from both dataframes and fills the missing values with nulls

pokemon.head(3).join(pokemon_new, on = 'name', how = 'outer')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45.0,49.0,49.0,45.0,0.0,"""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60.0,62.0,63.0,60.0,0.0,
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80.0,100.0,123.0,80.0,0.0,
"""Polarizard""",,,,,,,,,"""white"""


In [59]:
# semi keeps the rows from the left dataframe that are present in the right dataframe
# it does not add any columns from the right dataframe (differently from inner)

pokemon.head(3).join(pokemon_new, on = 'name', how = 'semi')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0


In [60]:
# anti keeps the rows from the left dataframe that are NOT present in the right dataframe
# it is the opposite of semi, and it does not add any columns from the right dataframe

pokemon.head(3).join(pokemon_new, on = 'name', how = 'anti')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0


In [81]:
# cross combines all rows of the first dataframe with all rows of the second dataframe

pokemon_trainers = pl.DataFrame({
    'trainer': ['trainer1', 'trainer2']
})

pokemon.head(2).join(pokemon_trainers, how = 'cross')

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,trainer
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""trainer1"""
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""trainer2"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""trainer1"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""trainer2"""


#### Custom functions: apply, map

In [62]:
# Both apply and map will be slower than using the native polars functions. 
# It's recommended to avoid apply and map whenever possible
# a common use case for apply and map is passing data to a third-party library
# map applies a function to a full column (which is a Polars series)
# apply does the same, but one row at a time
# map is faster than apply

In [82]:
# increase the attack by 10% for pokemons with attack < 50
# in the first 2 functions, we receive a column (a Polars Series) and process it using a Python list comprehension
# the first function returns a Python list, the second a Polars series
# the last function receives a single value and returns a single value

In [88]:
def simulated_attack_list(attack_column):
    """Return a plain Python list with every attack below 50 raised by 10%.

    attack_column is any iterable of numbers. Values that are not below 50
    are copied through unchanged, and the input order is kept.
    """
    boosted = []
    for attack in attack_column:
        if attack < 50:
            boosted.append(attack * 1.1)
        else:
            boosted.append(attack)
    return boosted

# the function returns a plain Python list, so every row ends up holding
# the whole list (dtype list[f64]) instead of a single number
pokemon.select(
    'name','attack',
    simulated_attack = pl.col('attack').map(simulated_attack_list)
).head(3)

name,attack,simulated_attack
str,i64,list[f64]
"""Bulbasaur""",49,"[53.9, 62.0, … 95.0]"
"""Ivysaur""",62,"[53.9, 62.0, … 95.0]"
"""Venusaur""",100,"[53.9, 62.0, … 95.0]"


In [89]:
def simulated_attack_series(attack_column):
    """Return a Polars Series with every attack below 50 raised by 10%.

    attack_column is iterated value by value in Python. Values that are not
    below 50 are kept as they are, and the result is wrapped in pl.Series.
    """
    boosted = []
    for attack in attack_column:
        boosted.append(attack * 1.1 if attack < 50 else attack)
    return pl.Series(boosted)

# returning a Polars Series gives one value per row (dtype f64)
pokemon.select(
    'name','attack',
    simulated_attack = pl.col('attack').map(simulated_attack_series)
).head(3)

name,attack,simulated_attack
str,i64,f64
"""Bulbasaur""",49,53.9
"""Ivysaur""",62,62.0
"""Venusaur""",100,100.0


In [90]:
def simulated_attack_single_value(attack_number):
    """Return attack_number raised by 10% when it is below 50, else unchanged."""
    if attack_number < 50:
        return attack_number * 1.1
    return attack_number

# apply calls the function with one attack value at a time (result dtype f64)
pokemon.select(
    'name','attack',    
    simulated_attack = pl.col('attack').apply(simulated_attack_single_value)
).head(3)

name,attack,simulated_attack
str,i64,f64
"""Bulbasaur""",49,53.9
"""Ivysaur""",62,62.0
"""Venusaur""",100,100.0


In [67]:
# let's check their speed
# we create a bigger dataframe by repeating the original one 100 times
# no need to test the first function (simulated_attack_list), as its result is not what we want
# rechunk = True asks Polars to store the concatenated result in contiguous memory

pokemon_100 = pl.concat([pokemon] * 100, rechunk = True)

In [68]:
%%timeit
pokemon_100.select(
    'name', 'attack'
    simulated_attack = pl.col('attack').map(simulated_attack_series)
)

13.2 ms ± 401 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [69]:
%%timeit
pokemon_100.select(
    'name', 'attack'
    simulated_attack = pl.col('attack').apply(simulated_attack_single_value)
)

17.3 ms ± 598 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [70]:
%%timeit
pokemon_100.select(
    'name', 'attack'
    simulated_attack = pl.when(pl.col('attack') < 50).then(pl.col('attack') * 1.1).otherwise(pl.col('attack'))
)

1.94 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [71]:
# apply and map are considerably slower than the native Polars functions
# in this case by a factor of roughly 7 to 9 (13.2 ms and 17.3 ms vs 1.94 ms)
# map is faster than apply because we operate on the full column instead of one row at a time

In [72]:
# inside groupby, we can use apply to apply a function to each group

def simulated_attack_list_to_scalar(attack_column):
    """Return the sum of the attacks after raising every attack below 50 by 10%.

    attack_column is any iterable of numbers. Values that are not below 50
    enter the sum unchanged. An empty input gives 0.
    """
    boosted = (
        attack * 1.1 if attack < 50 else attack
        for attack in attack_column
    )
    return sum(boosted)

# for each type1 group, the function reduces that group's attack values to one number
# return_dtype declares the result column as Float64
pokemon.head(6).groupby('type1').agg(
    pl.col('attack').apply(simulated_attack_list_to_scalar, return_dtype = pl.Float64)
)

type1,attack
str,f64
"""fire""",220.0
"""grass""",215.9


In [93]:
# if we want to apply a function to multiple columns, we can use pl.struct
# to create a struct, then use apply with it
# the lambda is called once per row and reads each field by its column name

pokemon.head(3).select(
    'name','attack','defense',
    attack_plus_defense = pl.struct('attack','defense')
    .apply(lambda columns: columns['attack'] + columns['defense'])
)

name,attack,defense,attack_plus_defense
str,i64,i64,i64
"""Bulbasaur""",49,49,98
"""Ivysaur""",62,63,125
"""Venusaur""",100,123,223


In [94]:
# we can also use lru_cache to cache the results of a function
# this is useful when we have a function that is called multiple times with the same arguments
# in this case, we have a function that returns a list of the characters of the type

from functools import lru_cache

@lru_cache(maxsize=2048)
def modify_type_cached(type_of_pokemon):
    """Split a Pokemon type into a list of its characters, with memoization.

    Results for up to 2048 distinct arguments are kept in an LRU cache, so a
    repeated type is answered from the cache instead of being recomputed.
    A cache hit returns the same list object as the earlier call, so callers
    should not mutate the returned list.
    """
    return [*type_of_pokemon]

def modify_type(type_of_pokemon):
    """Split a Pokemon type into a new list of its characters (no caching)."""
    return [*type_of_pokemon]

In [95]:
%%timeit
# baseline: the uncached function is used for every type1 value
pokemon_100.select(
    pl.col('type1').apply(modify_type)
)

1.07 s ± 117 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [96]:
%%timeit
# cached version: repeated type1 values are answered from the lru_cache
pokemon_100.select(
    pl.col('type1').apply(modify_type_cached)
)

813 ms ± 53.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
