### Intermediate Data Transformation with Polars

The examples below use `.head()` to reduce the output to a few rows and take up less space.
If you want the full output, remove `.head()` from the code.

This notebooked is divided into sections. If your code editor supports it, you can use the **Outline** functionality to easily go to the code section you are interested in.

For more details on Polars functions, check out the Polars API reference: https://pola-rs.github.io/polars/py-polars/html/reference/index.html

In [4]:
import polars as pl

# Configure the number of characters to show for each string column, and the number of decimals to show for float columns
pl.Config.set_fmt_str_lengths(30)
pl.Config.set_float_precision(2)

polars.config.Config

In [5]:
pokemon = pl.read_parquet("../datasets/pokemon_simplified.parquet")

In [6]:
pokemon.head(3)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0


#### Conditions

In [7]:
# Identify the top 25% of Pokemon based on the attack stats
# Use the quantile function to get the 75th percentile

pokemon.select(
    "name",
    "attack",
    attack_category=pl.when(pl.col("attack") > pl.col("attack").quantile(0.75)).then(
        pl.lit("Top 25%")
    ),
).head(6)

name,attack,attack_category
str,i64,str
"""Bulbasaur""",49,
"""Ivysaur""",62,
"""Venusaur""",100,
"""Charmander""",52,
"""Charmeleon""",64,
"""Charizard""",104,"""Top 25%"""


In [8]:
# Identify the top 25% of Pokemon based on the attack stats
# Use otherwise to assign the remaining 75% of Pokemon to the 'Other' category

pokemon.select(
    "name",
    "attack",
    attack_category=pl.when(pl.col("attack") > pl.col("attack").quantile(0.75))
    .then(pl.lit("Top 25%"))
    .otherwise(pl.lit("Other")),
).head(6)

name,attack,attack_category
str,i64,str
"""Bulbasaur""",49,"""Other"""
"""Ivysaur""",62,"""Other"""
"""Venusaur""",100,"""Other"""
"""Charmander""",52,"""Other"""
"""Charmeleon""",64,"""Other"""
"""Charizard""",104,"""Top 25%"""


In [9]:
# Identify the top 25% of Pokemon based on the attack stats AND the bottom 25%
# Use multiple when then statements

pokemon.select(
    "name",
    "attack",
    attack_category=pl.when(pl.col("attack") > pl.col("attack").quantile(0.75))
    .then(pl.lit("Top 25%"))
    .when(pl.col("attack") < pl.col("attack").quantile(0.25))
    .then(pl.lit("Bottom 25%"))
    .otherwise(pl.lit("Other")),
).head(6)

name,attack,attack_category
str,i64,str
"""Bulbasaur""",49,"""Bottom 25%"""
"""Ivysaur""",62,"""Other"""
"""Venusaur""",100,"""Other"""
"""Charmander""",52,"""Bottom 25%"""
"""Charmeleon""",64,"""Other"""
"""Charizard""",104,"""Top 25%"""


In [10]:
# We can use replace instead of pl.when().then() to map colors to Pokemon types

type_colors = {"grass": "green", "fire": "red"}

pokemon.select("name", "type1", color=pl.col("type1").replace(type_colors)).head(7)

name,type1,color
str,str,str
"""Bulbasaur""","""grass""","""green"""
"""Ivysaur""","""grass""","""green"""
"""Venusaur""","""grass""","""green"""
"""Charmander""","""fire""","""red"""
"""Charmeleon""","""fire""","""red"""
"""Charizard""","""fire""","""red"""
"""Squirtle""","""water""","""water"""


#### One hot encoding

In [11]:
# Convert the column type1 to dummies

pokemon.head(6).select("name", "type1").to_dummies(columns=["type1"])

name,type1_fire,type1_grass
str,u8,u8
"""Bulbasaur""",0,1
"""Ivysaur""",0,1
"""Venusaur""",0,1
"""Charmander""",1,0
"""Charmeleon""",1,0
"""Charizard""",1,0


#### Aggregating data

In [12]:
# Group by type and calculate median stats
# Group by is run in parallel so the order of the rows can change

pokemon.group_by("type1").agg(pl.col("hp", "attack", "defense", "speed").median()).head(3)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""dragon""",75.0,100.0,90.0,80.0
"""flying""",79.0,70.0,80.0,121.0
"""normal""",70.0,75.0,60.0,68.0


In [13]:
# Group by type and calculate median stats
# Maintaining the order is possible but it is slower

pokemon.group_by("type1", maintain_order=True).agg(
    pl.col("hp", "attack", "defense", "speed").median()
).head(3)

type1,hp,attack,defense,speed
str,f64,f64,f64,f64
"""grass""",65.0,68.0,66.0,55.0
"""fire""",65.0,79.5,61.5,72.5
"""water""",68.0,70.0,70.0,64.0


In [14]:
# Group on multiple columns works the same way

pokemon.group_by(["type1", "type2"]).agg(
    pl.col("hp", "attack", "defense", "speed").median()
).head(3)

type1,type2,hp,attack,defense,speed
str,str,f64,f64,f64,f64
"""grass""","""dark""",70.0,100.0,60.0,60.0
"""dragon""","""fighting""",65.0,92.5,107.5,75.0
"""fire""","""ground""",65.0,90.0,70.0,27.5


In [15]:
# Lots of aggregations functions are available

pokemon.group_by("type1").agg(
    avg_attack=pl.col("attack").mean(),
    max_attack=pl.col("attack").max(),
    quantile_75=pl.col("attack").quantile(0.75),
    count=pl.count(),
    first_pokemon=pl.first("name"),
    strongest_pokemon=pl.col("name").sort_by("attack", descending=True).first(),
).head(3)

  count=pl.count(),


type1,avg_attack,max_attack,quantile_75,count,first_pokemon,strongest_pokemon
str,f64,i64,f64,u32,str,str
"""fairy""",62.11,131,72.0,18,"""Clefairy""","""Xerneas"""
"""psychic""",65.57,165,85.0,53,"""Abra""","""Gallade"""
"""flying""",66.67,100,100.0,3,"""Tornadus""","""Tornadus"""


In [16]:
# Possible to use Numpy universal functions (ufuncs) in aggregations

import numpy as np

pokemon.group_by("type1").agg(
    sqrt_attack_mean=np.sqrt(
        pl.col("attack")
    ).mean()  # sqrt is the square root function
).head(3)

type1,sqrt_attack_mean
str,f64
"""dragon""",10.19
"""electric""",8.28
"""dark""",9.28


In [17]:
# If aggregation functions are not specified, the result if aggregated in a list

pokemon.group_by("type1").agg(pl.col("name").head(3)).head(3)

type1,name
str,list[str]
"""fire""","[""Charmander"", ""Charmeleon"", ""Charizard""]"
"""dragon""","[""Dratini"", ""Dragonair"", ""Dragonite""]"
"""grass""","[""Bulbasaur"", ""Ivysaur"", ""Venusaur""]"


In [18]:
# Concatenate the names of the pokemon into a string

pokemon.group_by("type1").agg(pl.col("name").head(3).str.concat(", ")).head(3)

type1,name
str,str
"""fairy""","""Clefairy, Clefable, Cleffa"""
"""ghost""","""Gastly, Haunter, Gengar"""
"""fire""","""Charmander, Charmeleon, Chariz…"


In [19]:
# Aggregation with conditions
# Calculate the average attack for Pokemons with a speed above 100, by type

pokemon.group_by("type1").agg(
    avg_attack_when_speed_above_50=pl.col("attack").filter(pl.col("speed") > 50).mean(),
    avg_attack_when_speed_above_100=pl.col("attack")
    .filter(pl.col("speed") > 100)
    .mean(),
).head(3)

type1,avg_attack_when_speed_above_50,avg_attack_when_speed_above_100
str,f64,f64
"""poison""",76.95,77.0
"""ghost""",77.14,62.5
"""fighting""",102.53,120.67


In [20]:
# We can also create functions to use in aggregations


def avg_attack_when_speed_above_threshold(speed):
    result = (
        pl.col("attack")
        .filter(pl.col("speed") > speed)
        .mean()
        .alias(f"avg_attack_when_speed_above_{speed}")
    )
    return result


pokemon.group_by("type1").agg(
    avg_attack_when_speed_above_threshold(speed) for speed in [50, 100]
).head(3)

type1,avg_attack_when_speed_above_50,avg_attack_when_speed_above_100
str,f64,f64
"""fairy""",72.25,
"""steel""",106.1,117.5
"""flying""",66.67,85.0


In [21]:
# Show the top 3 pokemon by attack and the top 3 pokemon by health in each group
(
    pokemon.group_by("type1").agg(
        top_3_pokemon_by_attack=pl.col("name")
        .sort_by("attack", descending=True)
        .head(3),
        top_3_pokemon_by_defense=pl.col("name")
        .sort_by("defense", descending=True)
        .head(3),
    )
).head(3)

type1,top_3_pokemon_by_attack,top_3_pokemon_by_defense
str,list[str],list[str]
"""flying""","[""Tornadus"", ""Noivern"", ""Noibat""]","[""Tornadus"", ""Noivern"", ""Noibat""]"
"""ground""","[""Groudon"", ""Landorus"", ""Rhyperior""]","[""Groudon"", ""Rhyperior"", ""Gliscor""]"
"""steel""","[""Aegislash"", ""Metagross"", ""Aggron""]","[""Steelix"", ""Aggron"", ""Metagross""]"


#### Window functions

In [22]:
# For each pokemon show its attack, the average attack for pokemons of the same type 1
# and the ratio of the attack of the pokemon vs the average attack for pokemons of the same type 1

pokemon.select(
    "name",
    "type1",
    "attack",
    avg_attack_for_same_type1=pl.mean("attack").over("type1"),
    ratio_attack_vs_avg_attack=(pl.col("attack") / pl.mean("attack")).over("type1"),
).head(5)

name,type1,attack,avg_attack_for_same_type1,ratio_attack_vs_avg_attack
str,str,i64,f64,f64
"""Bulbasaur""","""grass""",49,73.77,0.66
"""Ivysaur""","""grass""",62,73.77,0.84
"""Venusaur""","""grass""",100,73.77,1.36
"""Charmander""","""fire""",52,81.5,0.64
"""Charmeleon""","""fire""",64,81.5,0.79


In [23]:
# For each pokemon show its attack and the average attack for pokemons of the same type 1 and pokemons of the same type 2

pokemon.select(
    "name",
    "type1",
    "type2",
    "attack",
    avg_attack_for_same_type1=pl.mean("attack").over("type1"),
    avg_attack_for_same_type2=pl.mean("attack").over("type2"),
).head(5)

name,type1,type2,attack,avg_attack_for_same_type1,avg_attack_for_same_type2
str,str,str,i64,f64,f64
"""Bulbasaur""","""grass""","""poison""",49,73.77,67.62
"""Ivysaur""","""grass""","""poison""",62,73.77,67.62
"""Venusaur""","""grass""","""poison""",100,73.77,67.62
"""Charmander""","""fire""",,52,81.5,74.23
"""Charmeleon""","""fire""",,64,81.5,74.23


In [24]:
df = pl.DataFrame({"group": ["A", "A", "B", "B", "A"], "value": [1, 2, 3, 4, 5]})

In [25]:
# For each pokemon, show the top 3 pokemon by attack of the same type

pokemon.select(
    "name",
    "type1",
    "attack",
    top_3_attack_pokemon_same_type=pl.col("name")
    .sort_by("attack", descending=True)
    .head(3)
    .over("type1", mapping_strategy="join"),
).head(5)

name,type1,attack,top_3_attack_pokemon_same_type
str,str,i64,list[str]
"""Bulbasaur""","""grass""",49,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Ivysaur""","""grass""",62,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Venusaur""","""grass""",100,"[""Kartana"", ""Abomasnow"", ""Breloom""]"
"""Charmander""","""fire""",52,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"
"""Charmeleon""","""fire""",64,"[""Blaziken"", ""Flareon"", ""Ho-Oh""]"


In [26]:
# Show top 3 pokemon by attack and defense

pokemon.select(
    pl.col("type1").head(3).over("type1", mapping_strategy="explode"),
    top_3_attack=pl.col("name")
    .sort_by("attack", descending=True)
    .head(3)
    .over("type1", mapping_strategy="explode"),
    top_3_defense=pl.col("name")
    .sort_by("defense", descending=True)
    .head(3)
    .over("type1", mapping_strategy="explode"),
).head(6)

type1,top_3_attack,top_3_defense
str,str,str
"""grass""","""Kartana""","""Ferrothorn"""
"""grass""","""Abomasnow""","""Kartana"""
"""grass""","""Breloom""","""Leafeon"""
"""fire""","""Blaziken""","""Torkoal"""
"""fire""","""Flareon""","""Turtonator"""
"""fire""","""Ho-Oh""","""Magcargo"""


In [27]:
# Rank the pokemon by attack and defense by type

pokemon.select(
    "name",
    "type1",
    "attack",
    "defense",
    attack_rank=pl.col("attack").rank(method="ordinal", descending=True).over("type1"),
    defense_rank=pl.col("defense")
    .rank(method="ordinal", descending=True)
    .over("type1"),
).head(6)

name,type1,attack,defense,attack_rank,defense_rank
str,str,i64,i64,u32,u32
"""Bulbasaur""","""grass""",49,49,61,62
"""Ivysaur""","""grass""",62,63,45,45
"""Venusaur""","""grass""",100,123,16,5
"""Charmander""","""fire""",52,43,45,44
"""Charmeleon""","""fire""",64,58,37,29
"""Charizard""","""fire""",104,78,10,14


#### Range function

In [28]:
# Add an increasing index for the pokemon of each type

pokemon.select(
    "name", "type1", type_index=pl.int_range(1, pl.len() + 1).over("type1")
).head(6)

name,type1,type_index
str,str,i64
"""Bulbasaur""","""grass""",1
"""Ivysaur""","""grass""",2
"""Venusaur""","""grass""",3
"""Charmander""","""fire""",1
"""Charmeleon""","""fire""",2
"""Charizard""","""fire""",3


In [29]:
# Take a random sample of 2 pokemons for each group

pokemon.select("name", "type1").filter(
    pl.int_range(1, pl.len() + 1).shuffle().over("type1") <= 2
).sort(by="type1").head(4)

name,type1
str,str
"""Parasect""","""bug"""
"""Shedinja""","""bug"""
"""Zorua""","""dark"""
"""Guzzlord""","""dark"""


In [30]:
# Take a random sample of 5% of pokemons for each group

pokemon.filter(
    pl.int_range(1, pl.len() + 1).shuffle().over("type1")
    <= pl.len().over("type1") * 0.05
).sort(by="type1").head(6)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Wormadam""","""bug""","""grass""","[""Anticipation"", ""Overcoat""]",60,69,95,36,0
"""Larvesta""","""bug""","""fire""","[""Flame Body"", ""Swarm""]",55,85,55,60,0
"""Vivillon""","""bug""","""flying""","[""Shield Dust"", ""Compoundeyes"", ""Friend Guard""]",80,52,50,89,0
"""Inkay""","""dark""","""psychic""","[""Contrary"", ""Suction Cups"", ""Infiltrator""]",53,54,53,45,0
"""Reshiram""","""dragon""","""fire""","[""Turboblaze""]",100,120,100,90,1
"""Electrike""","""electric""",,"[""Static"", ""Lightningrod"", ""Minus""]",40,45,40,65,0


#### Binning functions: Cut and Qcut

In [31]:
# Assigns the pokemon a category based on their attack stats:
# <= 50 - weak, <= 100 - medium, > 100 - strong

pokemon.select(
    "name",
    "type1",
    "attack",
    attack_category=pl.col("attack").cut(
        [50, 100], labels=["weak", "medium", "strong"]
    ),
).head(6)

name,type1,attack,attack_category
str,str,i64,cat
"""Bulbasaur""","""grass""",49,"""weak"""
"""Ivysaur""","""grass""",62,"""medium"""
"""Venusaur""","""grass""",100,"""medium"""
"""Charmander""","""fire""",52,"""medium"""
"""Charmeleon""","""fire""",64,"""medium"""
"""Charizard""","""fire""",104,"""strong"""


In [32]:
# Assigns the pokemon a category based on their attack stats using percentiles:
# bottom 40% - weak, top 20% - strong, rest - medium

pokemon.select(
    "name",
    "type1",
    "attack",
    attack_category_percentiles=pl.col("attack").qcut(
        [0.4, 0.8], labels=["weak", "medium", "strong"]
    ),
).head(6)

name,type1,attack,attack_category_percentiles
str,str,i64,cat
"""Bulbasaur""","""grass""",49,"""weak"""
"""Ivysaur""","""grass""",62,"""weak"""
"""Venusaur""","""grass""",100,"""medium"""
"""Charmander""","""fire""",52,"""weak"""
"""Charmeleon""","""fire""",64,"""weak"""
"""Charizard""","""fire""",104,"""medium"""


#### Repeat

In [33]:
# Use a different dataframe for this example
# Based on a dataframe with 3 fruits and their order quantities
# Repeat the fruit names based on their order quantities

df = pl.DataFrame(
    {
        "fruit": ["Apple", "Banana", "Cherry"],
        "order_quantity": [1, 3, 2],
    }
)

df

fruit,order_quantity
str,i64
"""Apple""",1
"""Banana""",3
"""Cherry""",2


In [34]:
df.with_columns(fruits_repeated=pl.col("fruit").repeat_by("order_quantity"))

fruit,order_quantity,fruits_repeated
str,i64,list[str]
"""Apple""",1,"[""Apple""]"
"""Banana""",3,"[""Banana"", ""Banana"", ""Banana""]"
"""Cherry""",2,"[""Cherry"", ""Cherry""]"


#### Explode and Implode

In [35]:
# Explode the list of pokemon abilities: from a column of list to a column of values
# Point of attention when using explode: explode increases the number of rows
# All columns in a Polars dataframe should have the same number of rows

pokemon.head(3).select(pl.col("abilities"))

abilities
list[str]
"[""Overgrow"", ""Chlorophyll""]"
"[""Overgrow"", ""Chlorophyll""]"
"[""Overgrow"", ""Chlorophyll""]"


In [36]:
pokemon.head(3).select(pl.col("abilities").explode())

abilities
str
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""
"""Overgrow"""
"""Chlorophyll"""


In [37]:
# Implode is the opposite of explode, and aggregates values into lists

pokemon.head(3).select(
    pl.all().implode(),
)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
list[str],list[str],list[str],list[list[str]],list[i64],list[i64],list[i64],list[i64],list[i64]
"[""Bulbasaur"", ""Ivysaur"", ""Venusaur""]","[""grass"", ""grass"", ""grass""]","[""poison"", ""poison"", ""poison""]","[[""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""], [""Overgrow"", ""Chlorophyll""]]","[45, 60, 80]","[49, 62, 100]","[49, 63, 123]","[45, 60, 80]","[0, 0, 0]"


#### Conversion between structs and lists

In [38]:
# Combine type 1 and type 2 in a new struct column

pokemon.select("name", type_is_legendary=pl.struct("type1", "is_legendary")).head(5)

name,type_is_legendary
str,struct[2]
"""Bulbasaur""","{""grass"",0}"
"""Ivysaur""","{""grass"",0}"
"""Venusaur""","{""grass"",0}"
"""Charmander""","{""fire"",0}"
"""Charmeleon""","{""fire"",0}"


In [39]:
# The struct column retains the type information of each initial column:
# Struct are views on the initial columns so the data is not copied
# The table schema shows that the names of the initial columns are retained

pokemon.select("name", type_is_legendary=pl.struct("type1", "is_legendary")).schema

Schema([('name', String),
        ('type_is_legendary',
         Struct({'type1': String, 'is_legendary': Int64}))])

In [40]:
# Combine type 1 and type 2 in a new list column

pokemon.select("name", types=pl.concat_list("type1", "type2")).head(5)

name,types
str,list[str]
"""Bulbasaur""","[""grass"", ""poison""]"
"""Ivysaur""","[""grass"", ""poison""]"
"""Venusaur""","[""grass"", ""poison""]"
"""Charmander""","[""fire"", null]"
"""Charmeleon""","[""fire"", null]"


In [41]:
# List columns do not retain the name of the initial columns
# In List columns, each row can have a different number of elements differently from Struct columns
# where each row has the same number of elements

pokemon.select("name", types=pl.concat_list("type1", "type2")).schema

Schema([('name', String), ('types', List(String))])

In [42]:
pokemon.select("name", type_is_legendary=pl.struct("type1", "is_legendary")).unnest(
    "type_is_legendary"
).head(5)

name,type1,is_legendary
str,str,i64
"""Bulbasaur""","""grass""",0
"""Ivysaur""","""grass""",0
"""Venusaur""","""grass""",0
"""Charmander""","""fire""",0
"""Charmeleon""","""fire""",0


In [43]:
# List columns can be converted to a struct
# If number of elements are not the same, then the missing values are filled with null

pokemon.select(pl.col("abilities").list.to_struct(n_field_strategy="max_width")).head(5)

abilities
struct[6]
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Overgrow"",""Chlorophyll"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"
"{""Blaze"",""Solar Power"",null,null,null,null}"


In [44]:
# By default, the fields inside of the struct are called field_0, field_1, etc.

pokemon.select(pl.col("abilities").list.to_struct(n_field_strategy="max_width")).schema

Schema([('abilities',
         Struct({'field_0': String, 'field_1': String, 'field_2': String, 'field_3': String, 'field_4': String, 'field_5': String}))])

In [45]:
# It's possible to change the name with the argument fields, for example calling them ability_0, ability_1, etc.

pokemon.select(
    pl.col("abilities").list.to_struct(
        n_field_strategy="max_width", fields=lambda i: f"ability_{i}"
    )
).schema

Schema([('abilities',
         Struct({'ability_0': String, 'ability_1': String, 'ability_2': String, 'ability_3': String, 'ability_4': String, 'ability_5': String}))])

In [46]:
# We can un-nest struct columns and expand them into multiple columns

pokemon.select(
    "name",
    pl.col("abilities").list.to_struct(
        n_field_strategy="max_width", fields=lambda i: f"ability_{i}"
    ),
).unnest("abilities").head(5)

name,ability_0,ability_1,ability_2,ability_3,ability_4,ability_5
str,str,str,str,str,str,str
"""Bulbasaur""","""Overgrow""","""Chlorophyll""",,,,
"""Ivysaur""","""Overgrow""","""Chlorophyll""",,,,
"""Venusaur""","""Overgrow""","""Chlorophyll""",,,,
"""Charmander""","""Blaze""","""Solar Power""",,,,
"""Charmeleon""","""Blaze""","""Solar Power""",,,,


#### Advanced reshaping: Pivot, Melt, Unstack and Transpose

In [47]:
# Pivot type 2 to the columns and count the number of pokemons for each type 1 and type 2 combination

pokemon.head(20).pivot(
    values="name",
    index="type1",
    columns="type2",
    aggregate_function=pl.element().count(),
)

  pokemon.head(20).pivot(


type1,poison,null,flying,dark
str,u32,u32,u32,u32
"""grass""",3.0,,,
"""fire""",,2.0,1.0,
"""water""",,3.0,,
"""bug""",3.0,2.0,1.0,
"""normal""",,,3.0,2.0


In [48]:
# Melt is the opposite of pivot, it brings the header of multiple columns into one column
# and their value in another column

pokemon.melt(
    id_vars=["name", "type1", "type2"],
    value_vars=["hp", "attack", "defense"],
    variable_name="stat",
).sort(by="name").head(6)

  pokemon.melt(


name,type1,type2,stat,value
str,str,str,str,i64
"""Abomasnow""","""grass""","""ice""","""hp""",90
"""Abomasnow""","""grass""","""ice""","""attack""",132
"""Abomasnow""","""grass""","""ice""","""defense""",105
"""Abra""","""psychic""",,"""hp""",25
"""Abra""","""psychic""",,"""attack""",20
"""Abra""","""psychic""",,"""defense""",15


In [49]:
# Unstack breaks the dataframe into multiple groups of the same size
# and moves these groups to new columns
# Here we split the dataframe into groups of 3, and add a column with the level of the pokemon

(
    pokemon.select("name", "attack")
    .head(9)
    .unstack(columns=["name", "attack"], step=3, how="vertical")
    .with_columns(level=pl.int_range(1, 4))
)

name_0,name_1,name_2,attack_0,attack_1,attack_2,level
str,str,str,i64,i64,i64,i64
"""Bulbasaur""","""Charmander""","""Squirtle""",49,52,48,1
"""Ivysaur""","""Charmeleon""","""Wartortle""",62,64,63,2
"""Venusaur""","""Charizard""","""Blastoise""",100,104,103,3


In [50]:
# Transpose inverses the rows and columns of a dataframe
# It's a computationally expensive operation, so it should be used only if no other option is available

pokemon.head(3).select("name", "type1", "attack").transpose(include_header=True)

column,column_0,column_1,column_2
str,str,str,str
"""name""","""Bulbasaur""","""Ivysaur""","""Venusaur"""
"""type1""","""grass""","""grass""","""grass"""
"""attack""","""49""","""62""","""100"""


#### Merge DataFrames: hstack, vstack, extend, concat, join

In [51]:
# Horizontally stack 2 dataframes
# We have a new dataframe with the pokemon color

pokemon_color = pl.DataFrame(
    {"color": ["green", "green", "green", "red", "red", "red"]}
)

pokemon.select("name", "type1", "type2", "abilities").head(6).hstack(pokemon_color)

name,type1,type2,abilities,color
str,str,str,list[str],str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]","""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]","""green"""
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]","""green"""
"""Charmander""","""fire""",,"[""Blaze"", ""Solar Power""]","""red"""
"""Charmeleon""","""fire""",,"[""Blaze"", ""Solar Power""]","""red"""
"""Charizard""","""fire""","""flying""","[""Blaze"", ""Solar Power""]","""red"""


In [52]:
# Vertically stack 2 dataframes
# We have a new dataframe with a new pokemon

new_pokemon = pl.DataFrame(
    {
        "name": ["Polarizard"],
        "type1": ["ice"],
        "type2": ["flying"],
        "abilities": [["snow warning", "blaze"]],
    }
)

pokemon.select("name", "type1", "type2", "abilities").vstack(new_pokemon).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [53]:
# Extend is similar to vstack, but it copies the data instead of referencing it

pokemon.select("name", "type1", "type2", "abilities").extend(new_pokemon).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [54]:
# concat can reproduce both hstack and vstack when rechunk is set to False
# when rechunk is set to True, all data is copied to a contiguous memory space which allows it to be faster

pl.concat(
    [pokemon.select("name", "type1", "type2", "abilities"), new_pokemon],
    rechunk=True,
    how="vertical",
).tail(5)

name,type1,type2,abilities
str,str,str,list[str]
"""Kartana""","""grass""","""steel""","[""Beast Boost""]"
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]"
"""Necrozma""","""psychic""",,"[""Prism Armor""]"
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]"
"""Polarizard""","""ice""","""flying""","[""snow warning"", ""blaze""]"


In [55]:
# concat diagonal stacks dataframes diagonally
# this means that columns missing from one dataframe are filled with nulls

new_pokemon_color = pl.DataFrame(
    {
        "name": ["Polarizard"],
        "abilities": [["Snow warning", "Blaze"]],
        "color": ["white"],
    }
)

pl.concat(
    [pokemon.select("name", "type1", "type2", "abilities"), new_pokemon_color],
    how="diagonal",
).tail(5)

name,type1,type2,abilities,color
str,str,str,list[str],str
"""Kartana""","""grass""","""steel""","[""Beast Boost""]",
"""Guzzlord""","""dark""","""dragon""","[""Beast Boost""]",
"""Necrozma""","""psychic""",,"[""Prism Armor""]",
"""Magearna""","""steel""","""fairy""","[""Soul-Heart""]",
"""Polarizard""",,,"[""Snow warning"", ""Blaze""]","""white"""


In [56]:
# concat has another method, which is align.
# the align method ensures that the columns of the 2 dataframes are in the same order

pokemon_new_order = pl.DataFrame(
    {"name": ["Bulbasaur", "Charmander", "Squirtle"], "color": ["green", "red", "blue"]}
)

pl.concat(
    [
        pokemon.select(
            "name",
            "type1",
        ).head(4),
        pokemon_new_order,
    ],
    how="align",
)

name,type1,color
str,str,str
"""Bulbasaur""","""grass""","""green"""
"""Charmander""","""fire""","""red"""
"""Ivysaur""","""grass""",
"""Squirtle""",,"""blue"""
"""Venusaur""","""grass""",


In [57]:
# join reproduces SQL joins, such as inner, left, outer, semi, anti, cross
# inner join keeps only the rows that are present in both dataframes

pokemon_new = pl.DataFrame(
    {"name": ["Bulbasaur", "Polarizard"], "color": ["green", "white"]}
)

pokemon.head(3).join(pokemon_new, on="name", how="inner")

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""


In [58]:
# left join keeps all rows from the left dataframe and fills the missing values with nulls
# to keep the rows from the right dataframe, we can inverse the order of the dataframes

pokemon.head(3).join(pokemon_new, on="name", how="left", coalesce=True)

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,color
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0,


In [59]:
# full join keeps all rows from both dataframes and fills the missing values with nulls

pokemon.head(3).join(pokemon_new, on="name", how="full")

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,name_right,color
str,str,str,list[str],i64,i64,i64,i64,i64,str,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45.0,49.0,49.0,45.0,0.0,"""Bulbasaur""","""green"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60.0,62.0,63.0,60.0,0.0,,
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80.0,100.0,123.0,80.0,0.0,,
,,,,,,,,,"""Polarizard""","""white"""


In [60]:
# semi keeps the rows from the left dataframe that are present in the right dataframe
# it does not add any columns from the right dataframe (differently from inner)

pokemon.head(3).join(pokemon_new, on="name", how="semi")

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0


In [61]:
# anti keeps the rows present in either dataframe but not in both
# it keeps the opposite rows as inner

pokemon.head(3).join(pokemon_new, on="name", how="anti")

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary
str,str,str,list[str],i64,i64,i64,i64,i64
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0
"""Venusaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",80,100,123,80,0


In [62]:
# cross combines all rows of the first dataframe with all rows of the second dataframe

pokemon_trainers = pl.DataFrame({"trainer": ["trainer1", "trainer2"]})

pokemon.head(2).join(pokemon_trainers, how="cross")

name,type1,type2,abilities,hp,attack,defense,speed,is_legendary,trainer
str,str,str,list[str],i64,i64,i64,i64,i64,str
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""trainer1"""
"""Bulbasaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",45,49,49,45,0,"""trainer2"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""trainer1"""
"""Ivysaur""","""grass""","""poison""","[""Overgrow"", ""Chlorophyll""]",60,62,63,60,0,"""trainer2"""


#### Custom functions: map

In [63]:
# map will be slower than using the native polars functions.
# It's recommended to avoid map whenever possible
# a common use case for map is passing data to a third-party library

In [64]:
# increase the attack by 10% for pokemons with attack < 50
# in the first 2 functions, we receive a column (a list) and process it using a Python list comprehension
# the first function returns a Polars series
# the second function receives a single value and returns a single value

In [65]:
def simulated_attack_series(attack_column):
    return pl.Series(
        [attack * 1.1 if attack < 50 else attack for attack in attack_column]
    )


pokemon.select(
    "name",
    "attack",
    simulated_attack=pl.col("attack").map_batches(simulated_attack_series),
).head(3)

name,attack,simulated_attack
str,i64,f64
"""Bulbasaur""",49,53.9
"""Ivysaur""",62,62.0
"""Venusaur""",100,100.0


In [66]:
def simulated_attack_single_value(attack_number):
    return attack_number * 1.1 if attack_number < 50 else attack_number


pokemon.select(
    "name",
    "attack",
    simulated_attack=pl.col("attack").map_elements(
        simulated_attack_single_value, return_dtype=pl.Int64
    ),
).head(3)

name,attack,simulated_attack
str,i64,i64
"""Bulbasaur""",49,
"""Ivysaur""",62,62.0
"""Venusaur""",100,100.0


In [67]:
# let's check their speed
# we create a bigger dataframe by repeating the original one 100 times
# no need to test the first function, as the result is not what we want

pokemon_100 = pl.concat([pokemon] * 100, rechunk=True)

In [68]:
%%timeit
pokemon_100.select(
    "name",
    "attack",
    simulated_attack=pl.col("attack").map_batches(simulated_attack_series),
)

3.94 ms ± 88.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [69]:
%%timeit
pokemon_100.select(
    "name",
    "attack",
    simulated_attack=pl.col("attack").map_elements(simulated_attack_single_value, return_dtype=pl.Int64),
)

5.16 ms ± 64.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [70]:
%%timeit
pokemon_100.select(
    "name",
    "attack",
    simulated_attack=pl.when(pl.col("attack") < 50)
    .then(pl.col("attack") * 1.1)
    .otherwise(pl.col("attack")),
)

313 µs ± 5.38 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [71]:
# map_batches and map_elements are considerably slower than the native Polars functions
# in this case by a factor of 10
# map_batches is faster than apply because we operate on the full column instead of one row at a time

In [72]:
# if we want to apply a function to multiple columns, we can use pl.struct
# to create a struct, then use apply with it

pokemon.head(3).select(
    "name",
    "attack",
    "defense",
    attack_plus_defense=pl.struct("attack", "defense").map_elements(
        lambda columns: columns["attack"] + columns["defense"], return_dtype=pl.Int64
    ),
)

name,attack,defense,attack_plus_defense
str,i64,i64,i64
"""Bulbasaur""",49,49,98
"""Ivysaur""",62,63,125
"""Venusaur""",100,123,223
