# Jupyter Notebook: [Understanding Polars UDF Capabilities](https://medium.com/@npotapov)

## Setup and Imports

In [None]:
from datetime import datetime, timedelta
from random import choice, gauss, randrange, seed
from functools import lru_cache
import pyarrow.compute as pc

import polars as pl
import numpy as np
# Seed the stdlib `random` generator so the randomly generated data is reproducible.
seed(42)

## Data Preparation and DataFrame Creation

In [None]:
# Synthetic event log: `num_records` user actions, each stamped a whole number
# of minutes (0 .. num_records - 1) before `base_time`.
base_time: datetime = datetime(2024, 8, 9, 0, 0, 0, 0)
num_records: int = 1_000_000

user_actions_data: list[dict] = [
    {
        "online_store": choice(["Shop1", "Shop2", "Shop3"]),
        "product_id": choice(["0001", "0002", "0003"]),
        "quantity": choice([1.0, 2.0, 3.0]),
        # mu/sigma are spelled out because the zero-argument form of gauss()
        # only exists on Python 3.11+; the drawn values are the same.
        "action_type": ("purchase" if gauss(0.0, 1.0) > 0.6 else "view"),
        "action_dt": base_time - timedelta(minutes=randrange(num_records)),
    }
    for _ in range(num_records)
]
user_actions_df: pl.DataFrame = pl.DataFrame(user_actions_data)

## Using map_elements

In [None]:
%%time

# Square `quantity` through a per-element Python callback.
# `return_dtype` is given explicitly so polars does not have to infer the
# output dtype from the first returned value (and does not warn about it).
user_actions_df.with_columns(
    pl.col("quantity")
    .map_elements(lambda x: x ** 2, return_dtype=pl.Float64)
    .alias("quantity_2")
).head()

In [None]:
%%time

# Same squaring as a native polars expression (no Python callback involved).
user_actions_df.with_columns(
    (pl.col("quantity") ** 2).alias("quantity_2")
).head()

In [None]:
%%time

# Per-`action_type` sum of `quantity`, computed by handing each group's
# values to a Python callback.
user_actions_df.group_by("action_type").agg(
    pl.col("quantity")
    .implode()  # gather the group's values into a single list
    .map_elements(lambda values: values.sum())
)

In [None]:
%%time

# Per-`action_type` sum of `quantity` using the built-in aggregation.
user_actions_df.group_by("action_type").agg(pl.col("quantity").sum())

In [None]:
def udf(action_type: str) -> str:
    """Return ``action_type`` converted to upper case."""
    upper_cased = action_type.upper()
    return upper_cased

# `udf` is passed directly (a `lambda x: udf(x)` wrapper adds nothing), and
# the string output dtype is declared instead of being inferred.
user_actions_df.select(
    pl.col("action_type").map_elements(udf, return_dtype=pl.Utf8)
).head()

In [None]:
@lru_cache(maxsize=3)  # the functools default would be maxsize=128
def udf2(action_type: str) -> str:
    """Upper-case ``action_type``; results are memoised per distinct input."""
    result = action_type.upper()
    return result

# Apply the cached UDF per element; the string output dtype is declared
# explicitly rather than left to inference.
user_actions_df.select(
    pl.col("action_type").map_elements(udf2, return_dtype=pl.Utf8)
).head()

In [None]:
# Show the lru_cache statistics (hits, misses, maxsize, currsize) for udf2.
udf2.cache_info()

In [None]:
# Call NumPy's log once per element; the float output dtype is declared
# explicitly instead of being inferred from the first result.
user_actions_df.select(
    pl.col("quantity").map_elements(np.log, return_dtype=pl.Float64)
).head()

In [None]:
def udf3(action_type: str, quantity: float) -> float:
    """Scale ``quantity`` according to the action type.

    Args:
        action_type: Kind of user action, e.g. ``"view"``.
        quantity: Quantity to scale (the notebook's data holds floats;
            ints work as well).

    Returns:
        ``quantity / 2`` when ``action_type`` is ``"view"``, otherwise
        ``quantity * 2``.
    """
    if action_type == "view":
        return quantity / 2
    return quantity * 2

In [None]:
# Bundle both input columns into a struct so the per-element callback
# receives them together and can forward them to udf3.
user_actions_df.select(
    pl.struct(["action_type", "quantity"]).map_elements(
        lambda row: udf3(row["action_type"], row["quantity"]),
        return_dtype=pl.Float64,
    )
).head()

## Using map_batches

In [None]:
# The batch callback gets the `quantity` column as a whole, converts it to
# NumPy and returns one value (its maximum), hence `returns_scalar=True`.
user_actions_df.with_columns(
    pl.col("quantity").map_batches(
        lambda column: column.to_numpy().max(),
        returns_scalar=True,
    )
).head()

In [None]:
def udf4(input):
    """Print the runtime type of ``input`` and return ``input.slice(1, 2)``.

    The argument only needs a ``slice`` method accepting two positional
    arguments; in this notebook it is whatever ``map_batches`` hands over.
    """
    received_type = type(input)
    print(received_type)
    first, count = 1, 2
    return input.slice(first, count)

In [None]:
# Run udf4 over the lazy frame and materialise the result.
# NOTE(review): `streamable=True` presumably promises that udf4 gives the same
# result per batch as on the full frame; a positional slice would not under a
# streaming engine -- confirm against the polars docs.
user_actions_df.lazy().map_batches(udf4, streamable=True).collect()

In [None]:
def udf5(series: "pl.Series") -> "pl.Series":
    """Center ``series`` by subtracting its mean from every value.

    Args:
        series: Numeric column handed over by ``map_batches``.

    Returns:
        A series of the same length holding ``value - mean`` for each value.
    """
    mean = series.mean()
    # One vectorized subtraction instead of a Python-level loop over every
    # element; the per-element arithmetic is unchanged.
    return series - mean

# Center `quantity` via the batch UDF, declaring a Float64 result.
user_actions_df.select(
    pl.col("quantity").map_batches(udf5, return_dtype=pl.Float64)
).head()

In [None]:
# Same centering written as a native polars expression.
user_actions_df.select(
    (pl.col("quantity") - pl.col("quantity").mean()).alias("quantity")
).head()

In [None]:
def _mask_letters(text):
    """Replace every Unicode letter in ``text`` with ``*`` using pyarrow compute."""
    masked = pc.replace_substring_regex(
        text.to_arrow(), pattern=r"[\p{L}]", replacement="*"
    )
    return pl.from_arrow(masked)


# Round-trip the column through Arrow so pyarrow's regex kernel can be used.
user_actions_df.with_columns(
    pl.col("online_store").map_batches(_mask_letters, return_dtype=pl.Utf8)
).head()

## Using map_columns

In [None]:
# Show the column names and dtypes of the generated frame.
user_actions_df.schema

In [None]:
# shrink_dtype - Shrink numeric columns to the minimal required datatype.
# Only `quantity` is passed through the callback; `.schema` is read from the
# returned frame to show the resulting dtypes.
user_actions_df.map_columns('quantity', lambda x: x.shrink_dtype()).schema

## Using map_rows

In [None]:
# Each row reaches the callback as a tuple addressed by position: the first
# two fields are joined with `+`, the third is passed through unchanged.
# (Presumably online_store, product_id and quantity, following the key order
# of the source dicts -- verify against the frame's schema.)
user_actions_df.map_rows(
    lambda values: (values[0] + values[1], values[2])
).head()

## Using map_groups

In [None]:
def udf6(group_df: pl.DataFrame) -> pl.DataFrame:
    """Preview the group, then reduce it to its column-wise maxima.

    Calls ``glimpse`` limited to one item per column (for inspection) and
    returns the result of ``group_df.max()``.
    """
    group_df.glimpse(max_items_per_column=1)
    column_maxima = group_df.max()
    return column_maxima

In [None]:
# Apply udf6 to each store's sub-frame, then order the combined result by store.
(
    user_actions_df
    .sort("online_store")
    .group_by("online_store")
    .map_groups(udf6)
    .sort("online_store")
)

In [None]:
# Column-wise maxima per store using the built-in aggregation.
# group_by does not guarantee the order of its output groups unless
# maintain_order=True, so the result is sorted after aggregating to give a
# stable row order.
(
    user_actions_df
    .sort("online_store")
    .group_by("online_store")
    .agg(pl.all().max())
    .sort("online_store")
)

In [None]:
def udf7(group_df: pl.DataFrame) -> pl.DataFrame:
    """Append ``quantity_norm``: ``quantity`` min-max scaled within the group.

    Args:
        group_df: One group's rows; must contain a ``quantity`` column.

    Returns:
        ``group_df`` with an extra ``quantity_norm`` column computed as
        ``(quantity - min) / (max - min)``. When the group has no spread
        (all quantities equal) or no usable min/max, the column is
        ``quantity * 0.0`` instead, i.e. 0.0 for present values.
    """
    q_min = group_df["quantity"].min()
    q_max = group_df["quantity"].max()
    # Guard the subtraction: min/max may come back as None (presumably when
    # the group holds no non-null quantity).
    q_range = None if q_min is None or q_max is None else q_max - q_min
    if q_range:
        normalized = (pl.col("quantity") - q_min) / q_range
    else:
        # A zero range would make the formula 0 / 0 (NaN); emit zeros instead.
        normalized = pl.col("quantity") * 0.0
    return group_df.with_columns(normalized.alias("quantity_norm"))

# Normalise `quantity` separately inside every store's group.
(
    user_actions_df
    .group_by("online_store")
    .map_groups(udf7)
    .head()
)