# Jupyter Notebook: [8 Practical Polars Functions Every Data Professional Should Have in Their Toolkit](https://medium.com/@npotapov)

## Setup and Imports

In [None]:
from datetime import datetime, timedelta
from random import choice, gauss, randrange, seed

import polars as pl
import numpy as np
# Seed the stdlib `random` generator so choice/gauss/randrange produce the
# same sequence on every run.
seed(42)

## Data Preparation and DataFrame Creation

In [None]:
# Anchor timestamp: every synthetic action is placed at or before this moment.
base_time: datetime = datetime(2024, 8, 9, 0, 0, 0, 0)
# Number of synthetic rows; also used as the exclusive upper bound (in
# minutes) of how far before base_time an action may fall.
num_records: int = 1_000_000

# One dict per synthetic user action. The random calls run in a fixed order
# per record, so a seeded generator always yields the same data.
user_actions_data: list[dict] = [
    {
        "online_store": choice(["Shop1", "Shop2", "Shop3"]),
        "product_id": choice(["0001", "0002", "0003"]),
        "quantity": choice([1.0, 2.0, 3.0]),
        # Standard-normal draw. mu/sigma are passed explicitly because
        # gauss() only gained default arguments in Python 3.11.
        "action_type": ("purchase" if gauss(0.0, 1.0) > 0.6 else "view"),
        "action_dt": base_time - timedelta(minutes=randrange(num_records)),
    }
    for _ in range(num_records)  # the loop index itself is not used
]
user_actions_df: pl.DataFrame = pl.DataFrame(user_actions_data)

## 1. Using value_counts

In [None]:
# Frequency table: number of rows per distinct online_store value.
user_actions_df.get_column("online_store").value_counts()

In [None]:
# Same table, but sorted by count and reported as relative frequencies.
user_actions_df.get_column("online_store").value_counts(
    sort=True,
    normalize=True,
)

## 2. Using rolling

In [None]:
# Sort by timestamp (rolling needs a sorted index column), aggregate over a
# trailing one-hour window for every row, and keep the first rows only.
df = (
    user_actions_df
    .sort("action_dt")
    .rolling(index_column="action_dt", period="1h")
    .agg(
        sum_quantity=pl.col("quantity").sum(),
        agg_quantity=pl.col("quantity"),
        agg_action_dt=pl.col("action_dt"),
    )
    .head()
)

In [None]:
# Temporarily widen how list and string cells are rendered; the previous
# display settings are restored when the with-block exits.
with pl.Config() as display_cfg:
    display_cfg.set_fmt_table_cell_list_len(10)
    display_cfg.set_fmt_str_lengths(50)
    print(df)

## 3. Using with_row_index

In [None]:
# Add a row-counter column named "id" and preview the first five rows.
user_actions_df.with_row_index(name="id").head(n=5)

## 4. Using lazy

In [None]:
# Build (but do not execute) a lazy query: square the quantity, keep only
# purchases, project three columns, and limit the result to five rows.
lf = (
    user_actions_df.lazy()
    .with_columns(pl.col("quantity").pow(2).alias("quantity"))
    .filter(pl.col("action_type").eq("purchase"))
    .select(["online_store", "action_type", "quantity"])
    .head(5)
)

In [None]:
# Plot the query plan of `lf` with optimizations turned off.
# NOTE: rendering the plot requires Graphviz to be installed.
lf.show_graph(optimized=False)

In [None]:
# Plot the query plan of `lf` using show_graph's defaults (optimized plan).
lf.show_graph()

In [None]:
# Print the textual representation of the query plan for `lf`.
print(lf.explain())

In [None]:
# Execute the lazy query and materialize the result as a DataFrame.
lf.collect()

## 5. Using pipe

In [None]:
def step1(frame: pl.LazyFrame) -> pl.LazyFrame:
    """Keep only the rows whose ``online_store`` equals ``"Shop1"``."""
    is_shop1 = pl.col("online_store") == "Shop1"
    return frame.filter(is_shop1)

def step2(frame: pl.LazyFrame) -> pl.LazyFrame:
    """Add a ``sum_quantity`` column holding the sum of ``quantity``."""
    total_quantity = pl.col("quantity").sum().alias("sum_quantity")
    return frame.with_columns(total_quantity)

def step3(frame: pl.LazyFrame) -> pl.LazyFrame:
    """Limit the frame to its first five rows."""
    return frame.head(n=5)

In [None]:
# Chain the three stages with pipe() on a lazy frame, then run the query.
user_actions_df.lazy().pipe(step1).pipe(step2).pipe(step3).collect()

## 6. Using map_elements

In [None]:
# map_elements calls the Python-level function once per value, so take the
# preview rows first instead of mapping the whole column and discarding all
# but five results. Declaring return_dtype spares Polars from inferring the
# output type from the returned values.
# (For real workloads prefer the native expression pl.col("quantity").log().)
user_actions_df.head().select(
    pl.col("quantity").map_elements(np.log, return_dtype=pl.Float64)
)

## 7. Using iter_slices

In [None]:
# Walk the frame in slices of at most 100_000 rows and report each slice's
# type, position, and row count.
slices = user_actions_df.iter_slices(n_rows=100_000)
for position, batch in enumerate(slices):
    kind = type(batch).__name__
    print(f"{kind}[{position}]: {batch.height}")

## 8. Using vstack

In [None]:
# Start from an empty frame and append one group at a time.
buffer: pl.DataFrame = pl.DataFrame()

# Grouping by a *list* of columns makes the group key a tuple regardless of
# the Polars version, so name[0] is the store name rather than the first
# character of a plain string key. maintain_order=True keeps the groups in a
# stable order so the printed output is reproducible between runs.
for name, chunk in user_actions_df.group_by(
    ["online_store"], maintain_order=True
):
    # Stack the group's rows under what has been collected so far.
    buffer.vstack(chunk, in_place=True)
    print(f">>> {name[0]} <<<")
    chunk.glimpse(max_items_per_column=1)

print(">>> Result <<<")
buffer.glimpse(max_items_per_column=3)
# n_chunks - get number of chunks used by the ChunkedArrays of this DataFrame
# (strategy="all" reports the count for every column).
print(f"Before rechunk(): {buffer.n_chunks(strategy='all')}")
# rechunk() returns a frame whose columns are rechunked; compare the counts.
buffer = buffer.rechunk()
print(f"After rechunk(): {buffer.n_chunks(strategy='all')}")