# Jupyter Notebook: [Improving Code Quality During Data Transformation with Polars.](https://medium.com/data-science/improving-code-quality-during-data-transformation-with-polars-92997e67c8a9)

In [None]:
# 📘 Notebook: Improving Code Quality with Polars Transformations

## 1. Imports & Setup
from dataclasses import dataclass
from datetime import datetime, timedelta
from random import choice, gauss, randrange, seed
from typing import List
import polars as pl

seed(42)

## 2. Simulate Input Data
base_time = datetime(2024, 8, 9, 0, 0, 0)

def generate_actions(n: int) -> pl.DataFrame:
    """Build ``n`` random user-action rows as a DataFrame.

    Each row has a ``user_id`` in ``range(10)``, one of three ``product_id``
    strings, an ``action_type`` ("purchase" when a standard-normal draw
    exceeds 0.6, otherwise "view"), and an ``action_dt`` up to 100,000 minutes
    (exclusive) before the module-level ``base_time``.
    """
    product_ids = ["0001", "0002", "0003"]
    rows = []
    for _ in range(n):
        # Keep the draws in this exact order so a fixed seed reproduces the
        # same dataset.
        user = randrange(10)
        product = choice(product_ids)
        kind = "purchase" if gauss(0, 1) > 0.6 else "view"
        minutes_back = randrange(100_000)
        rows.append(
            {
                "user_id": user,
                "product_id": product,
                "action_type": kind,
                "action_dt": base_time - timedelta(minutes=minutes_back),
            }
        )
    return pl.DataFrame(rows)

def product_catalog() -> pl.DataFrame:
    """Return the fixed three-product catalog with ``product_id`` and ``price`` columns."""
    product_ids = ["0001", "0002", "0003"]
    prices = [10, 30, 70]
    return pl.DataFrame({"product_id": product_ids, "price": prices})

# Shared inputs for the cells below: 100,000 simulated actions and the
# static price catalog.
user_actions_df = generate_actions(n=100_000)
product_catalog_df = product_catalog()

## 3. Transformation Pipeline
def filter_previous_day(df: pl.DataFrame, reference: datetime) -> pl.DataFrame:
    """Keep only rows whose ``action_dt`` date is the day before ``reference``.

    Args:
        df: Frame with an ``action_dt`` datetime column.
        reference: Moment that defines "today"; rows dated one calendar day
            earlier are retained.

    Returns:
        The filtered frame.
    """
    one_day = timedelta(days=1)
    target_date = (reference - one_day).date()
    happened_on_target = pl.col("action_dt").dt.date() == target_date
    return df.filter(happened_on_target)

def join_price(df: pl.DataFrame, catalog: pl.DataFrame) -> pl.DataFrame:
    """Left-join ``catalog`` onto ``df`` by ``product_id``.

    A left join keeps every row of ``df`` even when the catalog has no
    matching product.
    """
    with_catalog = df.join(other=catalog, on="product_id", how="left")
    return with_catalog

def compute_user_metrics(df: pl.DataFrame) -> pl.DataFrame:
    """Aggregate per-user purchase metrics.

    Args:
        df: Actions with ``user_id``, ``product_id``, ``action_type`` and
            ``price`` columns.

    Returns:
        One row per ``user_id``, sorted by ``user_id``, with two columns:

        * ``total_purchase_amount`` - sum of ``price`` over "purchase" rows.
        * ``purchase_to_view_ratio`` - number of "purchase" rows divided by
          the number of "view" rows.
    """
    is_purchase = pl.col("action_type") == "purchase"
    is_view = pl.col("action_type") == "view"

    # ``groupby`` was renamed to ``group_by`` and later removed from polars.
    # Prefer the current name and fall back for older releases.
    group_by = getattr(df, "group_by", None) or df.groupby

    purchase_count = pl.col("product_id").filter(is_purchase).count()
    view_count = pl.col("product_id").filter(is_view).count()

    return (
        group_by("user_id")
        .agg([
            pl.col("price")
              .filter(is_purchase)
              .sum()
              .alias("total_purchase_amount"),
            # NOTE(review): a user with no "view" rows divides by zero here;
            # presumably this yields inf/NaN rather than raising - confirm.
            (purchase_count / view_count).alias("purchase_to_view_ratio"),
        ])
        .sort("user_id")
    )

def transformation_pipeline(actions: pl.DataFrame, catalog: pl.DataFrame, reference: datetime) -> pl.DataFrame:
    """Run the three transformation steps in order.

    Filters ``actions`` to the day before ``reference``, attaches prices from
    ``catalog``, then aggregates the per-user metrics.
    """
    previous_day_actions = filter_previous_day(actions, reference)
    priced_actions = join_price(previous_day_actions, catalog)
    return compute_user_metrics(priced_actions)

# Eager run of the pipeline on the simulated data, using ``base_time`` as "today".
result = transformation_pipeline(
    actions=user_actions_df,
    catalog=product_catalog_df,
    reference=base_time,
)
print(result)

## 4. Lazy Version
# Lazy equivalent of the eager pipeline: the query is only executed by
# ``.collect()``.
previous_day = (base_time - timedelta(days=1)).date()
is_purchase = pl.col("action_type") == "purchase"
is_view = pl.col("action_type") == "view"

# ``DataFrame.lazy()`` converts directly; no pandas round-trip is needed.
# ``LazyFrame.join`` only accepts another LazyFrame, so the catalog is made
# lazy as well.
lazy_priced_actions = (
    user_actions_df.lazy()
    .filter(pl.col("action_dt").dt.date() == previous_day)
    .join(product_catalog_df.lazy(), on="product_id", how="left")
)

# ``groupby`` was renamed to ``group_by`` and later removed from polars.
# Prefer the current name and fall back for older releases.
lazy_group_by = (
    getattr(lazy_priced_actions, "group_by", None) or lazy_priced_actions.groupby
)

result_lazy = (
    lazy_group_by("user_id")
    .agg([
        # ``Expr.sum()`` replaces ``pl.sum(<expr>)``, which current polars
        # no longer accepts (``pl.sum`` takes column names).
        pl.when(is_purchase)
          .then(pl.col("price"))
          .otherwise(0)
          .sum()
          .alias("total_purchase_amount"),
        (
            pl.when(is_purchase).then(1).otherwise(0).sum()
            / pl.when(is_view).then(1).otherwise(0).sum()
        ).alias("purchase_to_view_ratio"),
    ])
    .sort("user_id")
    .collect()
)
print(result_lazy)

## 5. Testing (for TDD)
def test_filter_previous_day():
    """Only the row dated one day before ``base_time`` should survive the filter."""
    yesterday_dt = base_time - timedelta(days=1)
    sample = pl.DataFrame({
        "action_dt": [yesterday_dt, base_time],
        "user_id": [1, 2],
        "product_id": ["0001", "0002"],
        "action_type": ["view", "purchase"],
    })

    kept = filter_previous_day(sample, base_time)

    assert kept.height == 1
    assert kept.get_column("user_id")[0] == 1

def test_compute_user_metrics():
    """User 1 has one purchase (price 10) and one view, so the ratio is 1."""
    sample = pl.DataFrame({
        "user_id": [1, 1, 2],
        "product_id": ["0001", "0002", "0001"],
        "action_type": ["purchase", "view", "purchase"],
        "price": [10, 30, 10],
    })

    metrics = compute_user_metrics(sample)
    user_one = metrics.filter(pl.col("user_id") == 1)

    assert user_one["total_purchase_amount"][0] == 10
    assert user_one["purchase_to_view_ratio"][0] == 1 / 1

## 6. Summary
# Show the label and the first rows of the eager result on separate lines.
print("Result snapshot:", result.head(), sep="\n")
