# Jupyter Notebook: [Improving Code Quality During Data Transformation with Polars.](https://medium.com/data-science/improving-code-quality-during-data-transformation-with-polars-92997e67c8a9)

## Setup and Imports.

In [None]:
from dataclasses import dataclass
from datetime import datetime, timedelta
from random import choice, gauss, randrange, seed
from typing import Any, Dict

import polars as pl

seed(42)

## Data Preparation and DataFrame Creation.

In [None]:

# Reference timestamp: every generated action happens at or before this moment.
base_time = datetime(2024, 8, 9, 0, 0, 0, 0)

# Synthetic event log: 100,000 user actions spread over the 100,000 minutes
# leading up to `base_time`, for 10 users and 3 products.
# NOTE: the four random calls per record must stay in this order, otherwise the
# seeded random stream (and therefore the generated data) changes.
user_actions_data = [
    {
        "user_id": randrange(10),
        "product_id": choice(["0001", "0002", "0003"]),
        # An action is a "purchase" when a standard-normal sample exceeds 0.6.
        # mu/sigma are spelled out because the no-argument form of gauss()
        # is only available on Python 3.11+; the values drawn are the same.
        "action_type": ("purchase" if gauss(0.0, 1.0) > 0.6 else "view"),
        "action_dt": base_time - timedelta(minutes=randrange(100_000)),
    }
    for _ in range(100_000)
]
user_actions_df = pl.DataFrame(user_actions_data)

# Price list keyed by the same product ids used in the actions above.
product_catalog_data = {"product_id": ["0001", "0002", "0003"], "price": [10, 30, 70]}
product_catalog_df = pl.DataFrame(product_catalog_data)

In [None]:
## Main Logic

In [None]:
@dataclass
class DailyUserPurchaseReport:
    """
    Generates a report containing the total purchase amount and the ratio of purchased items
    to viewed items on the report date for each user.

    Attributes:
        sources (Dict[str, pl.LazyFrame]): A dictionary containing the data sources, including:
            - 'user_actions': A LazyFrame containing user actions data.
            - 'product_catalog': A LazyFrame containing product catalog data.
        params (Dict[str, Any]): A dictionary containing parameters, including:
            - 'report_date': The calendar day for which the report should be generated
              (typically the previous day). A ``datetime.date`` is expected; a
              ``datetime.datetime`` is also accepted and is truncated to its date.
    """

    sources: Dict[str, pl.LazyFrame]
    params: Dict[str, Any]

    def _filter_actions_by_date(self, frame: pl.LazyFrame) -> pl.LazyFrame:
        """
        Filters user actions data to include only records from the specified date.

        Args:
            frame (pl.LazyFrame): A LazyFrame containing user actions data.

        Returns:
            pl.LazyFrame: A LazyFrame containing user actions data filtered by the specified date.

        Raises:
            KeyError: If 'report_date' is missing from ``params``.
        """
        report_date = self.params["report_date"]
        # The column is truncated to a date below, so compare date with date.
        # A datetime carrying a time-of-day component would otherwise be compared
        # as a full timestamp and would not match the truncated values.
        # (datetime is a subclass of date, hence the explicit datetime check.)
        if isinstance(report_date, datetime):
            report_date = report_date.date()
        return frame.filter(pl.col("action_dt").dt.date() == report_date)

    def _enrich_user_actions_from_product_catalog(
        self, frame: pl.LazyFrame
    ) -> pl.LazyFrame:
        """
        Joins the user actions data with the product catalog to include product prices.

        Args:
            frame (pl.LazyFrame): A LazyFrame containing user actions data.

        Returns:
            pl.LazyFrame: A LazyFrame containing user actions data enriched with product prices.
        """
        # Inner join (the polars default, spelled out here): actions whose
        # product_id has no catalog entry are dropped from the report.
        return frame.join(self.sources["product_catalog"], on="product_id", how="inner")

    def _calculate_key_metrics(self, frame: pl.LazyFrame) -> pl.LazyFrame:
        """
        Calculates the total purchase amount and the ratio of purchased items to viewed items.

        Args:
            frame (pl.LazyFrame): A LazyFrame containing enriched user actions data.

        Returns:
            pl.LazyFrame: A LazyFrame with one row per user, sorted by 'user_id', containing
            'total_purchase_amount' and 'purchase_to_view_ratio'.
        """
        is_purchase = pl.col("action_type") == "purchase"
        is_view = pl.col("action_type") == "view"

        # Sum of prices over the user's purchase rows only.
        total_purchase_amount = (
            pl.col("price").filter(is_purchase).sum().alias("total_purchase_amount")
        )
        # Number of purchase rows divided by number of view rows.
        # NOTE(review): a user with no "view" rows on the report date divides by
        # zero here; confirm the resulting value is acceptable downstream.
        purchase_to_view_ratio = (
            pl.col("product_id").filter(is_purchase).len()
            / pl.col("product_id").filter(is_view).len()
        ).alias("purchase_to_view_ratio")

        return (
            frame.group_by(pl.col("user_id"))
            .agg([total_purchase_amount, purchase_to_view_ratio])
            .sort("user_id")
        )

    def execute(self) -> pl.DataFrame:
        """
        Executes the report generation process.

        This method performs the following steps:
            1. Filters user actions data to include only records from the report date.
            2. Joins the filtered user actions data with the product catalog.
            3. Calculates the total purchase amount and purchase-to-view ratio for each user.
            4. Returns the final report as a DataFrame.

        Returns:
            pl.DataFrame: A DataFrame containing the total purchase amount and purchase-to-view ratio for each user.
        """
        # The whole pipeline stays lazy until collect() materializes the result.
        result: pl.DataFrame = (
            self.sources["user_actions"]
            .pipe(self._filter_actions_by_date)
            .pipe(self._enrich_user_actions_from_product_catalog)
            .pipe(self._calculate_key_metrics)
            .collect()
        )
        return result

## Example:

In [None]:
# Prepare sources: the report consumes LazyFrames, so wrap the eager DataFrames.
user_actions: pl.LazyFrame = user_actions_df.lazy()
product_catalog: pl.LazyFrame = product_catalog_df.lazy()

# Get report date: the day before the reference timestamp.
yesterday: datetime = base_time - timedelta(days=1)

# Report calculation. The report filters on the date part of "action_dt", so
# pass a plain date rather than a datetime to keep the comparison date-to-date.
df: pl.DataFrame = DailyUserPurchaseReport(
    sources={"user_actions": user_actions, "product_catalog": product_catalog},
    params={"report_date": yesterday.date()},
).execute()
df