# Jupyter Notebook: [Practical Introduction to Polars.](https://medium.com/data-science/practical-introduction-to-polars-8d9cdca350f1)

## Setup and Imports.

In [None]:
from dataclasses import dataclass
from datetime import datetime, timedelta
from random import choice, gauss, randrange, seed
from typing import Any, Dict

import polars as pl
import pandas as pd

# Fix the seed of Python's `random` module so the generated sample data is
# the same on every run.
seed(42)

## 1. Data Preparation and DataFrame Creation.

In [None]:
# Reference timestamp: midnight on 31 Aug 2024 (naive, no timezone).
base_time = datetime(2024, 8, 31)

# One million synthetic user actions. Every record draws a random shop,
# product and quantity, and is timestamped less than 1,000,000 minutes
# before `base_time`.
user_actions_data = [
    {
        "OnlineStore": choice(["Shop1", "Shop2", "Shop3"]),
        "product": choice(["0001", "0002", "0003"]),
        "quantity": choice([1.0, 2.0, 3.0]),
        # Standard-normal draw. mu and sigma are passed explicitly because
        # gauss() only gained default arguments in Python 3.11; the values
        # produced are the same as with the defaults.
        "Action type": ("purchase" if gauss(0.0, 1.0) > 0.6 else "view"),
        "Action_time": base_time - timedelta(minutes=randrange(1_000_000)),
    }
    for _ in range(1_000_000)
]

# One thousand deliberately incomplete records: "product" and "quantity" are
# each None about half of the time, which gives the null-handling steps
# something to clean up.
corrupted_data = [
    {
        "OnlineStore": choice(["Shop1", "Shop2", "Shop3"]),
        "product": choice(["0001", None]),
        "quantity": choice([1.0, None]),
        # Standard-normal draw. mu and sigma are passed explicitly because
        # gauss() only gained default arguments in Python 3.11; the values
        # produced are the same as with the defaults.
        "Action type": ("purchase" if gauss(0.0, 1.0) > 0.6 else "view"),
        "Action_time": base_time - timedelta(minutes=randrange(1_000)),
    }
    for _ in range(1_000)
]

# Column-oriented price list: one product id per price, matched by position.
product_catalog_data = {
    "product_id": ["0001", "0002", "0003"],
    "price": [100, 25, 80],
}

In [None]:
# Pandas
# Build one DataFrame per raw dataset (lists of dicts and a dict of lists).
user_actions_pd_df = pd.DataFrame(data=user_actions_data)
corrupted_pd_df = pd.DataFrame(data=corrupted_data)
product_catalog_pd_df = pd.DataFrame(data=product_catalog_data)

# Polars
# The same three datasets, loaded through the Polars constructor.
user_actions_pl_df = pl.DataFrame(data=user_actions_data)
corrupted_pl_df = pl.DataFrame(data=corrupted_data)
product_catalog_pl_df = pl.DataFrame(data=product_catalog_data)

In [None]:
# Pandas
# ignore_index=True rebuilds a clean 0..n-1 index. Without it the appended
# rows keep their own labels (0..999), so the combined frame would contain
# duplicate index labels.
user_actions_pd_df = pd.concat(
    [user_actions_pd_df, corrupted_pd_df],
    ignore_index=True,
)

# Polars
# Polars frames carry no index, so a plain vertical concat is enough.
user_actions_pl_df = pl.concat([user_actions_pl_df, corrupted_pl_df])

## 2. Summary Statistics of the DataFrame.

In [None]:
# Pandas
# include="all" summarises every column, not only the numeric ones.
user_actions_pd_df.describe(include="all")

In [None]:
# Polars
# Summary statistics for the columns of the frame.
user_actions_pl_df.describe()

## 3. Retrieving the First Five Records.

In [None]:
# Pandas
# Show the first five rows (5 is also the default for head()).
user_actions_pd_df.head(n=5)

In [None]:
# Polars
# Show the first five rows (5 is also the default for head()).
user_actions_pl_df.head(n=5)

In [None]:
# Polars
# glimpse() prints a per-column preview of the frame; here each column is
# limited to its first five values.
user_actions_pl_df.glimpse(max_items_per_column=5)

## 4. Renaming Columns.

In [None]:
# Pandas
# Old column name -> snake_case replacement.
pd_column_renames = {
    "OnlineStore": "online_store",
    "product": "product_id",
    "Action type": "action_type",
    "Action_time": "action_dt",
}
user_actions_pd_df = user_actions_pd_df.rename(columns=pd_column_renames)
user_actions_pd_df.columns

In [None]:
# Polars
# Old column name -> snake_case replacement.
pl_column_renames = {
    "OnlineStore": "online_store",
    "product": "product_id",
    "Action type": "action_type",
    "Action_time": "action_dt",
}
user_actions_pl_df = user_actions_pl_df.rename(pl_column_renames)
user_actions_pl_df.columns

## 5. Changing Column Types.

In [None]:
# Pandas
# Nullable integer dtype ("Int64"): holds whole numbers and missing values.
user_actions_pd_df = user_actions_pd_df.astype({"quantity": pd.Int64Dtype()})

In [None]:
# Pandas: print column dtypes and non-null counts to check the cast.
user_actions_pd_df.info()

In [None]:
# Polars
# Convert "quantity" to a 32-bit integer column; all other columns are kept.
user_actions_pl_df = user_actions_pl_df.with_columns(
    pl.col("quantity").cast(pl.Int32)
)

In [None]:
# Polars: with zero preview items per column, glimpse() shows only the
# column names and dtypes.
user_actions_pl_df.glimpse(max_items_per_column=0)

In [None]:
# Polars: approximate in-memory size of the frame, reported in megabytes.
user_actions_pl_df.estimated_size(unit="mb")

## 6. Filling Missing Values.

In [None]:
# Pandas
# Assign the filled column back instead of calling fillna(inplace=True) on
# the selected column: the in-place call on `df["quantity"]` is chained
# assignment, which warns on pandas 2.x and leaves the DataFrame unchanged
# once Copy-on-Write is enabled (the default from pandas 3).
user_actions_pd_df["quantity"] = user_actions_pd_df["quantity"].fillna(0)

In [None]:
# Pandas: print non-null counts to check that "quantity" has no gaps left.
user_actions_pd_df.info()

In [None]:
# Polars
# Replace missing quantities with 0; the column keeps its name.
user_actions_pl_df = user_actions_pl_df.with_columns(
    pl.col("quantity").fill_null(value=0)
)

In [None]:
# Polars: keep only the "count" and "null_count" rows of the summary table.
(
    user_actions_pl_df
    .describe()
    .filter(pl.col("statistic").is_in(["count", "null_count"]))
)

## 7. Removing Missing Values.

In [None]:
# Pandas
# Discard every row that has no product id.
user_actions_pd_df = user_actions_pd_df.dropna(subset=["product_id"])

In [None]:
# Pandas: print the row count and non-null counts after dropping rows.
user_actions_pd_df.info()

In [None]:
# Polars
# Discard every row that has no product id.
user_actions_pl_df = user_actions_pl_df.drop_nulls(subset="product_id")

In [None]:
# Polars: keep only the "count" and "null_count" rows of the summary table.
(
    user_actions_pl_df
    .describe()
    .filter(pl.col("statistic").is_in(["count", "null_count"]))
)

## 8. Removing Duplicate Records.

In [None]:
# Pandas
# Rows count as duplicates when store, action type and timestamp all match;
# the last occurrence of each group is the one that is kept.
pd_dedup_keys = ["online_store", "action_type", "action_dt"]
user_actions_pd_df = user_actions_pd_df.drop_duplicates(
    subset=pd_dedup_keys,
    keep="last",
)

In [None]:
# Pandas: print the row count after removing duplicates.
user_actions_pd_df.info()

In [None]:
# Polars
# Rows count as duplicates when store, action type and timestamp all match;
# the last occurrence of each group is the one that is kept.
pl_dedup_keys = ["online_store", "action_type", "action_dt"]
user_actions_pl_df = user_actions_pl_df.unique(subset=pl_dedup_keys, keep="last")

In [None]:
# Polars: keep only the "count" and "null_count" rows of the summary table.
(
    user_actions_pl_df
    .describe()
    .filter(pl.col("statistic").is_in(["count", "null_count"]))
)

## 9. Filtering Data.

In [None]:
# Pandas
# Boolean mask that is True for purchase rows; indexing with it drops the rest.
is_purchase = user_actions_pd_df["action_type"] == "purchase"
user_actions_pd_df = user_actions_pd_df[is_purchase]

In [None]:
# Pandas: print the row count after keeping only purchases.
user_actions_pd_df.info()

In [None]:
# Polars
# Keep only the rows whose action type is "purchase".
user_actions_pl_df = user_actions_pl_df.filter(pl.col("action_type").eq("purchase"))

In [None]:
# Polars: keep only the "count" and "null_count" rows of the summary table.
(
    user_actions_pl_df
    .describe()
    .filter(pl.col("statistic").is_in(["count", "null_count"]))
)

## 10. Selecting Required Columns.

In [None]:
# Pandas
# Keep only these four columns, in this order.
pd_kept_columns = ["online_store", "action_type", "product_id", "quantity"]
user_actions_pd_df = user_actions_pd_df.loc[:, pd_kept_columns]

In [None]:
# Polars
# Keep only these four columns, in this order.
user_actions_pl_df = user_actions_pl_df.select(
    ["online_store", "action_type", "product_id", "quantity"]
)

## 11. Grouping Data.

In [None]:
# Pandas
# Sum the quantity for each (store, product, action type) combination, then
# turn the group keys back into ordinary columns.
pd_group_keys = ["online_store", "product_id", "action_type"]
pd_grouped = user_actions_pd_df.groupby(pd_group_keys)
user_actions_pd_df = pd_grouped.agg({"quantity": "sum"}).reset_index()
# Display the result sorted; the stored frame itself is left unsorted.
user_actions_pd_df.sort_values(by=["online_store", "product_id"])

In [None]:
# Polars
# Sum the quantity for each (store, product, action type) combination.
user_actions_pl_df = user_actions_pl_df.group_by(
    "online_store", "product_id", "action_type"
).agg(pl.sum("quantity"))
# Display the result sorted; the stored frame itself is left unsorted.
user_actions_pl_df.sort(["online_store", "product_id"])

## 12. Joining Data with Another DataFrame.

In [None]:
# Pandas
# Attach each product's price; "inner" is merge()'s default join type.
user_actions_pd_df = user_actions_pd_df.merge(
    product_catalog_pd_df,
    on="product_id",
    how="inner",
)
# Display the result sorted by store and product.
user_actions_pd_df.sort_values(by=["online_store", "product_id"])

In [None]:
# Polars
# Attach each product's price; "inner" is join()'s default join type.
user_actions_pl_df = user_actions_pl_df.join(
    product_catalog_pl_df,
    on="product_id",
    how="inner",
)
# Display the result sorted by store and product.
user_actions_pl_df.sort(["online_store", "product_id"])

## 13. Calculating a New Column.

In [None]:
# Pandas
# total = unit price * quantity, computed row by row.
pd_line_total = user_actions_pd_df["price"] * user_actions_pd_df["quantity"]
# Add the new column, then keep only the columns needed for the pivot.
user_actions_pd_df = user_actions_pd_df.assign(total=pd_line_total)[
    ["online_store", "action_type", "total"]
]

In [None]:
# Polars
# Add total = unit price * quantity, then keep only the columns needed for
# the pivot, all in one chained expression.
user_actions_pl_df = user_actions_pl_df.with_columns(
    total=pl.col("price") * pl.col("quantity")
).select("online_store", "action_type", "total")

In [None]:
# Alternatively, you can calculate a new column 
# directly within the select() method:
# user_actions_pl_df = user_actions_pl_df.select(
#     "online_store",
#     "action_type",
#     (pl.col("price") * pl.col("quantity")).alias("total"),
# )

## 14. Creating a Pivot Table.

In [None]:
# Pandas
# One row per action type, one column per store; each cell is the summed total.
result_pd = user_actions_pd_df.pivot_table(
    values="total",
    index="action_type",
    columns="online_store",
    aggfunc="sum",
)
result_pd

In [None]:
# Polars
# One row per action type, one column per store; each cell is the summed total.
# `on` names the column whose values become the new column headers. It
# replaces the `columns` keyword, which is deprecated since Polars 1.0.
result_pl = user_actions_pl_df.pivot(
    on="online_store",
    index="action_type",
    values="total",
    aggregate_function="sum",
    # Order the generated store columns by name.
    sort_columns=True,
)
result_pl