In [3]:
import polars as pl

In [4]:
# use fully print tables 
pl.Config.set_tbl_cols(-1)
pl.Config.set_tbl_rows(-1)

polars.cfg.Config

In [5]:
# 021
df_receipt = pl.read_csv("ref_files/data/receipt.csv")
print(
    df_receipt.select(pl.count())
)

shape: (1, 1)
┌────────┐
│ count  │
│ ---    │
│ u32    │
╞════════╡
│ 104681 │
└────────┘


In [44]:
# 022
print(
    df_receipt.select(pl.col("customer_id").n_unique())
)

shape: (1, 1)
┌─────────────┐
│ customer_id │
│ ---         │
│ u32         │
╞═════════════╡
│ 8307        │
└─────────────┘


In [11]:
# 023
print(
    df_receipt.groupby("store_cd").agg(
        [
            pl.col("amount").sum(),
            pl.col("quantity").sum()
        ]).head(10)
)

shape: (10, 3)
┌──────────┬────────┬──────────┐
│ store_cd ┆ amount ┆ quantity │
│ ---      ┆ ---    ┆ ---      │
│ str      ┆ i64    ┆ i64      │
╞══════════╪════════╪══════════╡
│ S14023   ┆ 727630 ┆ 2258     │
│ S14049   ┆ 230808 ┆ 788      │
│ S13019   ┆ 827833 ┆ 2541     │
│ S13031   ┆ 705968 ┆ 2336     │
│ S13035   ┆ 715869 ┆ 2219     │
│ S14026   ┆ 824537 ┆ 2503     │
│ S14010   ┆ 790361 ┆ 2290     │
│ S14006   ┆ 712839 ┆ 2284     │
│ S13018   ┆ 790535 ┆ 2562     │
│ S12029   ┆ 794741 ┆ 2555     │
└──────────┴────────┴──────────┘


In [22]:
# 024
print(
    df_receipt.groupby("customer_id").agg(
        [
            pl.col("sales_ymd").max(),
        ]).sort("customer_id").head(10)
)

shape: (10, 2)
┌────────────────┬───────────┐
│ customer_id    ┆ sales_ymd │
│ ---            ┆ ---       │
│ str            ┆ i64       │
╞════════════════╪═══════════╡
│ CS001113000004 ┆ 20190308  │
│ CS001114000005 ┆ 20190731  │
│ CS001115000010 ┆ 20190405  │
│ CS001205000004 ┆ 20190625  │
│ CS001205000006 ┆ 20190224  │
│ CS001211000025 ┆ 20190322  │
│ CS001212000027 ┆ 20170127  │
│ CS001212000031 ┆ 20180906  │
│ CS001212000046 ┆ 20170811  │
│ CS001212000070 ┆ 20191018  │
└────────────────┴───────────┘


In [21]:
# 025
print(
    df_receipt.groupby("customer_id").agg(
        [
            pl.col("sales_ymd").min(),
        ]).sort("customer_id").head(10)
)

shape: (10, 2)
┌────────────────┬───────────┐
│ customer_id    ┆ sales_ymd │
│ ---            ┆ ---       │
│ str            ┆ i64       │
╞════════════════╪═══════════╡
│ CS001113000004 ┆ 20190308  │
│ CS001114000005 ┆ 20180503  │
│ CS001115000010 ┆ 20171228  │
│ CS001205000004 ┆ 20170914  │
│ CS001205000006 ┆ 20180207  │
│ CS001211000025 ┆ 20190322  │
│ CS001212000027 ┆ 20170127  │
│ CS001212000031 ┆ 20180906  │
│ CS001212000046 ┆ 20170811  │
│ CS001212000070 ┆ 20191018  │
└────────────────┴───────────┘


In [23]:
# 026
print(
    df_receipt.groupby("customer_id").agg(
        [
            pl.col("sales_ymd").max().alias("latest_sales_ymd"),
            pl.col("sales_ymd").min().alias("oldest_sales_ymd"),
        ]).filter(
            pl.col("latest_sales_ymd") != pl.col("oldest_sales_ymd")
        ).sort("customer_id").head(10)
)

shape: (10, 3)
┌────────────────┬──────────────────┬──────────────────┐
│ customer_id    ┆ latest_sales_ymd ┆ oldest_sales_ymd │
│ ---            ┆ ---              ┆ ---              │
│ str            ┆ i64              ┆ i64              │
╞════════════════╪══════════════════╪══════════════════╡
│ CS001114000005 ┆ 20190731         ┆ 20180503         │
│ CS001115000010 ┆ 20190405         ┆ 20171228         │
│ CS001205000004 ┆ 20190625         ┆ 20170914         │
│ CS001205000006 ┆ 20190224         ┆ 20180207         │
│ CS001214000009 ┆ 20190902         ┆ 20170306         │
│ CS001214000017 ┆ 20191006         ┆ 20180828         │
│ CS001214000048 ┆ 20190929         ┆ 20171109         │
│ CS001214000052 ┆ 20190617         ┆ 20180208         │
│ CS001215000005 ┆ 20181021         ┆ 20170206         │
│ CS001215000040 ┆ 20171022         ┆ 20170214         │
└────────────────┴──────────────────┴──────────────────┘


In [25]:
# 027
print(
    df_receipt.groupby("store_cd").agg(
        [
            pl.col("amount").mean(),
        ]).sort("amount", reverse=True).head(10)
)

shape: (10, 2)
┌──────────┬────────────┐
│ store_cd ┆ amount     │
│ ---      ┆ ---        │
│ str      ┆ f64        │
╞══════════╪════════════╡
│ S13052   ┆ 402.86747  │
│ S13015   ┆ 351.11196  │
│ S13003   ┆ 350.915519 │
│ S14010   ┆ 348.791262 │
│ S13001   ┆ 348.470386 │
│ S13020   ┆ 337.879932 │
│ S14011   ┆ 335.718333 │
│ S14026   ┆ 332.340588 │
│ S13004   ┆ 330.943949 │
│ S13019   ┆ 330.208616 │
└──────────┴────────────┘


In [26]:
# 028
print(
    df_receipt.groupby("store_cd").agg(
        [
            pl.col("amount").median(),
        ]).sort("amount", reverse=True).head(10)
)

shape: (10, 2)
┌──────────┬────────┐
│ store_cd ┆ amount │
│ ---      ┆ ---    │
│ str      ┆ f64    │
╞══════════╪════════╡
│ S13052   ┆ 190.0  │
│ S14010   ┆ 188.0  │
│ S14050   ┆ 185.0  │
│ S13003   ┆ 180.0  │
│ S13018   ┆ 180.0  │
│ S14040   ┆ 180.0  │
│ S14033   ┆ 179.0  │
│ S13008   ┆ 179.0  │
│ S13035   ┆ 178.0  │
│ S14025   ┆ 178.0  │
└──────────┴────────┘


In [41]:
# 029
print(
    df_receipt.groupby("store_cd").agg(
        [
            pl.col("product_cd").mode().first(),
        ]).sort("store_cd").head(10)
)

shape: (10, 2)
┌──────────┬────────────┐
│ store_cd ┆ product_cd │
│ ---      ┆ ---        │
│ str      ┆ str        │
╞══════════╪════════════╡
│ S12007   ┆ P060303001 │
│ S12013   ┆ P060303001 │
│ S12014   ┆ P060303001 │
│ S12029   ┆ P060303001 │
│ S12030   ┆ P060303001 │
│ S13001   ┆ P060303001 │
│ S13002   ┆ P060303001 │
│ S13003   ┆ P071401001 │
│ S13004   ┆ P060303001 │
│ S13005   ┆ P040503001 │
└──────────┴────────────┘


In [43]:
# 030
print(
    df_receipt.groupby("store_cd").agg(
        [
            # 自由度: 0
            pl.col("amount").var(ddof=0),
        ]).sort("amount", reverse=True).head(5)
)

shape: (5, 2)
┌──────────┬───────────────┐
│ store_cd ┆ amount        │
│ ---      ┆ ---           │
│ str      ┆ f64           │
╞══════════╪═══════════════╡
│ S13052   ┆ 440088.701311 │
│ S14011   ┆ 306314.558164 │
│ S14034   ┆ 296920.081011 │
│ S13001   ┆ 295431.993329 │
│ S13015   ┆ 295294.361116 │
└──────────┴───────────────┘
