## knock 062 数値データを対数変換する(自然対数)

In [1]:
import polars as pl
pl.Config.set_tbl_cols(-1)  # a negative value makes table output show every column instead of truncating
import polars.selectors as cs  # Polars column selectors (preset column-selection helpers)

### データを読み込む

In [2]:
# Read the receipt detail CSV into a DataFrame and preview its first rows.
df_receipt = pl.read_csv(source = "../docker/work/data/receipt.csv")
display(df_receipt.head(n = 5))

sales_ymd,sales_epoch,store_cd,receipt_no,receipt_sub_no,customer_id,product_cd,quantity,amount
i64,i64,str,i64,i64,str,str,i64,i64
20181103,1541203200,"""S14006""",112,1,"""CS006214000001…","""P070305012""",1,158
20181118,1542499200,"""S13008""",1132,2,"""CS008415000097…","""P070701017""",1,81
20170712,1499817600,"""S14028""",1102,1,"""CS028414000014…","""P060101005""",1,170
20190205,1549324800,"""S14042""",1132,1,"""ZZ000000000000…","""P050301001""",1,25
20180821,1534809600,"""S14025""",1102,2,"""CS025415000050…","""P060102007""",1,90


### ノック

In [3]:
(
    df_receipt
    # Keep only the columns used below (reduces memory use on large data).
    .select([
        pl.col("customer_id"),
        pl.col("amount")
    ])
    # Exclude non-members, i.e. customer IDs that start with "Z".
    .filter( ~pl.col("customer_id").str.starts_with("Z") )
    # Total the sales amount per customer.
    # The key is passed positionally: current Polars treats keyword arguments
    # to group_by as named keys, so `by = "customer_id"` would rename the key
    # column to "by" instead of keeping "customer_id".
    .group_by("customer_id")
    .agg( pl.sum("amount").alias("sum_amount") )
    # Natural logarithm of the per-customer total (Expr.log defaults to base e).
    # NOTE(review): a total of 0 would yield -inf here; confirm whether an
    # offset is wanted for such customers.
    .with_columns(
        pl.col("sum_amount")
        .log()
        .alias("log_amount")
    )
    # Show the first 10 rows. group_by is called without maintain_order, so
    # which customers appear here is not guaranteed to be stable across runs.
    .head(n = 10)
)

customer_id,sum_amount,log_amound
str,i64,f64
"""CS017513000194…",360,5.886104
"""CS028513000128…",135,4.905275
"""CS030614000056…",456,6.122493
"""CS018512000107…",619,6.428105
"""CS028615000056…",1108,7.010312
"""CS002515000256…",2296,7.738924
"""CS032315000063…",376,5.929589
"""CS016315000136…",868,6.766192
"""CS026515000010…",3986,8.290544
"""CS048415000017…",4484,8.408271
