In [174]:
# Google Driveと接続を行います。これを行うことで、Driveにあるデータにアクセスできるようになります。
# 下記セルを実行すると、Googleアカウントのログインを求められますのでログインしてください。
#from google.colab import drive
#drive.mount('/content/drive')

In [175]:
# 作業フォルダへの移動を行います。
# 人によって作業場所がことなるので、その場合作業場所を変更してください。
#import os 
#os.chdir('/content/drive/MyDrive/100knock-data_analytics/2章') #ここを変更。

# ２章　小売店のデータでデータ加工を行う１０本ノック

本章では、ある小売店の売上履歴と顧客台帳データを用いて、データ分析の素地となる「データの加工」を習得することが目的です。
実際の現場データは手入力のExcel等、決して綺麗なデータではない事が多いため、
データの揺れや整合性の担保など、汚いデータを取り扱うデータ加工を主体に進めて行きます。

### ノック１１：データを読み込んでみよう

In [176]:
import polars as pl
# 列表示を省略しない
pl.Config.set_tbl_cols(-1)
# 行の表示を省略しない
pl.Config.set_tbl_rows(-1)

polars.config.Config

In [177]:
uriage_data = pl.read_csv("uriage.csv")
uriage_data.head()

purchase_date,item_name,item_price,customer_name
str,str,i64,str
"""2019-06-13 18:…","""商品A""",100.0,"""深井菜々美"""
"""2019-07-13 13:…","""商 品 S""",,"""浅田賢二"""
"""2019-05-11 19:…","""商 品 a""",,"""南部慶二"""
"""2019-02-12 23:…","""商品Z""",2600.0,"""麻生莉緒"""
"""2019-04-22 03:…","""商品a""",,"""平田鉄二"""


In [178]:
kokyaku_data = pl.read_excel("kokyaku_daicho.xlsx")
kokyaku_data.head()

顧客名,かな,地域,メールアドレス,登録日
str,str,str,str,str
"""須賀ひとみ""","""すが ひとみ""","""H市""","""suga_hitomi@ex…","""2018/01/04"""
"""岡田 敏也""","""おかだ としや""","""E市""","""okada_toshiya@…","""42782"""
"""芳賀 希""","""はが のぞみ""","""A市""","""haga_nozomi@ex…","""2018/01/07"""
"""荻野 愛""","""おぎの あい""","""F市""","""ogino_ai@examp…","""42872"""
"""栗田 憲一""","""くりた けんいち""","""E市""","""kurita_kenichi…","""43127"""


### ノック１２：データの揺れを見てみよう

In [179]:
uriage_data["item_name"].head()

item_name
str
"""商品A"""
"""商 品 S"""
"""商 品 a"""
"""商品Z"""
"""商品a"""
"""商品S"""
"""商品 a"""
"""商品V"""
"""商品O"""
"""商品A"""


In [180]:
uriage_data["item_price"].head()

item_price
i64
100.0
""
""
2600.0
""
1900.0
""
2200.0
1500.0
100.0


### ノック１３：データに揺れがあるまま集計しよう

In [181]:
uriage_data = (
    uriage_data
    .replace("purchase_date", uriage_data["purchase_date"].str.strptime(dtype = pl.Datetime, format = "%Y-%m-%d %H:%M:%S"))
    .with_columns([uriage_data["purchase_date"].dt.strftime("%Y%m").alias("purchase_month")])
)

import polars.selectors as cs
res = uriage_data.pivot(values = cs.numeric(), index = "purchase_month", columns = "item_name", aggregate_function = "count")
res

    df = df.with_columns(new_column.alias(column_name))
instead.
  .replace("purchase_date", uriage_data["purchase_date"].str.strptime(dtype = pl.Datetime, format = "%Y-%m-%d %H:%M:%S"))


purchase_month,商品A,商 品 S,商 品 a,商品Z,商品a,商品S,商品 a,商品V,商品O,商 品U,商品L,商 品V,商 品O,商品C,商品I,商品r,商品X,商品 g,商品R,商品P,商品Q,商品y,商品 A,商品N,商品W,商 品E,商品K,商品B,商品F,商 品s,商品W,商 品 n,商 品F,商品D,商品M,商品Y,商品U,商品H,商品T,商品J,商 品O,商品E,商 品Q,商品S,商品M,商 品T,商品G,商 品G,商品P,商品E,商 品N,商 品Y,商品 J,商品 V,商品 K,商 品V,商 品D,商 品A,商品 F,商品 H,商 品K,商 品T,商品 X,商品 Q,商 品X,商 品H,商 品C,商品 B,商品 O,商品 T,商品v,商品p,商品i,商品 w,商 品 s,商 品 q,商品s,商品l,商品t,商品k,商品g,商品o,商品 R,商品 S,商 品M,商品j,商品d,商品 I,商品 E,商品 o,商品c,商品 v,商品e,商品x,商 品I,商品W,商品X,商品 M,商 品P
str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32.1,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32.1,u32,u32,u32.1,u32.1,u32,u32,u32,u32.1,u32.1,u32,u32,u32,u32,u32,u32.1,u32,u32,u32,u32,u32,u32.1,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32.2,u32.1,u32,u32
"""201906""",23,,,,,21,,19,14,1.0,15,,1.0,10,18,,14,1.0,16,17,15,,1.0,16,16,,15,12,18,,,,,19,17,13,17,12,12,22,,13,,,,,13,1.0,,,,,,1.0,,,,,,1.0,,,,,,,1.0,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,1.0,,1.0,,,
"""201907""",20,1.0,,,,22,,26,19,,17,,1.0,17,18,2.0,18,,17,26,14,,,15,16,1.0,12,20,17,,,,,17,11,12,13,18,20,23,1.0,10,,,,,19,,,,,,,1.0,,1.0,,,,,,,,,,1.0,,,,1.0,,,1.0,,,1.0,,,,,,1.0,,,,,,,1.0,,,,,,,,,,
"""201905""",20,,2.0,,,16,1.0,8,18,,12,,,15,16,,16,,22,20,12,1.0,,19,20,,16,14,17,,,,,14,17,19,16,15,15,10,,19,,1.0,,,23,,,,,,1.0,,1.0,1.0,,1.0,1.0,,1.0,,,1.0,,,,,,1.0,,,,1.0,1.0,,,1.0,,,,,,,1.0,,,,,,1.0,,,,,,,,
"""201902""",19,,,1.0,,21,,21,18,,11,,,26,11,,14,1.0,21,19,22,,,19,24,,16,13,14,,,,,20,15,11,19,17,22,13,,16,,,,,13,,,,1.0,,,,,,1.0,,,,,,1.0,,,,,1.0,,,1.0,,,,,,1.0,,1.0,,,,1.0,,,1.0,,,,1.0,,,,,1.0,,1.0,,
"""201904""",16,,,,1.0,15,,14,14,,15,1.0,1.0,24,18,,16,,20,15,19,,,20,13,,14,19,15,,1.0,,,20,11,17,11,11,15,13,,17,1.0,,,1.0,13,,,,,3.0,,,,,,,2.0,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,,,,,,,,,,1.0,,,,,,
"""201903""",17,,,,,20,,17,11,,20,,,20,12,,21,,16,20,23,,,13,16,,14,21,26,,,,1.0,16,21,16,23,18,12,15,,8,,,1.0,,14,,1.0,1.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,1.0,,,,,,,
"""201901""",18,,,,,18,,21,21,,18,,1.0,18,17,,12,,21,15,17,,,15,13,,19,13,15,1.0,,1.0,,17,18,10,7,15,16,17,,18,,,,,11,,,,,,,1.0,,,,,,,,1.0,,,2.0,1.0,,,1.0,,,,,,,,,1.0,,1.0,,1.0,,1.0,,,,1.0,,,1.0,,,,,,,1.0,1.0


In [182]:
res = uriage_data.pivot(values = "item_price", index = "purchase_month", columns = "item_name", aggregate_function = "sum")
res

purchase_month,商品A,商 品 S,商 品 a,商品Z,商品a,商品S,商品 a,商品V,商品O,商 品U,商品L,商 品V,商 品O,商品C,商品I,商品r,商品X,商品 g,商品R,商品P,商品Q,商品y,商品 A,商品N,商品W,商 品E,商品K,商品B,商品F,商 品s,商品W,商 品 n,商 品F,商品D,商品M,商品Y,商品U,商品H,商品T,商品J,商 品O,商品E,商 品Q,商品S,商品M,商 品T,商品G,商 品G,商品P,商品E,商 品N,商 品Y,商品 J,商品 V,商品 K,商 品V,商 品D,商 品A,商品 F,商品 H,商 品K,商 品T,商品 X,商品 Q,商 品X,商 品H,商 品C,商品 B,商品 O,商品 T,商品v,商品p,商品i,商品 w,商 品 s,商 品 q,商品s,商品l,商品t,商品k,商品g,商品o,商品 R,商品 S,商 品M,商品j,商品d,商品 I,商品 E,商品 o,商品c,商品 v,商品e,商品x,商 品I,商品W,商品X,商品 M,商 品P
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64.1,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64.1,i64,i64,i64.1,i64.1,i64,i64,i64,i64.1,i64.1,i64,i64,i64,i64,i64,i64.1,i64,i64,i64,i64,i64,i64.1,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64.2,i64.1,i64,i64
"""201906""",2000,,,,,34200,,33000,19500,2100.0,14400,,1500.0,2400,12600,,24000,700.0,27000,24000,23800,,100.0,15400,25300,,15400,2400,9600,,,,,7200,19500,30000,29400,8800,22000,20000,,6500,,,,,7700,700.0,,,,,,0.0,,,,,,800.0,,,,,,,300.0,,,,,1600.0,900.0,,,,,,,,,,,,,,,,,,,,,2400.0,,2300.0,,,
"""201907""",1600,0.0,,,,38000,,52800,25500,,20400,,1500.0,4500,13500,1800.0,38400,,25200,38400,18700,,,18200,32200,500.0,9900,3600,9600,,,,,6000,13000,27500,23100,12800,34000,17000,0.0,4500,,,,,11900,,,,,,,2200.0,,2200.0,,,,,,,,,,800.0,,,,2000.0,,,900.0,,,1700.0,,,,,,1500.0,,,,,,,500.0,,,,,,,,,,
"""201905""",1900,,100.0,,,20900,0.0,8800,24000,,12000,,,3900,13500,,31200,,32400,28800,15300,2500.0,,23800,36800,,15400,2600,9000,,,,,5200,20800,45000,33600,12000,26000,8000,,9000,,1900.0,,,14000,,,,,,1000.0,,1100.0,2200.0,,100.0,600.0,,1100.0,,,1700.0,,,,,,2000.0,,,,2300.0,1900.0,,,1200.0,,,,,,,1300.0,,,,,,300.0,,,,,,,,
"""201902""",1700,,,2600.0,,32300,,39600,25500,,13200,,,6000,8100,,31200,700.0,32400,30400,30600,,,23800,43700,,14300,2400,6600,,,,,7600,16900,20000,37800,12800,40000,11000,,5500,,,,,7000,,,,1400.0,,,,,,400.0,,,,,,2400.0,,,,,0.0,,,2200.0,,,,,,1900.0,,2000.0,,,,1800.0,,,1000.0,,,,1500.0,,,,,900.0,,2400.0,,
"""201904""",1400,,,,0.0,28500,,19800,18000,,15600,2200.0,1500.0,6000,14400,,38400,,30600,22400,28900,,,22400,20700,,12100,2200,7800,,2300.0,,,7200,14300,32500,16800,7200,28000,13000,,8000,1700.0,,,2000.0,9100,,,,,7500.0,,,,,,,1200.0,,,,,,,,,,,,,,,,,,1900.0,,,,700.0,,,,,,,,,,,,500.0,,,,,,
"""201903""",1300,,,,,26600,,35200,15000,,21600,,,5100,10800,,38400,,27000,32000,34000,,,16800,34500,,14300,4200,11400,,,,600.0,6000,26000,35000,46200,14400,22000,13000,,4000,,,1300.0,,7700,,1600.0,500.0,,,1000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,400.0,,,,,0.0,,,,,,,
"""201901""",1500,,,,,28500,,41800,27000,,19200,,1500.0,5100,13500,,24000,,36000,20800,25500,,,21000,27600,,16500,1600,7800,1900.0,,1400.0,,6000,19500,20000,12600,10400,28000,16000,,6500,,,,,7000,,,,,,,2200.0,,,,,,,,2000.0,,,4800.0,800.0,,,1500.0,,,,,,,,,1200.0,,1100.0,,1500.0,,1900.0,,,,900.0,,,300.0,,,,,,,1300.0,1600.0


### ノック１４：商品名の揺れを補正しよう

In [183]:
print( len( uriage_data["item_name"].unique() ) )

99


In [184]:
uriage_data["item_name"].unique().sort().head()
# 商品名の揺れ
# 半角スペース "  商品W"
# 半角スペース 2個連続 "商  品O"
# 小文字のアルファベット " 商 品 n"

item_name
str
""" 商品W"""
""" 商 品 n"""
""" 商品E"""
""" 商品M"""
""" 商品P"""
""" 商品S"""
""" 商品W"""
""" 商品X"""
"""商 品O"""
"""商 品Q"""


In [185]:
uriage_data = (
    uriage_data
    .replace("item_name",
             uriage_data["item_name"]
             .str.to_uppercase()
             .str.replace_all("　", "")
             .str.replace_all(" ", ""))
)

uriage_data.sort(by = ["item_name"]).head(10)

    df = df.with_columns(new_column.alias(column_name))
instead.
  .replace("item_name",


purchase_date,item_name,item_price,customer_name,purchase_month
datetime[μs],str,i64,str,str
2019-06-13 18:02:34,"""商品A""",100.0,"""深井菜々美""","""201906"""
2019-05-11 19:42:07,"""商品A""",,"""南部慶二""","""201905"""
2019-04-22 03:09:35,"""商品A""",,"""平田鉄二""","""201904"""
2019-05-18 19:16:53,"""商品A""",,"""深井照生""","""201905"""
2019-01-28 10:47:03,"""商品A""",100.0,"""大地礼子""","""201901"""
2019-06-11 12:57:24,"""商品A""",,"""黄川田博之""","""201906"""
2019-06-29 10:01:52,"""商品A""",100.0,"""石田佑""","""201906"""
2019-01-12 03:26:36,"""商品A""",100.0,"""小平陽子""","""201901"""
2019-02-11 17:00:37,"""商品A""",100.0,"""原口俊二""","""201902"""
2019-06-14 09:08:17,"""商品A""",100.0,"""深井菜々美""","""201906"""


### ノック１５：金額欠損値の補完をしよう

In [186]:
uriage_data.select(pl.all().null_count())

purchase_date,item_name,item_price,customer_name,purchase_month
u32,u32,u32,u32,u32
0,0,387,0,0


In [187]:
# 製品価格の欠損値を埋めるために、{製品名: 製品価格}の辞書型を作成する
uriage_drop_null = uriage_data[["item_name", "item_price"]]\
    .sort(by = ["item_price"], descending = True)\
        .drop_nulls(subset = ["item_price"])\
            .unique(subset = ["item_name"])

uriage_dict = {}
for row in range(len(uriage_drop_null)):
    uriage_dict[uriage_drop_null[row, 0]] = uriage_drop_null[row, 1]
uriage_dict

del uriage_drop_null

In [188]:
uriage_drop_null = uriage_data[["item_name", "item_price"]]\
    .sort(by = ["item_price"], descending = True)\
        .drop_nulls(subset = ["item_price"])\
            .unique(subset = ["item_name"])
uriage_dict = {}
for row in range(len(uriage_drop_null)):
    uriage_dict[uriage_drop_null[row, 0]] = uriage_drop_null[row, 1]
uriage_dict

{'商品L': 1200,
 '商品V': 2200,
 '商品U': 2100,
 '商品Y': 2500,
 '商品G': 700,
 '商品Q': 1700,
 '商品B': 200,
 '商品J': 1000,
 '商品R': 1800,
 '商品X': 2400,
 '商品W': 2300,
 '商品D': 400,
 '商品Z': 2600,
 '商品O': 1500,
 '商品S': 1900,
 '商品N': 1400,
 '商品P': 1600,
 '商品A': 100,
 '商品H': 800,
 '商品C': 300,
 '商品E': 500,
 '商品K': 1100,
 '商品I': 900,
 '商品F': 600,
 '商品T': 2000,
 '商品M': 1300}

In [189]:
for trg in uriage_data["item_name"].sort().unique().to_list():
    (
        print(trg
              + "の最大額:" + str( uriage_data.filter( pl.col("item_name") == trg )["item_price"].max() )
              + "\tの最小額: " + str(uriage_data.filter( pl.col("item_name") == trg )["item_price"].min()) )
    )

商品Uの最大額:2100	の最小額: 2100
商品Jの最大額:1000	の最小額: 1000
商品Lの最大額:1200	の最小額: 1200
商品Kの最大額:1100	の最小額: 1100
商品Wの最大額:2300	の最小額: 2300
商品Oの最大額:1500	の最小額: 1500
商品Mの最大額:1300	の最小額: 1300
商品Zの最大額:2600	の最小額: 2600
商品Yの最大額:2500	の最小額: 2500
商品Aの最大額:100	の最小額: 100
商品Cの最大額:300	の最小額: 300
商品Sの最大額:1900	の最小額: 1900
商品Dの最大額:400	の最小額: 400
商品Pの最大額:1600	の最小額: 1600
商品Tの最大額:2000	の最小額: 2000
商品Fの最大額:600	の最小額: 600
商品Iの最大額:900	の最小額: 900
商品Nの最大額:1400	の最小額: 1400
商品Eの最大額:500	の最小額: 500
商品Gの最大額:700	の最小額: 700
商品Xの最大額:2400	の最小額: 2400
商品Rの最大額:1800	の最小額: 1800
商品Bの最大額:200	の最小額: 200
商品Vの最大額:2200	の最小額: 2200
商品Hの最大額:800	の最小額: 800
商品Qの最大額:1700	の最小額: 1700


### ノック１６：顧客名の揺れを補正しよう

In [190]:
kokyaku_data["顧客名"].head()

顧客名
str
"""須賀ひとみ"""
"""岡田 敏也"""
"""芳賀 希"""
"""荻野 愛"""
"""栗田 憲一"""
"""梅沢 麻緒"""
"""相原 ひとり"""
"""新村 丈史"""
"""石川 まさみ"""
"""小栗 正義"""


In [191]:
uriage_data["customer_name"].head()

customer_name
str
"""深井菜々美"""
"""浅田賢二"""
"""南部慶二"""
"""麻生莉緒"""
"""平田鉄二"""
"""堀江佑"""
"""深井照生"""
"""牧田玲那"""
"""堀北雅彦"""
"""大地礼子"""


In [192]:
#DeprecationWarning: `replace` is deprecated. DataFrame.replace is deprecated and will be removed in a future version. Please use
#    df = df.with_columns(new_column.alias(column_name))
#kokyaku_data = (
#    kokyaku_data.replace("顧客名",
#                         kokyaku_data["顧客名"].str.replace_all("　", "").str.replace_all(" ", ""))
#)

kokyaku_data = (
    kokyaku_data.with_columns(
        kokyaku_data["顧客名"].str.replace_all("　", "").str.replace_all(" ", "").alias("顧客名"))
)
kokyaku_data["顧客名"].head(n = 5)

顧客名
str
"""須賀ひとみ"""
"""岡田敏也"""
"""芳賀希"""
"""荻野愛"""
"""栗田憲一"""


### ノック１７：日付の揺れを補正しよう

In [193]:
flg_is_serial = kokyaku_data["登録日"].cast(pl.Int64, strict = False)
len( flg_is_serial.drop_nulls() )

22

In [194]:
#flg_is_serial = kokyaku_data["登録日"].astype("str").str.isdigit()
#flg_is_serial.sum()

In [195]:
#fromSerial = pd.to_timedelta(kokyaku_data.loc[flg_is_serial, "登録日"].astype("float") - 2, unit="D") + pd.to_datetime('1900/1/1')
#fromSerial

In [196]:
#fromString = pd.to_datetime(kokyaku_data.loc[~flg_is_serial, "登録日"])
#fromString

In [197]:
#kokyaku_data["登録日"] = pd.concat([fromSerial, fromString])
#kokyaku_data

In [198]:
#kokyaku_data["登録年月"] = kokyaku_data["登録日"].dt.strftime("%Y%m")
#rslt = kokyaku_data.groupby("登録年月").count()["顧客名"]
#print(rslt)
#print(len(kokyaku_data))

In [199]:
from datetime import datetime, timedelta

# Excelの日付形式をPythonのdatetime型に変換する関数を定義します。
def convert_excel_date_to_datetime(excel_date):
    return datetime(1899, 12, 30) + timedelta(days=int(excel_date))

# 日付と数字が混在したSeriesを、Excel書式の日付にに変換する
converted_date = (
    kokyaku_data["登録日"].cast(pl.Utf8)
    .map_elements(lambda x: convert_excel_date_to_datetime(x)
                  if x.isdigit() else datetime.strptime(x, "%Y/%m/%d"), return_dtype = pl.Date)
)

converted_date.head(n = 5)

kokyaku_data = (
    kokyaku_data.with_columns(
        converted_date.alias("登録日")
    )
)

In [200]:
kokyaku_data = (
    kokyaku_data.with_columns(
        converted_date.dt.strftime("%Y%m").alias("登録年月")
    )
)

rslt = kokyaku_data.group_by(by = "登録年月").agg( pl.col("顧客名").count() ).sort(by = "登録年月")
print(rslt)
print(len(kokyaku_data))

shape: (15, 2)
┌──────────┬────────┐
│ 登録年月 ┆ 顧客名 │
│ ---      ┆ ---    │
│ str      ┆ u32    │
╞══════════╪════════╡
│ 201701   ┆ 15     │
│ 201702   ┆ 11     │
│ 201703   ┆ 14     │
│ 201704   ┆ 15     │
│ 201705   ┆ 14     │
│ 201706   ┆ 13     │
│ 201707   ┆ 17     │
│ 201801   ┆ 13     │
│ 201802   ┆ 15     │
│ 201803   ┆ 17     │
│ 201804   ┆ 5      │
│ 201805   ┆ 19     │
│ 201806   ┆ 13     │
│ 201807   ┆ 17     │
│ 201904   ┆ 2      │
└──────────┴────────┘
200


In [201]:
flg_is_serial = kokyaku_data["登録日"].cast(pl.Int64, strict = False)
len( flg_is_serial.drop_nulls() )

200

### ノック１８：顧客名をキーに２つのデータを結合(ジョイン)しよう

In [202]:
join_data = (
    uriage_data
    .rename(mapping = {"customer_name": "顧客名"})
    .join(other = kokyaku_data, on="顧客名", how="left")
)
join_data.head()

purchase_date,item_name,item_price,顧客名,purchase_month,かな,地域,メールアドレス,登録日,登録年月
datetime[μs],str,i64,str,str,str,str,str,datetime[μs],str
2019-06-13 18:02:34,"""商品A""",100.0,"""深井菜々美""","""201906""","""ふかい ななみ""","""C市""","""fukai_nanami@e…",2017-01-26 00:00:00,"""201701"""
2019-07-13 13:05:29,"""商品S""",,"""浅田賢二""","""201907""","""あさだ けんじ""","""C市""","""asada_kenji@ex…",2018-04-07 00:00:00,"""201804"""
2019-05-11 19:42:07,"""商品A""",,"""南部慶二""","""201905""","""なんぶ けいじ""","""A市""","""nannbu_keiji@e…",2018-06-19 00:00:00,"""201806"""
2019-02-12 23:40:45,"""商品Z""",2600.0,"""麻生莉緒""","""201902""","""あそう りお""","""D市""","""asou_rio@examp…",2018-07-22 00:00:00,"""201807"""
2019-04-22 03:09:35,"""商品A""",,"""平田鉄二""","""201904""","""ひらた てつじ""","""D市""","""hirata_tetsuji…",2017-06-07 00:00:00,"""201706"""


### ノック１９：クレンジングしたデータをダンプしよう

In [203]:
dump_data = join_data[["purchase_date", "purchase_month", "item_name", "item_price", "顧客名", "かな", "地域", "メールアドレス", "登録日"]]
dump_data.head()

purchase_date,purchase_month,item_name,item_price,顧客名,かな,地域,メールアドレス,登録日
datetime[μs],str,str,i64,str,str,str,str,datetime[μs]
2019-06-13 18:02:34,"""201906""","""商品A""",100.0,"""深井菜々美""","""ふかい ななみ""","""C市""","""fukai_nanami@e…",2017-01-26 00:00:00
2019-07-13 13:05:29,"""201907""","""商品S""",,"""浅田賢二""","""あさだ けんじ""","""C市""","""asada_kenji@ex…",2018-04-07 00:00:00
2019-05-11 19:42:07,"""201905""","""商品A""",,"""南部慶二""","""なんぶ けいじ""","""A市""","""nannbu_keiji@e…",2018-06-19 00:00:00
2019-02-12 23:40:45,"""201902""","""商品Z""",2600.0,"""麻生莉緒""","""あそう りお""","""D市""","""asou_rio@examp…",2018-07-22 00:00:00
2019-04-22 03:09:35,"""201904""","""商品A""",,"""平田鉄二""","""ひらた てつじ""","""D市""","""hirata_tetsuji…",2017-06-07 00:00:00


In [204]:
dump_data.write_csv("dump_data.csv")

### ノック２０：データを集計しよう

In [205]:
import_data = pl.read_csv("dump_data.csv")
import_data.head()

purchase_date,purchase_month,item_name,item_price,顧客名,かな,地域,メールアドレス,登録日
str,i64,str,i64,str,str,str,str,str
"""2019-06-13T18:…",201906,"""商品A""",100.0,"""深井菜々美""","""ふかい ななみ""","""C市""","""fukai_nanami@e…","""2017-01-26T00:…"
"""2019-07-13T13:…",201907,"""商品S""",,"""浅田賢二""","""あさだ けんじ""","""C市""","""asada_kenji@ex…","""2018-04-07T00:…"
"""2019-05-11T19:…",201905,"""商品A""",,"""南部慶二""","""なんぶ けいじ""","""A市""","""nannbu_keiji@e…","""2018-06-19T00:…"
"""2019-02-12T23:…",201902,"""商品Z""",2600.0,"""麻生莉緒""","""あそう りお""","""D市""","""asou_rio@examp…","""2018-07-22T00:…"
"""2019-04-22T03:…",201904,"""商品A""",,"""平田鉄二""","""ひらた てつじ""","""D市""","""hirata_tetsuji…","""2017-06-07T00:…"


In [206]:
byItem = (
    import_data
    .pivot(index = "purchase_month",
           columns = "item_name",
           values = "item_name",
           aggregate_function = "count",
           sort_columns = True)
           .sort(by = "purchase_month")
)
byItem

purchase_month,商品A,商品B,商品C,商品D,商品E,商品F,商品G,商品H,商品I,商品J,商品K,商品L,商品M,商品N,商品O,商品P,商品Q,商品R,商品S,商品T,商品U,商品V,商品W,商品X,商品Y,商品Z
i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
201901,18,13,19,17,18,15,11,16,18,17,20,19,19,16,24,16,17,21,20,17,7,22,13,14,10,
201902,19,14,26,21,16,14,14,17,12,14,16,11,15,20,19,19,22,22,22,23,19,22,24,16,11,1.0
201903,17,21,20,17,9,27,14,18,12,16,14,20,22,13,11,21,23,16,20,12,23,18,16,21,16,
201904,17,19,24,20,18,17,14,11,18,13,14,15,11,20,15,15,20,20,16,16,11,15,14,16,20,
201905,24,14,16,14,19,18,23,15,16,11,18,13,18,19,18,20,13,22,18,16,16,9,21,16,20,
201906,24,12,11,19,13,18,15,13,19,22,15,15,17,16,15,18,15,16,21,12,18,20,17,15,13,
201907,20,20,17,17,12,17,19,19,19,23,12,17,11,15,22,26,15,19,23,21,13,28,16,18,12,


In [207]:
byPrice = (
    import_data
    .pivot(index = "purchase_month",
           columns = "item_name",
           values = "item_price",
           aggregate_function = "sum",
           sort_columns = True)
           .sort(by = "purchase_month")
)
byPrice

purchase_month,商品A,商品B,商品C,商品D,商品E,商品F,商品G,商品H,商品I,商品J,商品K,商品L,商品M,商品N,商品O,商品P,商品Q,商品R,商品S,商品T,商品U,商品V,商品W,商品X,商品Y,商品Z
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
201901,1500,1600,5400,6000,6500,7800,7000,11200,14400,16000,17600,20400,20800,22400,31500,22400,25500,36000,32300,30000,12600,44000,27600,28800,20000,
201902,1700,2400,6000,8000,5500,6600,7700,12800,9000,12000,14300,13200,16900,25200,27000,30400,30600,34200,34200,42000,37800,41800,43700,36000,20000,2600.0
201903,1300,4200,5100,6400,4500,12000,7700,14400,10800,14000,14300,21600,27300,16800,15000,33600,34000,27000,26600,22000,46200,35200,34500,38400,35000,
201904,1400,2200,6000,7200,8500,9000,9800,7200,14400,13000,12100,15600,14300,22400,19500,22400,30600,30600,30400,30000,16800,22000,23000,38400,40000,
201905,2100,2600,4200,5200,9000,9600,14000,12000,13500,9000,17600,13200,22100,23800,24000,28800,17000,32400,24700,28000,33600,11000,39100,31200,47500,
201906,2100,2400,2700,7200,6500,9600,9100,9600,13500,20000,15400,14400,19500,15400,21000,25600,23800,27000,34200,22000,31500,33000,27600,26400,30000,
201907,1600,3600,4500,6000,5500,9600,11900,13600,14400,17000,9900,20400,13000,18200,28500,38400,20400,27000,38000,36000,23100,57200,32200,38400,27500,


In [208]:
byCustomer = (
    import_data
    .pivot(index = "purchase_month",
           columns = "顧客名",
           values = "顧客名",
           aggregate_function = "count",
           sort_columns = True)
           .sort(by = "purchase_month")
)
byCustomer

purchase_month,さだ千佳子,中仁晶,中田美智子,丸山光臣,久保田倫子,亀井一徳,五十嵐春樹,井上桃子,井口寛治,井川真悠子,井川里穂,井本マサカズ,井村俊二,今茜,佐藤慶二,八木雅彦,内村まさみ,内田聡,南部慶二,原口俊二,古川信吾,合田光,吉岡サダヲ,吉村愛梨,和泉直人,唐沢景子,唐沢涼,土屋朝陽,城戸芳正,堀サンタマリア,堀内聡,堀北雅彦,堀江佑,外山広司,大倉晃司,大地礼子,大城ケンイチ,大山咲,大崎ヒカル,大滝麗奈,大西隆之介,奥光洋,宇野秀樹,小口豊,小川美菜,小平陽子,小松季衣,小松隼士,小栗正義,小町瞬,尾上勝久,尾形小雁,山口法子,山西花,岡慶太,岡村希,岡田敏也,岩井莉緒,岩佐孝太郎,岩城徹平,岩沢那奈,島孝太郎,島崎礼子,島本研二,島英嗣,島袋友以乃,川上りえ,川島友以乃,市田寿明,平田鉄二,平賀一哉,影山輝信,徳重優,志村サダヲ,戸塚美幸,手塚進,手塚雅之,新村丈史,新村美月,新谷智花,日比野徹,日野夏希,明石家明,星野美嘉,有馬徹平,望月真悠子,本多フミヤ,本橋直人,杉下悟志,杉田将也,村山知世,東光博,松元翔太,松居満,松岡ノブヒコ,松川綾女,松村聡,松沢育二,松田浩正,松谷愛子,板橋隆,林勇,栗田憲一,根岸仁晶,根岸莉央,根本博明,桑原桃子,梅村秀樹,梅沢麻緒,梅津淳子,森岡季衣,植木沙知絵,植村遥,楠哲平,楠高史,榊原しぼり,榎本薫,横田遥,水野メイサ,沖遥,河内さとみ,河村由樹,浅田賢二,浅見広司,浜田未華子,深井照生,深井菜々美,深沢ひろ子,深田信輔,清水佑,清水裕次郎,熊井憲史,熊倉明日,熊倉綾,片瀬長利,牧田玲那,田上美佐子,田崎菜々美,田畑正敏,田辺きみまろ,田辺光洋,白井俊二,白鳥りえ,相原ひとり,相川良介,矢沢恵梨香,矢部夏空,矢部惇,矢部美幸,石崎幸子,石川まさみ,石渡小雁,石田佑,石田花,石田郁恵,石野仁,磯野希,神原美嘉,福島友也,福本美幸,秋葉あき,稲田将也,立石茜,笹原しぼり,笹川照生,篠山雅功,米沢仁晶,綾瀬俊介,芦田博之,芳賀希,若杉徹,荻野愛,荻野愛菜,菅原誠治,藤広之,藤木一恵,西原未華子,西島知世,西脇礼子,谷本愛梨,赤木だん吉,赤木愛梨,進藤瞬,那須蒼甫,野本仁晶,金森なつみ,鈴木一哉,青山鉄洋,須賀ひとみ,香椎優一,高原充則,高梨結衣,高沢美咲,高田さんま,鳥居広司,鶴岡薫,麻生莉緒,黄川田博之,黒谷長利
i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
201901,3.0,1.0,4.0,2,2.0,,5,3.0,3.0,1,4.0,1,4.0,3,4,4,2.0,1,,4.0,2,1.0,1.0,2,3.0,1.0,,1,2.0,2.0,3.0,1,4,6.0,,4.0,1,3,2,3,,2,2.0,2.0,3.0,5,,5.0,1,2.0,3,2,4.0,4.0,3.0,1.0,2,,1,2.0,1.0,,1.0,2.0,2.0,5,2.0,6.0,3,1,3.0,,,4,5,1,3,1.0,2.0,3.0,1,3.0,,3.0,4,1.0,3.0,1.0,3.0,4.0,2,2,1.0,1.0,1.0,1,3,3.0,2.0,3,3.0,1,2,,1.0,2.0,,2,2.0,2,,1.0,1,1.0,2.0,2,2.0,1.0,,2,2,2.0,3,,4,3.0,1.0,1,3.0,2,1,,1,2,1,4.0,3.0,2,1,1.0,2.0,2,3.0,1.0,1.0,1.0,1.0,2.0,1,3,4.0,1.0,3,3.0,3,3.0,2.0,,4,1,1.0,2.0,4,,1,1,4,4.0,2,1,2.0,3,2,4.0,4.0,,7.0,1,,2,5,3,2,2.0,2.0,2.0,4.0,2.0,2,,1.0,1.0,1.0,5.0,2.0,,2,2.0,5.0
201902,9.0,1.0,2.0,2,1.0,4.0,2,1.0,,4,2.0,3,1.0,1,3,2,3.0,2,4.0,2.0,1,2.0,,1,5.0,1.0,1.0,3,1.0,3.0,3.0,1,3,1.0,2.0,2.0,1,1,1,1,1.0,3,,2.0,1.0,3,6.0,1.0,4,3.0,2,5,2.0,2.0,5.0,5.0,4,4.0,4,1.0,,,,3.0,3.0,1,5.0,2.0,3,4,2.0,2.0,3.0,1,5,1,5,1.0,,1.0,5,2.0,,,1,1.0,2.0,4.0,,,2,4,5.0,4.0,1.0,4,3,2.0,3.0,2,1.0,4,3,2.0,1.0,,3.0,1,4.0,1,3.0,1.0,1,2.0,1.0,1,3.0,,4.0,3,5,,2,3.0,3,3.0,3.0,1,,2,7,4.0,1,1,3,5.0,3.0,2,4,2.0,,4,,2.0,3.0,2.0,,1.0,2,3,5.0,5.0,3,2.0,2,5.0,3.0,4.0,2,2,5.0,2.0,1,,1,2,2,3.0,2,1,1.0,2,2,7.0,1.0,1.0,2.0,1,1.0,1,3,3,2,1.0,2.0,4.0,1.0,4.0,3,4.0,,3.0,2.0,,1.0,2.0,4,,1.0
201903,1.0,2.0,1.0,6,1.0,4.0,3,3.0,2.0,2,1.0,2,,3,2,2,3.0,2,1.0,5.0,1,4.0,2.0,2,,3.0,4.0,1,,1.0,5.0,2,5,,1.0,2.0,1,3,2,3,,2,,4.0,,1,,2.0,2,,2,3,3.0,1.0,2.0,5.0,3,2.0,2,,1.0,1.0,4.0,2.0,3.0,3,2.0,2.0,2,3,,5.0,1.0,3,5,1,2,2.0,4.0,2.0,2,1.0,4.0,,3,,1.0,1.0,1.0,2.0,1,5,2.0,2.0,,2,5,6.0,2.0,1,,5,5,2.0,4.0,2.0,,2,,2,2.0,,3,2.0,3.0,5,2.0,3.0,,4,5,1.0,2,3.0,4,2.0,4.0,2,5.0,2,1,3.0,2,2,4,1.0,4.0,2,2,1.0,1.0,3,2.0,1.0,2.0,3.0,4.0,2.0,3,2,4.0,,2,,4,3.0,3.0,3.0,2,2,2.0,1.0,1,2.0,5,5,1,2.0,1,1,,1,2,1.0,5.0,3.0,1.0,3,2.0,1,3,1,1,5.0,2.0,,1.0,,2,3.0,1.0,6.0,2.0,4.0,2.0,4.0,2,2.0,1.0
201904,,3.0,1.0,2,,2.0,2,,3.0,2,2.0,1,4.0,1,3,1,1.0,4,5.0,3.0,1,2.0,1.0,3,4.0,4.0,4.0,6,,,1.0,4,3,1.0,2.0,,2,2,2,1,2.0,4,1.0,1.0,2.0,2,3.0,2.0,2,1.0,3,4,3.0,,1.0,,3,3.0,1,1.0,1.0,2.0,,3.0,5.0,5,3.0,3.0,1,3,1.0,1.0,2.0,2,4,2,1,2.0,1.0,2.0,3,1.0,3.0,,1,3.0,1.0,1.0,3.0,4.0,1,2,2.0,1.0,1.0,2,4,,5.0,3,2.0,4,1,4.0,1.0,1.0,4.0,4,,5,,1.0,5,,6.0,1,,3.0,1.0,1,5,2.0,4,4.0,3,,1.0,2,1.0,4,3,1.0,2,1,2,5.0,1.0,1,2,,4.0,2,1.0,,4.0,,3.0,1.0,4,2,,,3,1.0,1,,1.0,1.0,1,2,2.0,,2,1.0,5,2,4,2.0,1,2,2.0,4,2,3.0,1.0,1.0,1.0,4,,1,1,3,4,3.0,,1.0,3.0,2.0,3,2.0,4.0,2.0,3.0,4.0,3.0,2.0,1,2.0,
201905,3.0,2.0,5.0,2,4.0,1.0,2,1.0,3.0,3,,3,1.0,1,1,3,2.0,1,2.0,1.0,3,1.0,,6,,,4.0,3,,2.0,1.0,1,3,,1.0,,1,2,4,4,,2,1.0,1.0,3.0,1,2.0,1.0,1,1.0,3,2,1.0,2.0,3.0,3.0,2,2.0,2,2.0,3.0,5.0,2.0,1.0,6.0,3,,,2,2,3.0,2.0,3.0,4,1,3,4,,5.0,,5,6.0,1.0,4.0,3,,5.0,4.0,,,2,3,3.0,2.0,3.0,5,2,4.0,5.0,1,,2,2,3.0,2.0,4.0,2.0,2,1.0,4,1.0,2.0,1,2.0,,2,5.0,1.0,2.0,1,1,1.0,3,1.0,1,3.0,,1,1.0,1,1,3.0,1,1,2,2.0,3.0,1,3,1.0,,2,3.0,1.0,4.0,3.0,3.0,2.0,6,1,6.0,2.0,4,1.0,2,2.0,3.0,5.0,4,6,4.0,,2,1.0,1,2,1,4.0,2,1,2.0,4,1,1.0,,5.0,1.0,2,1.0,4,2,4,2,,1.0,1.0,6.0,2.0,2,1.0,1.0,1.0,,2.0,2.0,3.0,4,4.0,1.0
201906,1.0,3.0,,4,1.0,1.0,1,2.0,2.0,3,2.0,1,1.0,1,2,4,,5,2.0,,1,,2.0,3,1.0,2.0,3.0,1,,,2.0,4,4,2.0,3.0,3.0,4,4,1,2,1.0,4,2.0,1.0,2.0,2,2.0,,4,2.0,1,1,,6.0,1.0,1.0,4,2.0,4,2.0,4.0,2.0,1.0,2.0,,2,1.0,3.0,3,3,1.0,2.0,1.0,2,1,1,1,2.0,3.0,3.0,3,,1.0,4.0,2,,5.0,2.0,3.0,,1,2,,3.0,2.0,2,3,2.0,,2,3.0,1,1,3.0,3.0,,6.0,2,1.0,4,1.0,2.0,2,2.0,3.0,1,4.0,3.0,1.0,4,5,3.0,2,1.0,5,,5.0,4,2.0,1,2,5.0,3,1,1,,,2,2,2.0,1.0,1,3.0,2.0,,,1.0,4.0,1,2,3.0,,2,,2,2.0,2.0,2.0,2,6,,2.0,1,3.0,1,3,2,4.0,1,1,4.0,3,2,1.0,5.0,2.0,,1,2.0,5,1,3,3,1.0,2.0,,3.0,3.0,4,7.0,3.0,,2.0,1.0,,2.0,1,2.0,4.0
201907,3.0,,3.0,2,5.0,3.0,5,2.0,5.0,5,6.0,2,,2,1,1,3.0,3,,1.0,2,2.0,2.0,1,2.0,2.0,2.0,1,1.0,2.0,,1,1,2.0,1.0,2.0,1,1,1,1,3.0,1,4.0,,3.0,2,2.0,3.0,1,1.0,1,2,2.0,2.0,,3.0,1,1.0,6,3.0,,4.0,,,,7,3.0,2.0,4,3,3.0,2.0,5.0,4,3,4,1,2.0,5.0,2.0,5,2.0,2.0,4.0,4,2.0,,,5.0,1.0,3,3,1.0,,3.0,1,1,1.0,2.0,2,4.0,3,5,4.0,,4.0,4.0,2,5.0,1,4.0,2.0,2,2.0,1.0,1,3.0,4.0,1.0,1,1,,2,1.0,2,6.0,3.0,2,2.0,2,1,1.0,4,1,1,2.0,2.0,3,1,2.0,5.0,1,3.0,2.0,4.0,1.0,6.0,,9,1,3.0,4.0,2,3.0,3,2.0,,1.0,3,4,3.0,3.0,1,1.0,1,2,2,,2,3,4.0,2,1,,1.0,2.0,1.0,1,4.0,3,4,2,3,4.0,3.0,2.0,,,4,2.0,4.0,4.0,2.0,,2.0,4.0,3,4.0,1.0


In [209]:
byRegion = (
    import_data
    .pivot(index = "purchase_month",
           columns = "地域",
           values = "地域",
           aggregate_function = "count",
           sort_columns = True)
           .sort(by = "purchase_month")
)
byRegion

purchase_month,A市,B市,C市,D市,E市,F市,G市,H市
i64,u32,u32,u32,u32,u32,u32,u32,u32
201901,59,55,72,34,49,57,49,42
201902,71,46,65,48,61,52,43,63
201903,64,52,57,43,52,59,51,59
201904,64,48,54,45,48,58,40,52
201905,57,52,68,48,59,65,35,43
201906,53,47,61,30,51,51,58,58
201907,76,53,61,42,54,64,47,54


In [210]:
#away_data = pl.merge(uriage_data, kokyaku_data, left_on="customer_name", right_on="顧客名", how="right")
#away_data[away_data["purchase_date"].isnull()][["顧客名", "メールアドレス", "登録日"]]

away_data = (
    uriage_data
    .rename(mapping = {"customer_name": "顧客名"})
    .join(other = kokyaku_data, on="顧客名", how="left")
)
away_data.filter( pl.col("purchase_date").is_null() )[["顧客名", "メールアドレス", "登録日"]]


顧客名,メールアドレス,登録日
str,str,datetime[μs]
