# データ処理方法 -polars- 
ここではpolarsを用いたデータ処理のコードを書く．

## ライブラリのインポート

In [1]:
import polars as pl

In [2]:
print(pl.__version__)

1.3.0


## データの読み込み
今回は3種類のデータを用意
* データ1
    * 100万行4列
* データ2
    * 50万行4列
    * データ1と同じ列
* 重みデータ
    * データ1とデータ2に存在する2つの列("group1", "group2")の各値に対する重み値を保存しているデータ

In [3]:
%%timeit
data1 = pl.read_csv("./sample_data/data1.csv")

297 ms ± 107 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
data1 = pl.read_csv("./sample_data/data1.csv")
data2 = pl.read_csv("./sample_data/data2.csv")
weight_data = pl.read_excel("./sample_data/weight_data.xlsx")

## データの確認

* サイズの確認
    * `shape`
* データの表示
    * `head()`
    * `tail()`
    * `sample()`
* 要約量の表示
    * `describe()`

In [5]:
data1.shape, data2.shape, weight_data.shape

((1000000, 4), (500000, 4), (9, 3))

In [6]:
# 先頭から10行を表示
data1.head(n=10)

id,group1,group2,value
i64,str,str,f64
1,"""C""","""X""",0.875341
2,"""C""","""Z""",-1.274481
3,"""A""","""Z""",0.29918
4,"""C""","""Z""",-1.326054
5,"""B""","""Z""",0.166297
6,"""A""","""Z""",-0.407868
7,"""C""","""Y""",-1.155394
8,"""A""","""Y""",0.717269
9,"""B""","""Z""",-0.385354
10,"""B""","""X""",-1.148564


In [7]:
# ランダムに5行表示
data2.sample(n=5)

id,group1,group2,value
i64,str,str,f64
1255388,"""A""","""Y""",-1.170482
1051319,"""C""","""Z""",0.75141
1467314,"""B""","""X""",1.67183
1123499,"""A""","""Z""",-1.904677
1312794,"""C""","""Y""",-0.55197


In [8]:
weight_data.tail(n=9)

group1,group2,weight
str,str,i64
"""A""","""X""",5
"""A""","""Y""",6
"""A""","""Z""",1
"""B""","""X""",4
"""B""","""Y""",9
"""B""","""Z""",8
"""C""","""X""",5
"""C""","""Y""",4
"""C""","""Z""",4


In [9]:
# 要約量を表示する
data1.describe()

statistic,id,group1,group2,value
str,f64,str,str,f64
"""count""",1000000.0,"""967744""","""967886""",1000000.0
"""null_count""",0.0,"""32256""","""32114""",0.0
"""mean""",500000.5,,,0.000969
"""std""",288675.278932,,,1.001348
"""min""",1.0,"""A""","""X""",-4.729759
"""25%""",250001.0,,,-0.675543
"""50%""",500001.0,,,0.00025
"""75%""",750000.0,,,0.675406
"""max""",1000000.0,"""C""","""Z""",4.715991


## データ操作
ここでの操作方法は後の前処理でたくさん使われる．
* 列の操作
    * `select()`
* 行の操作
    * `filter()`
* グループ化
    * `group_by()`

### 列の操作

In [10]:
# すべての列を表示
data1.select(pl.col("*"))

id,group1,group2,value
i64,str,str,f64
1,"""C""","""X""",0.875341
2,"""C""","""Z""",-1.274481
3,"""A""","""Z""",0.29918
4,"""C""","""Z""",-1.326054
5,"""B""","""Z""",0.166297
…,…,…,…
999996,"""A""","""X""",0.503502
999997,"""B""","""Y""",-1.03132
999998,"""A""","""Y""",-0.157509
999999,"""A""","""Z""",0.811663


In [11]:
# 正規表現で列を抽出する
data1.select(pl.col("^gro.*$"))

group1,group2
str,str
"""C""","""X"""
"""C""","""Z"""
"""A""","""Z"""
"""C""","""Z"""
"""B""","""Z"""
…,…
"""A""","""X"""
"""B""","""Y"""
"""A""","""Y"""
"""A""","""Z"""


In [12]:
# ある列以外を抽出する
data1.select(pl.exclude("^gro.*$"))

id,value
i64,f64
1,0.875341
2,-1.274481
3,0.29918
4,-1.326054
5,0.166297
…,…
999996,0.503502
999997,-1.03132
999998,-0.157509
999999,0.811663


In [13]:
# ある列に出現する値の個数を数える
data1.select(pl.col("group1").value_counts())

group1
struct[2]
"{null,32256}"
"{""B"",322250}"
"{""A"",322207}"
"{""C"",323287}"


### 行の操作

In [14]:
# 行の抽出
data1.filter(pl.col("group1") == "A")

id,group1,group2,value
i64,str,str,f64
3,"""A""","""Z""",0.29918
6,"""A""","""Z""",-0.407868
8,"""A""","""Y""",0.717269
13,"""A""","""Z""",-0.299052
14,"""A""","""Z""",0.623657
…,…,…,…
999983,"""A""","""X""",-0.613297
999985,"""A""","""Z""",0.457641
999996,"""A""","""X""",0.503502
999998,"""A""","""Y""",-0.157509


In [15]:
# 複数条件を使う
# 変数を使うことも可能
foo = "X"
data1.filter((pl.col("group1") == "A") & (pl.col("group2") == f"{foo}"))
# & の場合は `,` で書くこともできる
# data1.filter(pl.col("group1") == "A", pl.col("group2") == "X")
# OR の場合 `&` ではなく， `|` を使う
# data1.filter((pl.col("group1") == "A") | (pl.col("group2") == "X"))

id,group1,group2,value
i64,str,str,f64
33,"""A""","""X""",-0.35049
39,"""A""","""X""",2.41911
47,"""A""","""X""",0.155541
48,"""A""","""X""",0.567384
61,"""A""","""X""",-0.182846
…,…,…,…
999954,"""A""","""X""",2.055218
999955,"""A""","""X""",0.465499
999968,"""A""","""X""",-0.4807
999983,"""A""","""X""",-0.613297


In [16]:
# リストを条件に使う時
data1.filter(pl.col("group1").is_in(["A", "C"]))

id,group1,group2,value
i64,str,str,f64
1,"""C""","""X""",0.875341
2,"""C""","""Z""",-1.274481
3,"""A""","""Z""",0.29918
4,"""C""","""Z""",-1.326054
6,"""A""","""Z""",-0.407868
…,…,…,…
999994,"""C""",,-0.180725
999995,"""C""","""X""",1.733561
999996,"""A""","""X""",0.503502
999998,"""A""","""Y""",-0.157509


In [17]:
# 欠損値を表示させる
# group1, group2が共に欠損の行を抽出する
data1.filter(pl.col("group1").is_null(), pl.col("group2").is_null())

id,group1,group2,value
i64,str,str,f64
285,,,-0.619982
817,,,-1.530416
1207,,,-0.837263
3683,,,-0.010933
5073,,,0.218643
…,…,…,…
996682,,,-0.574811
997955,,,-0.833321
998194,,,-0.306645
998586,,,1.103169


### グループ化

In [18]:
# グループ化の際，キー列の欠損値(null)は除外されず，1つのグループとして集計される
# また順序は保存されない(maintain_order = True とすることで，順序の保持が可能．ただし性能が落ちるので，あとでソートすれば良さそう)
data1.group_by("group1").agg(pl.mean("value"))

group1,value
str,f64
"""B""",0.001216
"""C""",0.002538
,-0.006814
"""A""",-7.2e-05


## 前処理

* 重複の削除
* データフレームの結合
* 欠損値処理
* 新しい列の作成
* 縦長データ <-> 横長データの変換

In [19]:
df = pl.DataFrame({"A": ["X", "X", "Y"], "B": [1, 1, 1]})
df.unique()

A,B
str,i64
"""Y""",1
"""X""",1


### データフレームの結合
* 縦に結合する
    * `vstack()`
* 横に結合する
    * `join()`

In [20]:
# 縦に結合
# data1の下にdata2を結合し，新しいデータフレームを作成する
data12 = data1.vstack(data2)
data12.shape

(1500000, 4)

In [21]:
data12.head()

id,group1,group2,value
i64,str,str,f64
1,"""C""","""X""",0.875341
2,"""C""","""Z""",-1.274481
3,"""A""","""Z""",0.29918
4,"""C""","""Z""",-1.326054
5,"""B""","""Z""",0.166297


In [22]:
# weight_dataを結合するために，データを確認する
weight_data.head(n=10)

group1,group2,weight
str,str,i64
"""A""","""X""",5
"""A""","""Y""",6
"""A""","""Z""",1
"""B""","""X""",4
"""B""","""Y""",9
"""B""","""Z""",8
"""C""","""X""",5
"""C""","""Y""",4
"""C""","""Z""",4


In [23]:
# 横に結合する
# キーはgroup1列，group2列同士で行う
all_data = data12.join(other=weight_data,
                       left_on=["group1", "group2"],
                       right_on=["group1", "group2"],
                       how="left")

In [24]:
all_data.shape

(1500000, 5)

In [25]:
# データを表示
all_data.sample(n=5)

id,group1,group2,value,weight
i64,str,str,f64,i64
148706,"""A""","""Z""",0.944855,1
184594,"""C""","""Z""",0.312546,4
1104865,"""C""","""X""",0.204857,5
696629,"""B""","""Z""",-1.188813,8
379543,"""B""","""Y""",0.795852,9


In [26]:
# 欠損値を持つ行を表示．
# 結合方法を左結合にしたため，左で欠損を持つ行(厳密には，右側のデータにない結合キー)も残っている
all_data.filter(pl.col("group1").is_null())

id,group1,group2,value,weight
i64,str,str,f64,i64
18,,"""Y""",0.405518,
20,,"""Y""",0.105258,
67,,"""Z""",0.588424,
68,,"""Y""",1.016913,
78,,"""Y""",-0.061921,
…,…,…,…,…
999826,,"""Z""",0.200896,
999932,,"""Y""",-0.607453,
999952,,"""Y""",-1.488471,
999967,,"""Z""",-0.100375,


### 欠損値処理
* 欠損値の除去
    * `drop_nulls()`
* 欠損値の補完
    * `fill_null()`

ここでは，分かりやすさのために，少数データで確認する

In [27]:
sample_data = pl.DataFrame(
    {
        "a": [None, None, None, 3],
        "b": [1, 2, None, 1],
        "c": [1, None, None, 1],
    }
)
sample_data

a,b,c
i64,i64,i64
,1.0,1.0
,2.0,
,,
3.0,1.0,1.0


In [28]:
# b列が欠損である行を削除する
sample_data.drop_nulls("b")

a,b,c
i64,i64,i64
,1,1.0
,2,
3.0,1,1.0


In [29]:
# 1列でも欠損値を持つ行を削除する
sample_data.drop_nulls()

a,b,c
i64,i64,i64
3,1,1


In [30]:
# すべての列の値が欠損である行を削除する
sample_data.filter(~pl.all_horizontal(pl.all().is_null()))

a,b,c
i64,i64,i64
,1,1.0
,2,
3.0,1,1.0


In [31]:
# 全ての行が欠損である列を削除する
sample_data2 = pl.DataFrame(
    {
        "a": [None, None, None],
        "b": [1, 2, None],
        "c": [1, None, None],
    }
)
print(sample_data2)
sample_data2[[s.name for s in sample_data2 if not (s.null_count() == sample_data2.height)]]

shape: (3, 3)
┌──────┬──────┬──────┐
│ a    ┆ b    ┆ c    │
│ ---  ┆ ---  ┆ ---  │
│ null ┆ i64  ┆ i64  │
╞══════╪══════╪══════╡
│ null ┆ 1    ┆ 1    │
│ null ┆ 2    ┆ null │
│ null ┆ null ┆ null │
└──────┴──────┴──────┘


b,c
i64,i64
1.0,1.0
2.0,
,


In [32]:
# 欠損値の補完
sample_data.fill_null(-99)

a,b,c
i64,i64,i64
-99,1,1
-99,2,-99
-99,-99,-99
3,1,1


In [33]:
# 欠損値を各列の最大値で補完
sample_data.fill_null(strategy="max")

a,b,c
i64,i64,i64
3,1,1
3,2,1
3,2,1
3,1,1


In [34]:
# 今後の処理で欠損値があると良くないので，結合後のデータ(all_data)から欠損値を含む行を削除する
all_data = all_data.drop_nulls()

### 新しい列の作成
* 列の作成
    * `with_columns()`
    * かなり自由度が高く，四則演算や文字列の扱い，条件分岐もデフォルトの関数で使用可能．
    * 定義済みの関数であるためパフォーマンスが非常に高い
    * もちろん，自作関数も使用できるが，パフォーマンスが落ちるため，可能な限り定義済み関数を用いる
    * 自作関数を用いる場合，`map_elements(my_func, return_dtype=pl.xxx)`を用いる

In [35]:
all_data.with_columns(
    # 四則演算や文字列の扱い
    (pl.col("value")*2).alias("value_x2"),
    (pl.col("group1") + "-" + pl.col("group2")).alias("group1_2"),
    (pl.col("value") * pl.col("weight")).alias("value_x_weight"),
).with_columns(
    # 条件分岐
    pl.when(pl.col("group1") == "A")
    .then(pl.col("value"))
    .when(pl.col("group1") == "B")
    .then(pl.col("value") * 100)
    .otherwise(0)
    .alias("foo")
).with_columns(
    # グループごとに統計量を求め，その値を各行に返すことも可能
    pl.col("value").max().over("group1").alias("group1_max")
).with_columns(
    (pl.col("value") - pl.col("group1_max")).alias("value_diff_max_group1")
)

id,group1,group2,value,weight,value_x2,group1_2,value_x_weight,foo,group1_max,value_diff_max_group1
i64,str,str,f64,i64,f64,str,f64,f64,f64,f64
1,"""C""","""X""",0.875341,5,1.750682,"""C-X""",4.376704,0.0,4.715991,-3.84065
2,"""C""","""Z""",-1.274481,4,-2.548962,"""C-Z""",-5.097924,0.0,4.715991,-5.990472
3,"""A""","""Z""",0.29918,1,0.59836,"""A-Z""",0.29918,0.29918,4.487273,-4.188093
4,"""C""","""Z""",-1.326054,4,-2.652108,"""C-Z""",-5.304215,0.0,4.715991,-6.042045
5,"""B""","""Z""",0.166297,8,0.332593,"""B-Z""",1.330373,16.629663,4.685746,-4.51945
…,…,…,…,…,…,…,…,…,…,…
1499996,"""A""","""X""",0.166501,5,0.333003,"""A-X""",0.832507,0.166501,4.487273,-4.320771
1499997,"""B""","""Z""",-1.279135,8,-2.558271,"""B-Z""",-10.233083,-127.913539,4.685746,-5.964882
1499998,"""A""","""Y""",-0.966532,6,-1.933065,"""A-Y""",-5.799194,-0.966532,4.487273,-5.453805
1499999,"""C""","""X""",0.837054,5,1.674108,"""C-X""",4.185271,0.0,4.715991,-3.878937


In [36]:
# 自作関数を用いる場合
def my_func(s: str) -> str:
    """Map a single label to its replacement string.

    Intended to be applied element-wise (e.g. via ``map_elements``).

    Args:
        s: The label to convert.

    Returns:
        ``"XXX"`` for ``"X"``, ``"YY"`` for ``"Y"``, ``"ZZZZ"`` for ``"Z"``,
        and ``"bar"`` for any other value.
    """
    if s == "X":
        return "XXX"
    if s == "Y":
        return "YY"
    if s == "Z":
        return "ZZZZ"
    # Fallback for unknown labels. The previous version evaluated the bare
    # string "bar" without returning it, so this path yielded None.
    return "bar"

In [37]:
all_data.with_columns(
    pl.col("group2").map_elements(my_func, return_dtype=pl.String)
)

id,group1,group2,value,weight
i64,str,str,f64,i64
1,"""C""","""XXX""",0.875341,5
2,"""C""","""ZZZZ""",-1.274481,4
3,"""A""","""ZZZZ""",0.29918,1
4,"""C""","""ZZZZ""",-1.326054,4
5,"""B""","""ZZZZ""",0.166297,8
…,…,…,…,…
1499996,"""A""","""XXX""",0.166501,5
1499997,"""B""","""ZZZZ""",-1.279135,8
1499998,"""A""","""YY""",-0.966532,6
1499999,"""C""","""XXX""",0.837054,5


### 縦長データ <-> 横長データの変換
* 縦長データ -> 横長データ
    * `pivot()`
* 横長データ -> 縦長データ
    * `unpivot()`

In [38]:
sample_data = pl.DataFrame(
    {
        "id": [1, 1, 1, 2, 2, 2],
        "cat": ["A", "B", "C", "A", "B", "C"],
        "value1": [10, 12, 14, 16, 18, 20],
        "value2": [1, 2, 4, 6, 8, 10],
    }
)
sample_data

id,cat,value1,value2
i64,str,i64,i64
1,"""A""",10,1
1,"""B""",12,2
1,"""C""",14,4
2,"""A""",16,6
2,"""B""",18,8
2,"""C""",20,10


In [39]:
sample_data.pivot(on="cat", index="id", values=["value1", "value2"])

id,value1_A,value1_B,value1_C,value2_A,value2_B,value2_C
i64,i64,i64,i64,i64,i64,i64
1,10,12,14,1,2,4
2,16,18,20,6,8,10


In [40]:
# ピボット解除
sample_data.unpivot(on=["value1", "value2"], index=["id", "cat"])

id,cat,variable,value
i64,str,str,i64
1,"""A""","""value1""",10
1,"""B""","""value1""",12
1,"""C""","""value1""",14
2,"""A""","""value1""",16
2,"""B""","""value1""",18
…,…,…,…
1,"""B""","""value2""",2
1,"""C""","""value2""",4
2,"""A""","""value2""",6
2,"""B""","""value2""",8


In [41]:
sample_data2 = pl.DataFrame(
    {
        "id": [1, 1, 1, 2, 2, 2],
        "cat": ["A", "A", "C", "A", "B", "C"],
        "value1": [10, -10, 14, 16, 18, 20],
        "value2": [1, 2, 4, 6, 8, 10],
    }
)
sample_data2

id,cat,value1,value2
i64,str,i64,i64
1,"""A""",10,1
1,"""A""",-10,2
1,"""C""",14,4
2,"""A""",16,6
2,"""B""",18,8
2,"""C""",20,10


In [42]:
# 重複がある場合， `aggregate_function`に引数を与える
# 計算結果ではなく，どんな値が入っているかを確かめたいときは， `pl.element()`とすると，値のリストが得られる
sample_data2.pivot(on="cat", index="id", values=["value1", "value2"], aggregate_function=pl.element())

id,value1_A,value1_C,value1_B,value2_A,value2_C,value2_B
i64,list[i64],list[i64],list[i64],list[i64],list[i64],list[i64]
1,"[10, -10]",[14],,"[1, 2]",[4],
2,[16],[20],[18],[6],[10],[8]
