# データ処理方法 -polars- 
こちらはpolarsのコードを書く．

## ライブラリのインポート

In [1]:
import polars as pl

In [2]:
print(pl.__version__)

1.3.0


## データの読み込み
今回は3種類のデータを用意
* データ1
    * 100万行4列
* データ2
    * 50万行4列
    * データ1と同じ列
* 重みデータ
    * データ1とデータ2に存在する2つの列("group1", "group2")の各値に対する重み値を保存しているデータ

In [116]:
%%timeit
# NOTE(review): this cell runs under %%timeit, and names assigned in a timed
# cell are presumably not kept in the notebook namespace -- confirm that data1
# is also loaded once without the magic before the later cells use it.
data1 = pl.read_csv("./sample_data/data1.csv")

281 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
data2 = pl.read_csv("./sample_data/data2.csv")
weight_data = pl.read_excel("./sample_data/weight_data.xlsx")

## データの確認

* サイズの確認
    * `shape`
* データの表示
    * `head()`
    * `tail()`
    * `sample()`
* 要約量の表示
    * `describe()`
* グループごとに計算
    * `group_by()`

In [39]:
data1.shape, data2.shape, weight_data.shape

((1000000, 4), (500000, 4), (9, 3))

In [6]:
# 先頭から10行を表示
data1.head(n=10)

id,group1,group2,value
i64,str,str,f64
1,"""B""","""Y""",0.177553
2,"""A""","""X""",-0.356422
3,"""A""","""Y""",-0.722236
4,"""A""","""X""",0.631321
5,"""C""","""X""",-0.667093
6,"""C""","""Y""",-0.400045
7,"""C""","""Y""",-0.113321
8,"""A""","""Y""",-1.529201
9,"""B""","""Y""",0.387957
10,"""A""","""Y""",-0.191305


In [46]:
# ランダムに5行表示
data2.sample(n=5)

id,group1,group2,value
i64,str,str,f64
1361753,"""B""","""Y""",-0.055159
1392428,"""C""","""Z""",0.88846
1067036,"""C""","""X""",-0.196577
1483125,"""B""","""Y""",-0.347477
1218072,"""A""","""Z""",2.010337


In [48]:
weight_data.tail(n=9)

group1,group2,weight
str,str,i64
"""A""","""X""",5
"""A""","""Y""",6
"""A""","""Z""",1
"""B""","""X""",4
"""B""","""Y""",9
"""B""","""Z""",8
"""C""","""X""",5
"""C""","""Y""",4
"""C""","""Z""",4


In [9]:
# 要約量を表示する
data1.describe()

statistic,id,group1,group2,value
str,f64,str,str,f64
"""count""",1000000.0,"""967878""","""967614""",1000000.0
"""null_count""",0.0,"""32122""","""32386""",0.0
"""mean""",500000.5,,,-0.00055
"""std""",288675.278932,,,1.000681
"""min""",1.0,"""A""","""X""",-4.981772
"""25%""",250001.0,,,-0.674317
"""50%""",500001.0,,,-0.001001
"""75%""",750000.0,,,0.673354
"""max""",1000000.0,"""C""","""Z""",4.710577


## データ操作
ここでの操作方法は後の前処理でたくさん使われる．
* 列の操作
    * `select()`
* 行の操作
    * `filter()`
* グループ化
    * `group_by()`

### 列の操作

In [10]:
# すべての列を表示
data1.select(pl.col("*"))

id,group1,group2,value
i64,str,str,f64
1,"""B""","""Y""",0.177553
2,"""A""","""X""",-0.356422
3,"""A""","""Y""",-0.722236
4,"""A""","""X""",0.631321
5,"""C""","""X""",-0.667093
…,…,…,…
999996,"""C""","""Y""",-0.213528
999997,"""C""","""Y""",0.940286
999998,"""C""","""Z""",-0.449927
999999,"""B""","""Y""",0.411539


In [11]:
# 正規表現で列を抽出する
data1.select(pl.col("^gro.*$"))

group1,group2
str,str
"""B""","""Y"""
"""A""","""X"""
"""A""","""Y"""
"""A""","""X"""
"""C""","""X"""
…,…
"""C""","""Y"""
"""C""","""Y"""
"""C""","""Z"""
"""B""","""Y"""


In [23]:
# ある列以外を表示する
data1.select(pl.exclude("^gro.*$"))

id,value
i64,f64
1,0.177553
2,-0.356422
3,-0.722236
4,0.631321
5,-0.667093
…,…
999996,-0.213528
999997,0.940286
999998,-0.449927
999999,0.411539


In [21]:
# Count how many times each value appears in the group1 and group2 columns.
# Each column is counted independently and returned as a struct column of
# (value, count) pairs.
data1.select(pl.col("group1", "group2").value_counts())

group1,group2
struct[2],struct[2]
"{""B"",322823}","{""Z"",322769}"
"{""A"",322416}","{""Y"",321854}"
"{""C"",322639}","{""X"",322991}"
"{null,32122}","{null,32386}"


### 行の操作

In [12]:
# 行の抽出
data1.filter(pl.col("group1") == "A")

id,group1,group2,value
i64,str,str,f64
2,"""A""","""X""",-0.356422
3,"""A""","""Y""",-0.722236
4,"""A""","""X""",0.631321
8,"""A""","""Y""",-1.529201
10,"""A""","""Y""",-0.191305
…,…,…,…
999981,"""A""","""X""",-2.077742
999983,"""A""","""Y""",-0.064892
999986,"""A""","""Y""",0.551829
999989,"""A""","""Y""",1.149937


In [15]:
# 複数条件を使う
data1.filter((pl.col("group1") == "A") & (pl.col("group2") == "X"))
# & の場合は `,` で書くこともできる
# data1.filter(pl.col("group1") == "A", pl.col("group2") == "X")
# OR の場合 `&` ではなく， `|` を使う
# data1.filter((pl.col("group1") == "A") | (pl.col("group2") == "X"))

id,group1,group2,value
i64,str,str,f64
2,"""A""","""X""",-0.356422
4,"""A""","""X""",0.631321
14,"""A""","""X""",-1.112415
24,"""A""","""X""",-1.574322
28,"""A""","""X""",0.766734
…,…,…,…
999954,"""A""","""X""",1.128896
999956,"""A""","""X""",-0.249153
999962,"""A""","""X""",-2.275963
999976,"""A""","""X""",0.208641


In [16]:
# 欠損値を表示させる
# group1, group2が共に欠損の行を抽出する
data1.filter(pl.col("group1").is_null(), pl.col("group2").is_null())

id,group1,group2,value
i64,str,str,f64
2055,,,0.4324
2494,,,-0.3083
3585,,,-1.197981
4574,,,-0.401393
4690,,,1.133604
…,…,…,…
995452,,,-1.247216
995544,,,-1.834185
995580,,,-0.401569
996032,,,-0.711723


### グループ化

In [36]:
data1.group_by("group1").agg(pl.mean("value"))

group1,value
str,f64
"""C""",-0.001734
,0.005503
"""B""",-0.000965
"""A""",0.000446


## 前処理

* データフレームの結合
* 欠損値処理
* 新しい列の作成
* 縦長データ <-> 横長データの変換

### データフレームの結合
* 縦に結合する
    * `vstack()`
* 横に結合する
    * `join()`

In [50]:
# 縦に結合
# data1の下にdata2を結合し，新しいデータフレームを作成する
data12 = data1.vstack(data2)
data12.shape

(1500000, 4)

In [51]:
data12.head()

id,group1,group2,value
i64,str,str,f64
1,"""B""","""Y""",0.177553
2,"""A""","""X""",-0.356422
3,"""A""","""Y""",-0.722236
4,"""A""","""X""",0.631321
5,"""C""","""X""",-0.667093


In [49]:
weight_data.head(n=10)

group1,group2,weight
str,str,i64
"""A""","""X""",5
"""A""","""Y""",6
"""A""","""Z""",1
"""B""","""X""",4
"""B""","""Y""",9
"""B""","""Z""",8
"""C""","""X""",5
"""C""","""Y""",4
"""C""","""Z""",4


In [52]:
# 横に結合する
# キーはgroup1列，group2列同士で行う
all_data = data12.join(other=weight_data,
                       left_on=["group1", "group2"],
                       right_on=["group1", "group2"],
                       how="left")

In [59]:
all_data.shape

(1500000, 5)

In [57]:
# データを表示
all_data.sample(n=5)

id,group1,group2,value,weight
i64,str,str,f64,i64
893922,"""B""","""X""",1.519066,4
1295541,"""A""","""X""",-0.86245,5
1373044,"""B""","""X""",-0.353661,4
487078,"""B""","""Z""",-0.586006,8
1073530,"""A""","""X""",0.917702,5


In [60]:
# 欠損値を持つ行を表示．
# 結合方法を左結合にしたため，左で欠損を持つ行(厳密には，右側のデータにない結合キー)も残っている
all_data.filter(pl.col("group1").is_null())

id,group1,group2,value,weight
i64,str,str,f64,i64
34,,"""Z""",0.232565,
61,,"""X""",-0.725587,
87,,"""X""",-1.782229,
97,,"""Z""",1.223954,
99,,"""Z""",1.494806,
…,…,…,…,…
999867,,"""Z""",0.530382,
999874,,"""X""",-0.206841,
999911,,"""X""",-0.581071,
999952,,"""Z""",-1.478041,


### 欠損値処理
* 欠損値の除去
    * `drop_nulls()`
* 欠損値の補完
    * `fill_null()`

ここでは，分かりやすさのために，少数データで確認する

In [81]:
sample_data = pl.DataFrame(
    {
        "a": [None, None, None, 3],
        "b": [1, 2, None, 1],
        "c": [1, None, None, 1],
    }
)
sample_data

a,b,c
i64,i64,i64
,1,1
,2,
,,
3,1,1


In [82]:
# Drop the rows where column b is null
sample_data.drop_nulls("b")

a,b,c
i64,i64,i64
,1,1
,2,
3,1,1


In [83]:
# 1列でも欠損値を持つ行を削除する
sample_data.drop_nulls()

a,b,c
i64,i64,i64
3,1,1


In [84]:
# すべての列の値が空白である行を削除する
sample_data.filter(~pl.all_horizontal(pl.all().is_null()))

a,b,c
i64,i64,i64
,1,1
,2,
3,1,1


In [85]:
# 全ての行が欠損である列を削除する
sample_data2 = pl.DataFrame(
    {
        "a": [None, None, None],
        "b": [1, 2, None],
        "c": [1, None, None],
    }
)
print(sample_data2)
sample_data2[[s.name for s in sample_data2 if not (s.null_count() == sample_data2.height)]]

shape: (3, 3)
┌──────┬──────┬──────┐
│ a    ┆ b    ┆ c    │
│ ---  ┆ ---  ┆ ---  │
│ null ┆ i64  ┆ i64  │
╞══════╪══════╪══════╡
│ null ┆ 1    ┆ 1    │
│ null ┆ 2    ┆ null │
│ null ┆ null ┆ null │
└──────┴──────┴──────┘


b,c
i64,i64
1,1
2,
,


In [86]:
# 欠損値の補完
sample_data.fill_null(-99)

a,b,c
i64,i64,i64
-99,1,1
-99,2,-99
-99,-99,-99
3,1,1


In [89]:
sample_data.fill_null(strategy="max")

a,b,c
i64,i64,i64
3,1,1
3,2,1
3,2,1
3,1,1


In [98]:
# 今後の処理で欠損値があると良くないので，最初のデータの欠損値を削除する
all_data = all_data.drop_nulls()

### 新しい列の作成
* 列の作成
    * `with_columns()`
    * かなり自由度が高く，四則演算や文字列の扱い，条件分岐もデフォルトの関数で使用可能．
    * 定義済みの関数であるためパフォーマンスが非常に高い
    * もちろん，自作関数も使用できるが，パフォーマンスが落ちるため，可能な限り定義済み関数を用いる
    * 自作関数を用いる場合，`map_elements(my_func, return_dtype=pl.xxx)`を用いる

In [99]:
all_data.with_columns(
    # 四則演算や文字列の扱い
    (pl.col("value")*2).alias("value_x2"),
    (pl.col("group1") + "-" + pl.col("group2")).alias("group1_2"),
    (pl.col("value") * pl.col("weight")).alias("value_x_weight"),
).with_columns(
    # 条件分岐
    pl.when(pl.col("group1") == "A")
    .then(pl.col("value"))
    .when(pl.col("group1") == "B")
    .then(pl.col("value") * 100)
    .otherwise(0)
    .alias("foo")
).with_columns(
    # グループごとに統計量を求め，その値を各行に返すことも可能
    pl.col("value").max().over("group1").alias("group1_max")
).with_columns(
    (pl.col("value") - pl.col("group1_max")).alias("value_diff_max_group1")
)

id,group1,group2,value,weight,value_x2,group1_2,value_x_weight,foo,group1_max,value_diff_max_group1
i64,str,str,f64,i64,f64,str,f64,f64,f64,f64
1,"""B""","""Y""",0.177553,9,0.355106,"""B-Y""",1.597975,17.755279,4.710577,-4.533024
2,"""A""","""X""",-0.356422,5,-0.712843,"""A-X""",-1.782109,-0.356422,4.487273,-4.843695
3,"""A""","""Y""",-0.722236,6,-1.444472,"""A-Y""",-4.333415,-0.722236,4.487273,-5.209509
4,"""A""","""X""",0.631321,5,1.262642,"""A-X""",3.156605,0.631321,4.487273,-3.855952
5,"""C""","""X""",-0.667093,5,-1.334186,"""C-X""",-3.335465,0.0,4.478096,-5.145189
…,…,…,…,…,…,…,…,…,…,…
1499996,"""A""","""X""",0.166501,5,0.333003,"""A-X""",0.832507,0.166501,4.487273,-4.320771
1499997,"""B""","""Z""",-1.279135,8,-2.558271,"""B-Z""",-10.233083,-127.913539,4.710577,-5.989712
1499998,"""A""","""Y""",-0.966532,6,-1.933065,"""A-Y""",-5.799194,-0.966532,4.487273,-5.453805
1499999,"""C""","""X""",0.837054,5,1.674108,"""C-X""",4.185271,0.0,4.478096,-3.641042


In [100]:
# 自作関数を用いる場合
def my_func(s: str) -> str:
    """Return the replacement string for a single label.

    Args:
        s: The label to convert.

    Returns:
        ``"XXX"`` for ``"X"``, ``"YY"`` for ``"Y"``, ``"ZZZZ"`` for ``"Z"``,
        and ``"bar"`` for any other value.
    """
    if s == "X":
        return "XXX"
    if s == "Y":
        return "YY"
    if s == "Z":
        return "ZZZZ"
    # Fallback for unmatched labels; the original evaluated "bar" without
    # returning it, so the function silently produced None here.
    return "bar"

In [101]:
all_data.with_columns(
    pl.col("group2").map_elements(my_func, return_dtype=pl.String)
)

id,group1,group2,value,weight
i64,str,str,f64,i64
1,"""B""","""YY""",0.177553,9
2,"""A""","""XXX""",-0.356422,5
3,"""A""","""YY""",-0.722236,6
4,"""A""","""XXX""",0.631321,5
5,"""C""","""XXX""",-0.667093,5
…,…,…,…,…
1499996,"""A""","""XXX""",0.166501,5
1499997,"""B""","""ZZZZ""",-1.279135,8
1499998,"""A""","""YY""",-0.966532,6
1499999,"""C""","""XXX""",0.837054,5


### 縦長データ <-> 横長データの変換
* 縦長データ -> 横長データ
    * `pivot()`
* 横長データ -> 縦長データ
    * `unpivot()`

In [106]:
sample_data = pl.DataFrame(
    {
        "id": [1, 1, 1, 2, 2, 2],
        "cat": ["A", "B", "C", "A", "B", "C"],
        "value1": [10, 12, 14, 16, 18, 20],
        "value2": [1, 2, 4, 6, 8, 10],
    }
)
sample_data

id,cat,value1,value2
i64,str,i64,i64
1,"""A""",10,1
1,"""B""",12,2
1,"""C""",14,4
2,"""A""",16,6
2,"""B""",18,8
2,"""C""",20,10


In [107]:
sample_data.pivot(on="cat", index="id", values=["value1", "value2"])

id,value1_A,value1_B,value1_C,value2_A,value2_B,value2_C
i64,i64,i64,i64,i64,i64,i64
1,10,12,14,1,2,4
2,16,18,20,6,8,10


In [114]:
# ピボット解除
sample_data.unpivot(on=["value1", "value2"], index=["id", "cat"])

id,cat,variable,value
i64,str,str,i64
1,"""A""","""value1""",10
1,"""B""","""value1""",12
1,"""C""","""value1""",14
2,"""A""","""value1""",16
2,"""B""","""value1""",18
…,…,…,…
1,"""B""","""value2""",2
1,"""C""","""value2""",4
2,"""A""","""value2""",6
2,"""B""","""value2""",8


In [110]:
# Same layout as the previous sample, except that the pair (id=1, cat="A")
# appears twice, so a pivot on "cat" has duplicate entries to aggregate.
# The second value1 entry is 12 so that the data matches the outputs recorded
# for this cell and for the pivot cell that follows.
sample_data2 = pl.DataFrame(
    {
        "id": [1, 1, 1, 2, 2, 2],
        "cat": ["A", "A", "C", "A", "B", "C"],
        "value1": [10, 12, 14, 16, 18, 20],
        "value2": [1, 2, 4, 6, 8, 10],
    }
)
sample_data2

id,cat,value1,value2
i64,str,i64,i64
1,"""A""",10,1
1,"""A""",12,2
1,"""C""",14,4
2,"""A""",16,6
2,"""B""",18,8
2,"""C""",20,10


In [113]:
# When the pivot has duplicate entries, pass an `aggregate_function` argument
# To inspect which values are present rather than a computed result, pass `pl.element()`; each cell then holds a list of the values
sample_data2.pivot(on="cat", index="id", values=["value1", "value2"], aggregate_function=pl.element())

id,value1_A,value1_C,value1_B,value2_A,value2_C,value2_B
i64,list[i64],list[i64],list[i64],list[i64],list[i64],list[i64]
1,"[10, 12]",[14],,"[1, 2]",[4],
2,[16],[20],[18],[6],[10],[8]
