# Polarsの継続表現手法
直訳で継続表現としているが、`pl.Expr`に続けて呼び出すメソッドのことを指していると思われる。  
なので、本notebookでは`pl.Expr`に対して使用可能な様々なメソッドを色々試してみる

In [1]:
import math
import numpy as np
import os

import polars as pl

## 定数定義

In [2]:
# Relative path to the shared data directory, two levels up.
DATA_PAR_PATH = os.path.join('..', '..', 'data')
# Penguins CSV located inside the data directory.
INPUT_CSV_PATH_PENGUINS = os.path.join(DATA_PAR_PATH, 'penguins.csv')

## 継続表現の例

In [3]:
# Seeded generator so the random draw below is reproducible.
rng = np.random.default_rng(seed=1729)

# Plain Python / NumPy values, printed with the self-documenting f-string form.
print(f"{math.pi = }")
print(f"{rng.random() = }")

math.pi = 3.141592653589793
rng.random() = 0.03074202960516803


In [4]:
# Read the penguins CSV (the string 'NA' is parsed as null) and keep the
# descriptive columns plus `mass`, i.e. body_mass_g divided by 1000
# (presumably grams -> kilograms).
penguins = pl.read_csv(INPUT_CSV_PATH_PENGUINS, null_values='NA').select(
    'species',
    'island',
    'sex',
    'year',
    (pl.col('body_mass_g') / 1000).alias('mass'),
)

# Display the frame with two derived columns: the square root of `mass`
# and `mass` with its nulls filled by interpolation.
penguins.with_columns(
    mass_sqrt=pl.col('mass').sqrt(),
    mass_filled=pl.col('mass').interpolate(),
)

species,island,sex,year,mass,mass_sqrt,mass_filled
str,str,str,i64,f64,f64,f64
"""Adelie""","""Torgersen""","""male""",2007,3.75,1.936492,3.75
"""Adelie""","""Torgersen""","""female""",2007,3.8,1.949359,3.8
"""Adelie""","""Torgersen""","""female""",2007,3.25,1.802776,3.25
"""Adelie""","""Torgersen""",,2007,,,3.35
"""Adelie""","""Torgersen""","""female""",2007,3.45,1.857418,3.45
…,…,…,…,…,…,…
"""Chinstrap""","""Dream""","""male""",2009,4.0,2.0,4.0
"""Chinstrap""","""Dream""","""female""",2009,3.4,1.843909,3.4
"""Chinstrap""","""Dream""","""male""",2009,3.775,1.942936,3.775
"""Chinstrap""","""Dream""","""male""",2009,4.1,2.024846,4.1


In [5]:
# One-row summary: mean of `mass` and the first mode of `island`.
penguins.select([
    pl.col('mass').mean(),
    pl.col('island').mode().first(),
])

mass,island
f64,str
4.201754,"""Biscoe"""


In [6]:
# Distinct values of the `island` column.
penguins.select(pl.col('island').unique())

island
str
"""Biscoe"""
"""Dream"""
"""Torgersen"""


In [7]:
# Chain several expression methods on the distinct species names:
# repeat each one 3000 times (yielding lists), flatten the lists back into
# a plain column with explode(), then append a single constant value.
penguins.select(
    pl.col('species').unique().repeat_by(3000).explode().extend_constant('Saiyan', n=1)
)

species
str
"""Adelie"""
"""Adelie"""
"""Adelie"""
"""Adelie"""
"""Adelie"""
…
"""Chinstrap"""
"""Chinstrap"""
"""Chinstrap"""
"""Chinstrap"""


## 要素ごとの操作

In [8]:
# Element-wise math functions on a small float column.
# Original author's note: every value must have the same type, otherwise
# building the frame errors (all values here are floats).
pl.DataFrame({'x': [-2.0, 0.0, 0.5, 1.0, math.e, 1000.0]}).with_columns(
    pl.col('x').abs().alias('abs'),
    pl.col('x').exp().alias('exp'),
    pl.col('x').log(2).alias('log2'),
    pl.col('x').log10().alias('log10'),
    pl.col('x').log1p().alias('log1p'),
    pl.col('x').sign().alias('sign'),
    pl.col('x').sqrt().alias('sqrt'),
)

x,abs,exp,log2,log10,log1p,sign,sqrt
f64,f64,f64,f64,f64,f64,f64,f64
-2.0,2.0,0.135335,,,,-1.0,
0.0,0.0,1.0,-inf,-inf,0.0,0.0,0.0
0.5,0.5,1.648721,-1.0,-0.30103,0.405465,1.0,0.707107
1.0,1.0,2.718282,0.0,0.0,0.693147,1.0,1.0
2.718282,2.718282,15.154262,1.442695,0.434294,1.313262,1.0,1.648721
1000.0,1000.0,inf,9.965784,3.0,6.908755,1.0,31.622777


In [9]:
# Trigonometric expression methods applied element-wise.
# The sample mixes radian-like inputs (multiples of pi) with degree-like
# inputs (90, 180, 360); every method is applied to all of them.
(
    # All literals are floats (1.0 rather than 1) so the list is homogeneous.
    pl.DataFrame({'x': [-math.pi, 0.0, 1.0, math.pi, 2 * math.pi, 90.0, 180.0, 360.0]})
    .with_columns(
        arccos=pl.col('x').arccos(),
        cos=pl.col('x').cos(),
        degrees=pl.col('x').degrees(),  # column name fixed (was misspelled 'defrees')
        radians=pl.col('x').radians(),
        sin=pl.col('x').sin(),
    )
)

x,arccos,cos,degrees,radians,sin
f64,f64,f64,f64,f64,f64
-3.141593,,-1.0,-180.0,-0.054831,-1.2246e-16
0.0,1.570796,1.0,0.0,0.0,0.0
1.0,0.0,0.540302,57.29578,0.017453,0.841471
3.141593,,-1.0,180.0,0.054831,1.2246e-16
6.283185,,1.0,360.0,0.109662,-2.4493e-16
90.0,,-0.448074,5156.620156,1.570796,0.893997
180.0,,-0.59846,10313.240312,3.141593,-0.801153
360.0,,-0.283691,20626.480625,6.283185,0.958916


In [10]:
# Rounding and binning expression methods.
(
    pl.DataFrame({'x': [-6.0, -0.5, 0.0, 0.5, math.pi, 9.9, 9.99, 9.999]})
    .with_columns(
        ceil=pl.col('x').ceil(),
        clip=pl.col('x').clip(-1, 1),
        # `labels` must be an argument of cut(); placed outside the call it was
        # handed to with_columns and turned into its own list[str] column.
        cut=pl.col('x').cut([-1, 1], labels=['bad', 'neutral', 'good']),
        floor=pl.col('x').floor(),
        qcut=pl.col('x').qcut([0.5], labels=['below median', 'above median']),
        round2=pl.col('x').round(2),
        round0=pl.col('x').round(0),
    )
)

x,ceil,clip,cut,floor,qcut,round2,round0
f64,f64,f64,cat,f64,cat,f64,f64
-6.0,-6.0,-1.0,"""bad""",-6.0,"""below median""",-6.0,-6.0
-0.5,-0.0,-0.5,"""neutral""",-1.0,"""below median""",-0.5,-1.0
0.0,0.0,0.0,"""neutral""",0.0,"""below median""",0.0,0.0
0.5,1.0,0.5,"""neutral""",0.0,"""below median""",0.5,1.0
3.141593,4.0,1.0,"""good""",3.0,"""above median""",3.14,3.0
9.9,10.0,1.0,"""good""",9.0,"""above median""",9.9,10.0
9.99,10.0,1.0,"""good""",9.0,"""above median""",9.99,10.0
9.999,10.0,1.0,"""good""",9.0,"""above median""",10.0,10.0


In [11]:
# Sample holding a regular value, NaN, a missing value (None) and both infinities.
x = [42.0, math.nan, None, math.inf, -math.inf]

# Compare the NaN / null fillers and the matching predicates side by side.
pl.DataFrame({'x': x}).with_columns(
    pl.col('x').fill_nan(999).alias('fill_nan'),
    pl.col('x').fill_null(0).alias('fill_null'),
    pl.col('x').is_finite().alias('is_finite'),
    pl.col('x').is_infinite().alias('is_infinite'),
    pl.col('x').is_nan().alias('is_nan'),
    pl.col('x').is_null().alias('is_null'),
)

x,fill_nan,fill_null,is_finite,is_infinite,is_nan,is_null
f64,f64,f64,bool,bool,bool,bool
42.0,42.0,42.0,True,False,False,False
,999.0,,False,False,True,False
,,0.0,,,,True
inf,inf,inf,False,True,False,False
-inf,-inf,-inf,False,True,False,False


`nan`と`null`は別物だよ、ってことを再認識。  
あと、`.is_finite() / .is_infinite()`があるの面白いなぁ。でも、`.fill_finite() / .fill_infinite()`は用意されてないんだね

In [12]:
# Handle NaN and null together: chain both fillers, and OR both predicates.
# Reuses the list `x` defined in the previous cell.
pl.DataFrame({'x': x}).with_columns(
    pl.col('x').fill_nan(0).fill_null(0).alias('fill_both'),
    (pl.col('x').is_nan() | pl.col('x').is_null()).alias('is_either'),
)

x,fill_both,is_either
f64,f64,bool
42.0,42.0,False
,0.0,True
,0.0,True
inf,inf,False
-inf,-inf,False


## その他の操作

In [13]:
# Miscellaneous element-wise operations on a string column:
# seeded hashing, repeating each value into a list, and dictionary-based replacement.
pl.DataFrame({'x': ['here', 'there', 'their', "they're"]}).with_columns(
    pl.col('x').hash(seed=1337).alias('hash'),
    pl.col('x').repeat_by(3).alias('repeat_by'),
    pl.col('x').replace({'here': 'there', "they're": 'they are'}).alias('replace'),
)

x,hash,repeat_by,replace
str,u64,list[str],str
"""here""",12695211751326448172,"[""here"", ""here"", ""here""]","""there"""
"""there""",17329794691236705436,"[""there"", ""there"", ""there""]","""there"""
"""their""",2663095961041830581,"[""their"", ""their"", ""their""]","""their"""
"""they're""",6743063676290245144,"[""they're"", ""they're"", ""they're""]","""they are"""


`.replace()`あるなら、さっきの`.fill_finite() / .fill_infinite()`的な操作もこれでできるな

## Series型の操作

In [14]:
# Cumulative and difference operations on a column containing both a null
# and a NaN. Note that cum_prod is computed with reverse=True, unlike the
# other cumulative columns.
pl.DataFrame({'x': [0.0, 1.0, 2.0, None, 2.0, np.nan, -1.0, 2.0]}).with_columns(
    pl.col('x').cum_count().alias('cum_count'),
    pl.col('x').cum_max().alias('cum_max'),
    pl.col('x').cum_min().alias('cum_min'),
    pl.col('x').cum_prod(reverse=True).alias('cum_prod'),
    pl.col('x').cum_sum().alias('cum_sum'),
    pl.col('x').diff().alias('diff'),
    pl.col('x').pct_change().alias('pct_change'),
)

x,cum_count,cum_max,cum_min,cum_prod,cum_sum,diff,pct_change
f64,u32,f64,f64,f64,f64,f64,f64
0.0,1,0.0,0.0,,0.0,,
1.0,2,1.0,0.0,,1.0,1.0,inf
2.0,3,2.0,0.0,,3.0,1.0,1.0
,3,,,,,,0.0
2.0,4,2.0,0.0,,5.0,,0.0
,5,2.0,0.0,,,,
-1.0,6,2.0,-1.0,-2.0,,,
2.0,7,2.0,-1.0,2.0,,3.0,-3.0


`cum_prod`列でNaNが変な気がしたが、`reverse=True`を指定しているので累積の計算順が逆転し（=後ろから計算）、こんな結果になっているのか。

## 補完とシフト操作

In [15]:
# Filling, interpolation and shifting on a column with two consecutive
# nulls and one NaN.
pl.DataFrame(
    {'x': [-1.0, 0.0, 1.0, None, None, 3.0, 4.0, math.nan, 6.0]}
).with_columns(
    pl.col('x').backward_fill().alias('backward_fill'),
    pl.col('x').forward_fill(limit=1).alias('forward_fill'),  # fill at most one null per gap
    pl.col('x').interpolate(method='linear').alias('interp1'),
    pl.col('x').interpolate(method='nearest').alias('interp2'),
    pl.col('x').shift(1).alias('shift1'),
    pl.col('x').shift(-2).alias('shift2'),
)

x,backward_fill,forward_fill,interp1,interp2,shift1,shift2
f64,f64,f64,f64,f64,f64,f64
-1.0,-1.0,-1.0,-1.0,-1.0,,1.0
0.0,0.0,0.0,0.0,0.0,-1.0,
1.0,1.0,1.0,1.0,1.0,0.0,
,3.0,1.0,1.666667,1.0,1.0,3.0
,3.0,,2.333333,3.0,,4.0
3.0,3.0,3.0,3.0,3.0,,
4.0,4.0,4.0,4.0,4.0,3.0,6.0
,,,,,4.0,
6.0,6.0,6.0,6.0,6.0,,


`NaN`はfillの対象外になるので、補完されない

## 重複値の操作

In [16]:
# Boolean flags describing duplicated values ('C' appears twice).
pl.DataFrame({'x': ['A', 'C', 'D', 'C']}).with_columns(
    pl.col('x').is_duplicated().alias('is_duplicated'),
    pl.col('x').is_first_distinct().alias('is_first_distinct'),
    pl.col('x').is_last_distinct().alias('is_last_distinct'),
    pl.col('x').is_unique().alias('is_unique'),
)

x,is_duplicated,is_first_distinct,is_last_distinct,is_unique
str,bool,bool,bool,bool
"""A""",False,True,True,True
"""C""",True,True,False,False
"""D""",False,True,True,True
"""C""",True,False,True,False


## ローリング統計の操作

やる量が多過ぎるので、一旦このnotebookはスキップする。あとでどっかのタイミングで戻ってくる。  
真ん中らへんから再開予定