# Polarsの表現手法

In [1]:
from datetime import date
import os

import polars as pl

## 定数定義

In [2]:
DATA_PAR_PATH = os.path.join('..','..','data')
INPUT_CSV_PATH_FRUIT = os.path.join(DATA_PAR_PATH,'fruit.csv')

## 表現手法のあれこれ

### 表現手法

In [3]:
fruit = pl.read_csv(INPUT_CSV_PATH_FRUIT)
fruit

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Avocado""",200,"""green""",False,"""South America"""
"""Banana""",120,"""yellow""",False,"""Asia"""
"""Blueberry""",1,"""blue""",False,"""North America"""
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Cranberry""",2,"""red""",False,"""North America"""
"""Elderberry""",1,"""black""",False,"""Europe"""
"""Orange""",130,"""orange""",True,"""Asia"""
"""Papaya""",1000,"""orange""",False,"""South America"""
"""Peach""",150,"""orange""",True,"""Asia"""
"""Watermelon""",5000,"""green""",True,"""Africa"""


In [4]:
fruit.select(
    pl.col('name'),  # 既存のカラム名を指定
    pl.col('^.*or.*$'),  # 正規表現によってカラム名を指定
    pl.col('weight') / 1000,  # 計算処理も一括で
    'is_round'  # 既存のカラム名を指定（ただし、計算式は適用できない）
)

name,color,origin,weight,is_round
str,str,str,f64,bool
"""Avocado""","""green""","""South America""",0.2,False
"""Banana""","""yellow""","""Asia""",0.12,False
"""Blueberry""","""blue""","""North America""",0.001,False
"""Cantaloupe""","""orange""","""Africa""",2.5,True
"""Cranberry""","""red""","""North America""",0.002,False
"""Elderberry""","""black""","""Europe""",0.001,False
"""Orange""","""orange""","""Asia""",0.13,True
"""Papaya""","""orange""","""South America""",1.0,False
"""Peach""","""orange""","""Asia""",0.15,True
"""Watermelon""","""green""","""Africa""",5.0,True


カラム名を指定するときは、直接文字列で指定しても構わないが、計算処理をしたい時には`pl.col(カラム名)`という具合にしていする必要がある点に注意。  
迷ったら、一旦`pl.col(カラム名)`で指定するようにクセをつけておこう

In [5]:
fruit.with_columns(
    pl.lit(True).alias('is_fruit'),  # `is_fruit`列を全てTrueで追加
    pl.col('name').str.ends_with('berry').alias('is_berry')  # `name`列の末尾がberryの文字列があるか
)

name,weight,color,is_round,origin,is_fruit,is_berry
str,i64,str,bool,str,bool,bool
"""Avocado""",200,"""green""",False,"""South America""",True,False
"""Banana""",120,"""yellow""",False,"""Asia""",True,False
"""Blueberry""",1,"""blue""",False,"""North America""",True,True
"""Cantaloupe""",2500,"""orange""",True,"""Africa""",True,False
"""Cranberry""",2,"""red""",False,"""North America""",True,True
"""Elderberry""",1,"""black""",False,"""Europe""",True,True
"""Orange""",130,"""orange""",True,"""Asia""",True,False
"""Papaya""",1000,"""orange""",False,"""South America""",True,False
"""Peach""",150,"""orange""",True,"""Asia""",True,False
"""Watermelon""",5000,"""green""",True,"""Africa""",True,False


In [6]:
fruit.filter(
    pl.col('is_round') &
    (pl.col('weight') > 1000)
)

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Watermelon""",5000,"""green""",True,"""Africa"""


pythonの論理演算だと`and`を使うのが普通だし、多言語だとしても`&&`のイメージがあるので、単に`&`なのはちょっと紛らわしいな

In [7]:
fruit.group_by(
    pl.col('origin').str.split(' ').list.last()
).agg(
    pl.len(),
    pl.col('weight').mean().alias('averate_weight')
)

origin,len,averate_weight
str,u32,f64
"""Europe""",1,1.0
"""Asia""",3,133.333333
"""America""",4,300.75
"""Africa""",2,3750.0


polarsでは、選択と集計がひとまとめで実行することができる利点がある

In [8]:
fruit.sort(
    pl.col('name').str.len_bytes(),
    descending=True
)

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Elderberry""",1,"""black""",False,"""Europe"""
"""Watermelon""",5000,"""green""",True,"""Africa"""
"""Blueberry""",1,"""blue""",False,"""North America"""
"""Cranberry""",2,"""red""",False,"""North America"""
"""Avocado""",200,"""green""",False,"""South America"""
"""Banana""",120,"""yellow""",False,"""Asia"""
"""Orange""",130,"""orange""",True,"""Asia"""
"""Papaya""",1000,"""orange""",False,"""South America"""
"""Peach""",150,"""orange""",True,"""Asia"""


In [9]:
(
    pl.DataFrame({'a': [1, 2, 3], 'b': [0.4, 0.5, 0.6]})
    .with_columns(pl.all().mul(10).name.suffix('_times_10'))
)

a,b,a_times_10,b_times_10
i64,f64,i64,f64
1,0.4,10,4.0
2,0.5,20,5.0
3,0.6,30,6.0


`.suffix(末尾文字列)`で、まとめてカラム名の末尾を付与できるのありがたい

In [10]:
pl.all().mul(10).name.suffix('_times_10').meta.has_multiple_outputs()

True

`.meta.has_multiple_outputs()`は、複数のSeriesによって構築されているか否かを判定してくれる

### 表現特性

In [11]:
is_orange = (pl.col('color') == 'orange').alias('is_orange')

fruit.with_columns(is_orange)

name,weight,color,is_round,origin,is_orange
str,i64,str,bool,str,bool
"""Avocado""",200,"""green""",False,"""South America""",False
"""Banana""",120,"""yellow""",False,"""Asia""",False
"""Blueberry""",1,"""blue""",False,"""North America""",False
"""Cantaloupe""",2500,"""orange""",True,"""Africa""",True
"""Cranberry""",2,"""red""",False,"""North America""",False
"""Elderberry""",1,"""black""",False,"""Europe""",False
"""Orange""",130,"""orange""",True,"""Asia""",True
"""Papaya""",1000,"""orange""",False,"""South America""",True
"""Peach""",150,"""orange""",True,"""Asia""",True
"""Watermelon""",5000,"""green""",True,"""Africa""",False


`.alias(列名)`で新規カラムの列名を命名できるの、めちゃくちゃ楽だなぁ、といまさら感じ始めてきた

In [12]:
fruit.filter(is_orange)

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Orange""",130,"""orange""",True,"""Asia"""
"""Papaya""",1000,"""orange""",False,"""South America"""
"""Peach""",150,"""orange""",True,"""Asia"""


In [13]:
fruit.group_by(is_orange).len()

is_orange,len
bool,u32
False,6
True,4


新しいpolarsだと`.count()`じゃなくて`.len()`が推奨されているわけだが、`.count()`のままでよかった気がする..

In [14]:
flowers = pl.DataFrame({
    'name': ['Tiger lily', 'Blue flag', 'African marigold'],
    'latin': ['Lilium columbianum', 'Iris vesicolor', 'Tagetes erecta'],
    'color': ['orange', 'purple', 'orange']
})

flowers.filter(is_orange)

name,latin,color
str,str,str
"""Tiger lily""","""Lilium columbianum""","""orange"""
"""African marigold""","""Tagetes erecta""","""orange"""


なるほどなぁ。  
以前のセルで`is_orange`を定義したけれど、条件式の定義は遅延実行されるのか。なので、別のDataFrameに対して適用した際に、そのDataFrameに合わせた形で適用できるのか。  
pandasだと、1つのDataFrameに対して都度定義する必要があるので、メモリ効率も良くなりそう

### 表現作成

In [15]:
fruit.select(pl.col('color')).columns

['color']

In [16]:
fruit.select(pl.col('^.*or.*$')).columns

['color', 'origin']

In [17]:
fruit.select(pl.all()).columns

['name', 'weight', 'color', 'is_round', 'origin']

In [18]:
fruit.select(pl.col(pl.Boolean, pl.Int64)).columns

['weight', 'is_round']

プリミティブ型でのカラム選択ができるのいいね

In [19]:
fruit.select(pl.col([pl.String, 'is_round'])).columns

TypeError: argument 'dtypes': 'str' is not a Polars data type

上記エラーの通りで、カラム名選択時に文字列型とデータ型を同時に指定することはできない

既存のDataFrameではなく、新規でselectすることも可能

In [20]:
pl.select(pl.lit(42))

literal
i32
42


In [21]:
pl.select(pl.lit(42).alias('answer'))

answer
i32
42


In [22]:
fruit.with_columns(pl.lit('Earth').alias('planet'))

name,weight,color,is_round,origin,planet
str,i64,str,bool,str,str
"""Avocado""",200,"""green""",False,"""South America""","""Earth"""
"""Banana""",120,"""yellow""",False,"""Asia""","""Earth"""
"""Blueberry""",1,"""blue""",False,"""North America""","""Earth"""
"""Cantaloupe""",2500,"""orange""",True,"""Africa""","""Earth"""
"""Cranberry""",2,"""red""",False,"""North America""","""Earth"""
"""Elderberry""",1,"""black""",False,"""Europe""","""Earth"""
"""Orange""",130,"""orange""",True,"""Asia""","""Earth"""
"""Papaya""",1000,"""orange""",False,"""South America""","""Earth"""
"""Peach""",150,"""orange""",True,"""Asia""","""Earth"""
"""Watermelon""",5000,"""green""",True,"""Africa""","""Earth"""


In [23]:
fruit.with_columns(pl.lit([False, True]).alias('row_is_even'))

name,weight,color,is_round,origin,row_is_even
str,i64,str,bool,str,list[bool]
"""Avocado""",200,"""green""",False,"""South America""","[false, true]"
"""Banana""",120,"""yellow""",False,"""Asia""","[false, true]"
"""Blueberry""",1,"""blue""",False,"""North America""","[false, true]"
"""Cantaloupe""",2500,"""orange""",True,"""Africa""","[false, true]"
"""Cranberry""",2,"""red""",False,"""North America""","[false, true]"
"""Elderberry""",1,"""black""",False,"""Europe""","[false, true]"
"""Orange""",130,"""orange""",True,"""Asia""","[false, true]"
"""Papaya""",1000,"""orange""",False,"""South America""","[false, true]"
"""Peach""",150,"""orange""",True,"""Asia""","[false, true]"
"""Watermelon""",5000,"""green""",True,"""Africa""","[false, true]"


In [24]:
pl.select(
    pl.repeat('Ello', 3).alias('hello'),
    pl.zeros(3),
    pl.ones(3)
)

hello,zeros,ones
str,f64,f64
"""Ello""",0.0,1.0
"""Ello""",0.0,1.0
"""Ello""",0.0,1.0


In [25]:
pl.select(
    pl.int_range(0, 5).alias('start'),
    pl.arange(0, 10, 2).pow(2).alias('end')
).with_columns(
    pl.int_ranges('start', 'end').alias('int_range')
).with_columns(
    pl.col('int_range').list.len().alias('range_length')
)

start,end,int_range,range_length
i64,i64,list[i64],u32
0,0,[],0
1,4,"[1, 2, 3]",3
2,16,"[2, 3, … 15]",14
3,36,"[3, 4, … 35]",33
4,64,"[4, 5, … 63]",60


他のカラムのデータを使って、同時に計算処理だけでなくlistの作成とかもできるのか。  
polarsに慣れると便利だろうけれど、できることが多すぎて覚えることも多いので、pandasから移行しないと少し大変かも？

In [26]:
pl.select(
    pl.date_range(date(1985, 10, 21), date(1985, 10, 26)).alias('start'),
    pl.repeat(date(2021, 10, 21), 6).alias('end')
).with_columns(
    pl.datetime_ranges('start', 'end', interval='1h').alias('range')
)

start,end,range
date,date,list[datetime[μs]]
1985-10-21,2021-10-21,"[1985-10-21 00:00:00, 1985-10-21 01:00:00, … 2021-10-21 00:00:00]"
1985-10-22,2021-10-21,"[1985-10-22 00:00:00, 1985-10-22 01:00:00, … 2021-10-21 00:00:00]"
1985-10-23,2021-10-21,"[1985-10-23 00:00:00, 1985-10-23 01:00:00, … 2021-10-21 00:00:00]"
1985-10-24,2021-10-21,"[1985-10-24 00:00:00, 1985-10-24 01:00:00, … 2021-10-21 00:00:00]"
1985-10-25,2021-10-21,"[1985-10-25 00:00:00, 1985-10-25 01:00:00, … 2021-10-21 00:00:00]"
1985-10-26,2021-10-21,"[1985-10-26 00:00:00, 1985-10-26 01:00:00, … 2021-10-21 00:00:00]"


integer型だけじゃなくて、datetime系の型でもrangeが使えるのか。  
便利な気がするけれど、出番がパッと思いつかないな..。

### 名称変更

In [27]:
df = pl.DataFrame({'text': 'value', 'An integer': 5040, 'BOOLEAN': True})
df

text,An integer,BOOLEAN
str,i64,bool
"""value""",5040,True


In [28]:
df.select(
    pl.col('text').name.to_uppercase(),
    pl.col('An integer').alias('int'),
    pl.col('BOOLEAN').name.to_lowercase()
)

TEXT,int,boolean
str,i64,bool
"""value""",5040,True


今のpolarsだと、命名操作は1つの式の中で、1つのカラムに対して1回しか許可されてない。  
まぁ、何回もカラム名を変えるって操作そんなに出番なさそう

In [29]:
df.select(
    pl.all()
    .name.map(lambda s: s.lower().replace(' ', ''))
)

text,aninteger,boolean
str,i64,bool
"""value""",5040,True


In [30]:
fruit.filter(
    (fruit['weight'] > 1000) & fruit['is_round']
)

name,weight,color,is_round,origin
str,i64,str,bool,str
"""Cantaloupe""",2500,"""orange""",True,"""Africa"""
"""Watermelon""",5000,"""green""",True,"""Africa"""


`pl.col('weight')`みたいなpolarsっぽい記法でももちろん実行できるけれど、pandasみたいに`fruit['weight']`っていう`[`、`]`もサポートしているのね。  
その代わり、polars特有の最適化が適用できない

In [31]:
(
    fruit
    .lazy()
    .filter((pl.col('weight') > 1000) & pl.col('is_round'))
    .with_columns(pl.col('name').str.ends_with('berry').alias('is_berry'))
    .collect()
)

name,weight,color,is_round,origin,is_berry
str,i64,str,bool,str,bool
"""Cantaloupe""",2500,"""orange""",True,"""Africa""",False
"""Watermelon""",5000,"""green""",True,"""Africa""",False


In [32]:
(
    fruit
    .lazy()
    .filter((fruit['weight'] > 1000) & fruit['is_round'])
    .with_columns(fruit['name'].str.ends_with('berry').alias('is_berry'))
    .collect()
)

ShapeError: unable to add a column of length 10 to a DataFrame of height 2

pandasの記法で最適化を実施する際に上記エラーが出る理由は、直接的にDataFrameを指定しているため、`.filter()`によってフィルタリングされた結果と`.with_columns()`にて追加しようとしている`fruit['name']`のデータ数が不一致になるためである。  
polarsでは`pl.col('カラム名')`の形式で指定するため、最適化する際にデータの処理が可変してくれる（それを可変してくれる点を含めて最適化）