# 列の選択と作成

In [1]:
from datetime import datetime
import os

import polars as pl
import polars.selectors as cs

## 定数定義

In [2]:
DATA_PAR_PATH = os.path.join('..','..','data')
INPUT_CSV_PATH_STARWARS = os.path.join(DATA_PAR_PATH,'starwars.parquet')

## 手始め

In [3]:
starwars = pl.read_parquet(INPUT_CSV_PATH_STARWARS)

In [4]:
rebels = (
    starwars
    .drop('films')
    .filter(pl.col('name').is_in(['Luke Skywalker', 'Leia Orange', 'Han Solo']))
)

print(rebels[:, :6])
print(rebels[:, 6:11])
print(rebels[:, 11:])

shape: (2, 6)
┌────────────────┬────────┬──────┬────────────┬────────────┬───────────┐
│ name           ┆ height ┆ mass ┆ hair_color ┆ skin_color ┆ eye_color │
│ ---            ┆ ---    ┆ ---  ┆ ---        ┆ ---        ┆ ---       │
│ str            ┆ u16    ┆ f64  ┆ str        ┆ str        ┆ str       │
╞════════════════╪════════╪══════╪════════════╪════════════╪═══════════╡
│ Han Solo       ┆ 180    ┆ 80.0 ┆ brown      ┆ fair       ┆ brown     │
│ Luke Skywalker ┆ 172    ┆ 77.0 ┆ blond      ┆ fair       ┆ blue      │
└────────────────┴────────┴──────┴────────────┴────────────┴───────────┘
shape: (2, 5)
┌────────────┬──────┬───────────┬───────────┬─────────┐
│ birth_year ┆ sex  ┆ gender    ┆ homeworld ┆ species │
│ ---        ┆ ---  ┆ ---       ┆ ---       ┆ ---     │
│ f64        ┆ cat  ┆ cat       ┆ str       ┆ str     │
╞════════════╪══════╪═══════════╪═══════════╪═════════╡
│ 29.0       ┆ male ┆ masculine ┆ Corellia  ┆ Human   │
│ 19.0       ┆ male ┆ masculine ┆ Tatooine  ┆ Human 

## 列の選択

### 文字列型に基づいた選択手法

In [5]:
rebels.select(
    'name',
    pl.col('homeworld'),
    pl.col('^.*_color$'),
    (pl.col('height') / 100).alias('height_m')
)

name,homeworld,hair_color,skin_color,eye_color,height_m
str,str,str,str,str,f64
"""Han Solo""","""Corellia""","""brown""","""fair""","""brown""",1.8
"""Luke Skywalker""","""Tatooine""","""blond""","""fair""","""blue""",1.72


In [6]:
rebels.select(
    'name',
    cs.by_name('homeworld'),
    cs.by_name('^.*_color$'),
    (cs.by_name('height') / 100).alias('height_m')
)

name,homeworld,hair_color,skin_color,eye_color,height_m
str,str,str,str,str,f64
"""Han Solo""","""Corellia""","""brown""","""fair""","""brown""",1.8
"""Luke Skywalker""","""Tatooine""","""blond""","""fair""","""blue""",1.72


上のセルでセレクター（=`cs`）を使っているけれど、従来の`pl.col()`と同じように利用できる。  
以降で、セレクターの特徴を列挙する

In [7]:
rebels.select(cs.starts_with('birth_'))

birth_year,birth_date
f64,date
29.0,1948-06-01
19.0,1958-05-30


In [8]:
rebels.select(cs.ends_with('_color'))

hair_color,skin_color,eye_color
str,str,str
"""brown""","""fair""","""brown"""
"""blond""","""fair""","""blue"""


In [9]:
rebels.select(cs.contains('_'))

hair_color,skin_color,eye_color,birth_year,birth_date,screen_time
str,str,str,f64,date,duration[μs]
"""brown""","""fair""","""brown""",29.0,1948-06-01,1h 12m 37s
"""blond""","""fair""","""blue""",19.0,1958-05-30,1h 58m 44s


セレクターを使うと、一致する単語に関する列を抽出する操作が一気に楽になるのか

In [10]:
rebels.select(cs.matches('^[a-z]{4}$'))

name,mass
str,f64
"""Han Solo""",80.0
"""Luke Skywalker""",77.0


### データ型に基づいた選択手法

In [11]:
rebels.group_by('hair_color').agg(cs.numeric().mean())

hair_color,height,mass,birth_year
str,f64,f64,f64
"""blond""",172.0,77.0,19.0
"""brown""",180.0,80.0,29.0


In [12]:
rebels.select(cs.string())

name,hair_color,skin_color,eye_color,homeworld,species
str,str,str,str,str,str
"""Han Solo""","""brown""","""fair""","""brown""","""Corellia""","""Human"""
"""Luke Skywalker""","""blond""","""fair""","""blue""","""Tatooine""","""Human"""


In [13]:
rebels.select(cs.temporal())

birth_date,screen_time
date,duration[μs]
1948-06-01,1h 12m 37s
1958-05-30,1h 58m 44s


`cs.temporal()`はdatetimeやdateといった時間系のデータ型を対象に抽出する

In [14]:
rebels.select(cs.by_dtype(pl.List(pl.String)))

vehicles,starships
list[str],list[str]
,"[""Millennium Falcon"", ""Imperial shuttle""]"
"[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]"


ネストされたデータに対しては、ネストされたデータの要素の型も指定してあげないといけない

### ポジションに基づいた選択手法

In [15]:
rebels.select(cs.by_index(range(0, 999, 3)))

name,hair_color,birth_year,homeworld,starships
str,str,f64,str,list[str]
"""Han Solo""","""brown""",29.0,"""Corellia""","[""Millennium Falcon"", ""Imperial shuttle""]"
"""Luke Skywalker""","""blond""",19.0,"""Tatooine""","[""X-wing"", ""Imperial shuttle""]"


In [16]:
rebels.select('name', cs.by_index(range(-2, 0)))

name,birth_date,screen_time
str,date,duration[μs]
"""Han Solo""",1948-06-01,1h 12m 37s
"""Luke Skywalker""",1958-05-30,1h 58m 44s


### セレクターの組み合わせ

In [17]:
rebels.select(cs.by_name('hair_color') | cs.numeric())

height,mass,hair_color,birth_year
u16,f64,str,f64
180,80.0,"""brown""",29.0
172,77.0,"""blond""",19.0


通常のpolarsだと、列名を指定するときは、文字列 or データ型のいずれかのみにする必要があって、両方で指定しようとするとエラーになっていた。  
でも、セレクターを使うとそれが実現できる。その代わり（？）、セレクターの`cs.by_name()`を使った文字列の指定が条件（`pl.col()`による指定はだめ）

In [18]:
df = pl.DataFrame({'d': 1, 'i': True, 's': True, 'c': True, 'o': 1.0})

print(df)

x = cs.by_name('d', 'i', 's')
y = cs.boolean()

print('\nselector => columns')

for s in ['x', 'y', 'x | y', 'x & y', 'x - y', 'x ^ y', '~x', 'x - x']:
    print(f"{s:8} => {cs.expand_selector(df, eval(s))}")

shape: (1, 5)
┌─────┬──────┬──────┬──────┬─────┐
│ d   ┆ i    ┆ s    ┆ c    ┆ o   │
│ --- ┆ ---  ┆ ---  ┆ ---  ┆ --- │
│ i64 ┆ bool ┆ bool ┆ bool ┆ f64 │
╞═════╪══════╪══════╪══════╪═════╡
│ 1   ┆ true ┆ true ┆ true ┆ 1.0 │
└─────┴──────┴──────┴──────┴─────┘

selector => columns
x        => ('d', 'i', 's')
y        => ('i', 's', 'c')
x | y    => ('d', 'i', 's', 'c')
x & y    => ('i', 's')
x - y    => ('d',)
x ^ y    => ('d', 'c')
~x       => ('c', 'o')
x - x    => ()


In [19]:
print(df.select(first := cs.by_name('c', 'i'), ~first))
print(df.select(first := cs.last(), ~first))

shape: (1, 5)
┌──────┬──────┬─────┬──────┬─────┐
│ c    ┆ i    ┆ d   ┆ s    ┆ o   │
│ ---  ┆ ---  ┆ --- ┆ ---  ┆ --- │
│ bool ┆ bool ┆ i64 ┆ bool ┆ f64 │
╞══════╪══════╪═════╪══════╪═════╡
│ true ┆ true ┆ 1   ┆ true ┆ 1.0 │
└──────┴──────┴─────┴──────┴─────┘
shape: (1, 5)
┌─────┬─────┬──────┬──────┬──────┐
│ o   ┆ d   ┆ i    ┆ s    ┆ c    │
│ --- ┆ --- ┆ ---  ┆ ---  ┆ ---  │
│ f64 ┆ i64 ┆ bool ┆ bool ┆ bool │
╞═════╪═════╪══════╪══════╪══════╡
│ 1.0 ┆ 1   ┆ true ┆ true ┆ true │
└─────┴─────┴──────┴──────┴──────┘


## 列の作成

In [20]:
rebels.with_columns(bmi=pl.col('mass') / ((pl.col('height') // 100) ** 2))

name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time,bmi
str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs],f64
"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s,80.0
"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s,77.0


In [21]:
rebels.with_columns(
    bmi=pl.col('mass') / ((pl.col('height') / 100) ** 2),
    age_destroy=((datetime(1983, 5, 25) - pl.col('birth_date'))
                 .dt.total_days() / 356).cast(pl.UInt8)
)

name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time,bmi,age_destroy
str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs],f64,u8
"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s,24.691358,35
"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s,26.027582,25


In [22]:
(
    starwars
    .select(
        'name',
        (pl.col('mass') / ((pl.col('height') / 100) ** 2)).alias('bmi'),
        'species'
    )
    .drop_nulls().top_k(5, by='bmi')
)

name,bmi,species
str,f64,str
"""Jabba Desilijic Tiure""",443.428571,"""Hutt"""
"""Dud Bolt""",50.928022,"""Vulptereen"""
"""Yoda""",39.02663,"""Yoda's species"""
"""Owen Lars""",37.874006,"""Human"""
"""IG-88""",35.0,"""Droid"""


## 列の操作

In [23]:
rebels.drop('name', 'films', 'screen_time', strict=False)

height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date
u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date
180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01
172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30


`strict=False`を指定することによって、存在しない列名を指定したとしてもエラーにならない。  
便利かもしれないけれど、意識して存在する列名を指定した方がミスが少ないはずなので、これに頼りすぎないようにしよう

In [24]:
(
    rebels
    .rename({'homeworld': 'planet', 'mass': 'weight'})
    .rename(lambda s: s.removesuffix('_color'))
    .select('name', 'planet', 'weight', 'hair', 'skin', 'eye')
)

name,planet,weight,hair,skin,eye
str,str,f64,str,str,str
"""Han Solo""","""Corellia""",80.0,"""brown""","""fair""","""brown"""
"""Luke Skywalker""","""Tatooine""",77.0,"""blond""","""fair""","""blue"""


In [25]:
rebel_names = rebels.select('name')
rebel_colors = rebels.select(cs.ends_with('_color'))
rebel_quotes = pl.Series('quote', ['You know, sometimes I amaze myself.',
                                   'I have a bad feeling about this.'])

(
    rebel_names
    .hstack(rebel_colors)
    .hstack([rebel_quotes])
)

name,hair_color,skin_color,eye_color,quote
str,str,str,str,str
"""Han Solo""","""brown""","""fair""","""brown""","""You know, sometimes I amaze my…"
"""Luke Skywalker""","""blond""","""fair""","""blue""","""I have a bad feeling about thi…"


`.with_columns()`を使って同時に式を作っても良いが、別々のDataFrame(or Series)型をスタックして結合することができる

In [26]:
rebels.with_row_index(name='rebel_id', offset=1)

rebel_id,name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,vehicles,starships,birth_date,screen_time
u32,str,u16,f64,str,str,str,f64,cat,cat,str,str,list[str],list[str],date,duration[μs]
1,"""Han Solo""",180,80.0,"""brown""","""fair""","""brown""",29.0,"""male""","""masculine""","""Corellia""","""Human""",,"[""Millennium Falcon"", ""Imperial shuttle""]",1948-06-01,1h 12m 37s
2,"""Luke Skywalker""",172,77.0,"""blond""","""fair""","""blue""",19.0,"""male""","""masculine""","""Tatooine""","""Human""","[""Snowspeeder"", ""Imperial Speeder Bike""]","[""X-wing"", ""Imperial shuttle""]",1958-05-30,1h 58m 44s


（余談 / 気付き）  
polarsのメソッドで、`.with_~~`みたいな感じでwithから始まる系のメソッドは追加する操作なのかもしれない