## 4.2 複数の変数を含む列があるとき
Pivotテーブルの列名が複数の変数を結合した文字列の場合

In [12]:
import polars as pl
# Keep columns from being omitted when a DataFrame is displayed.
pl.Config.set_tbl_cols(-1)

polars.config.Config

### データを読み込む

In [13]:
# Column names are written as <status>_<country>, e.g.
#   Cases_Guinea:  number of cases in Guinea
#   Deaths_Guinea: number of deaths in Guinea
ebola = pl.read_csv("../data/country_timeseries.csv")
# Preview the first rows.
ebola.head()

Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""1/5/2015""",289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
"""1/4/2015""",288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
"""1/3/2015""",287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
"""1/2/2015""",286,,8157.0,,,,,,,,3496.0,,,,,,
"""12/31/2014""",284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,


### データを変換する

#### 縦持ちに変換する

In [14]:
# Reshape from wide to long: "Date" and "Day" stay as identifier columns and
# every other column is stacked into a ("variable", "value") pair.
# NOTE(review): newer Polars releases expose this operation as `unpivot` and
# deprecate `melt` — confirm against the installed version before changing it.
ebola_long = ebola.melt(id_vars = ["Date", "Day"])
ebola_long.head()

Date,Day,variable,value
str,i64,str,i64
"""1/5/2015""",289,"""Cases_Guinea""",2776.0
"""1/4/2015""",288,"""Cases_Guinea""",2775.0
"""1/3/2015""",287,"""Cases_Guinea""",2769.0
"""1/2/2015""",286,"""Cases_Guinea""",
"""12/31/2014""",284,"""Cases_Guinea""",2730.0


#### 結合された列を分割する

In [15]:
# Split the combined "variable" column (e.g. "Cases_Guinea") into two columns.

# 1) Split each string on "_" into a struct holding the two pieces.
split_struct = ebola_long["variable"].str.split_exact("_", 1)

# 2) Give the struct fields meaningful names.
named_struct = split_struct.struct.rename_fields(["status", "country"])

# 3) Give the Series a temporary name, wrap it in a DataFrame, and expand the
#    struct into ordinary "status" / "country" columns.
df_variable_split = named_struct.alias("fields").to_frame().unnest("fields")

# Append the new columns side by side to the long-format table.
ebola_long = pl.concat(items=[ebola_long, df_variable_split], how="horizontal")

ebola_long.head()

Date,Day,variable,value,status,country
str,i64,str,i64,str,str
"""1/5/2015""",289,"""Cases_Guinea""",2776.0,"""Cases""","""Guinea"""
"""1/4/2015""",288,"""Cases_Guinea""",2775.0,"""Cases""","""Guinea"""
"""1/3/2015""",287,"""Cases_Guinea""",2769.0,"""Cases""","""Guinea"""
"""1/2/2015""",286,"""Cases_Guinea""",,"""Cases""","""Guinea"""
"""12/31/2014""",284,"""Cases_Guinea""",2730.0,"""Cases""","""Guinea"""


### パイプラインを作成する
上記の内容を1セルにまとめる

In [16]:
import polars as pl
pl.Config.set_tbl_cols(-1)

# Load the data.
ebola = pl.read_csv("../data/country_timeseries.csv")

# Expression that splits the "<status>_<country>" strings in "variable" into a
# struct column named "fields" with the fields "status" and "country".
status_country = (
    pl.col("variable")
    .str.split_exact("_", 1)
    .struct.rename_fields(["status", "country"])
    .alias("fields")
)

# Reshape to long format, attach the struct column, then expand it.
# When the split is built from pl.col(), unnest() goes at the end of the chain.
ebola_long = (
    ebola
    .melt(id_vars=["Date", "Day"])
    .with_columns(status_country)
    .unnest("fields")
)

ebola_long.head()

Date,Day,variable,value,status,country
str,i64,str,i64,str,str
"""1/5/2015""",289,"""Cases_Guinea""",2776.0,"""Cases""","""Guinea"""
"""1/4/2015""",288,"""Cases_Guinea""",2775.0,"""Cases""","""Guinea"""
"""1/3/2015""",287,"""Cases_Guinea""",2769.0,"""Cases""","""Guinea"""
"""1/2/2015""",286,"""Cases_Guinea""",,"""Cases""","""Guinea"""
"""12/31/2014""",284,"""Cases_Guinea""",2730.0,"""Cases""","""Guinea"""


### 