# 第8章 groupby演算による分割-適用-結合

## 8.1 集約

### 8.1.1 1この変数で分割する基本的な集約

In [37]:
import polars as pl

In [38]:
df = pl.read_csv("../data/gapminder.tsv", separator = "\t")
df.head()

country,continent,year,lifeExp,pop,gdpPercap
str,str,i64,f64,i64,f64
"""Afghanistan""","""Asia""",1952,28.801,8425333,779.445314
"""Afghanistan""","""Asia""",1957,30.332,9240934,820.85303
"""Afghanistan""","""Asia""",1962,31.997,10267083,853.10071
"""Afghanistan""","""Asia""",1967,34.02,11537966,836.197138
"""Afghanistan""","""Asia""",1972,36.088,13079460,739.981106


In [39]:
avg_life_by_year = df.group_by(by = "year").agg( pl.col("lifeExp").mean() ).sort(by = "year")
print(avg_life_by_year)

shape: (12, 2)
┌──────┬───────────┐
│ year ┆ lifeExp   │
│ ---  ┆ ---       │
│ i64  ┆ f64       │
╞══════╪═══════════╡
│ 1952 ┆ 49.05762  │
│ 1957 ┆ 51.507401 │
│ 1962 ┆ 53.609249 │
│ 1967 ┆ 55.67829  │
│ …    ┆ …         │
│ 1992 ┆ 64.160338 │
│ 1997 ┆ 65.014676 │
│ 2002 ┆ 65.694923 │
│ 2007 ┆ 67.007423 │
└──────┴───────────┘


In [40]:
#pandasの場合、uniqueメソッドでリストを取得できる
#polarsの場合、to_listメソッドも必要
years = df["year"].unique().to_list()
print(years)

[1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002, 2007]


In [41]:
y1952 = df.filter( pl.col("year") == 1952 )
print(y1952)

shape: (142, 6)
┌────────────────────┬───────────┬──────┬─────────┬─────────┬─────────────┐
│ country            ┆ continent ┆ year ┆ lifeExp ┆ pop     ┆ gdpPercap   │
│ ---                ┆ ---       ┆ ---  ┆ ---     ┆ ---     ┆ ---         │
│ str                ┆ str       ┆ i64  ┆ f64     ┆ i64     ┆ f64         │
╞════════════════════╪═══════════╪══════╪═════════╪═════════╪═════════════╡
│ Afghanistan        ┆ Asia      ┆ 1952 ┆ 28.801  ┆ 8425333 ┆ 779.445314  │
│ Albania            ┆ Europe    ┆ 1952 ┆ 55.23   ┆ 1282697 ┆ 1601.056136 │
│ Algeria            ┆ Africa    ┆ 1952 ┆ 43.077  ┆ 9279525 ┆ 2449.008185 │
│ Angola             ┆ Africa    ┆ 1952 ┆ 30.015  ┆ 4232095 ┆ 3520.610273 │
│ …                  ┆ …         ┆ …    ┆ …       ┆ …       ┆ …           │
│ West Bank and Gaza ┆ Asia      ┆ 1952 ┆ 43.16   ┆ 1030585 ┆ 1515.592329 │
│ Yemen, Rep.        ┆ Asia      ┆ 1952 ┆ 32.548  ┆ 4963829 ┆ 781.717576  │
│ Zambia             ┆ Africa    ┆ 1952 ┆ 42.038  ┆ 2672000 ┆ 1147.38883

In [42]:
y1952_mean = y1952["lifeExp"].mean()
print(y1952_mean)

49.05761971830986


### 8.1.2 組込みの集約メソッド

In [43]:
print( df.group_by(by = "year").agg( pl.col("country").count() ).head(1) )
# polarsには無いようだ
#print( df.group_by(by = "year").agg( pl.col("country").size() ) )
print( df.group_by(by = "year").agg( pl.col("pop").mean() ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").std() ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").min() ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").quantile(quantile = 0.25) ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").quantile(quantile = 0.5) ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").quantile(quantile = 0.75) ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").max() ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").sum() ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").var() ).head(1) )
#print( df.group_by(by = "year").agg( pl.col("pop").sem() ).head(1) )
#print( df.group_by(by = "year").agg( pl.col("pop").describe() ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").first() ).head(1) )
print( df.group_by(by = "year").agg( pl.col("pop").last() ).head(1) )
#print( df.group_by(by = "year").agg( pl.col("pop").nth() ).head(1) )

shape: (1, 2)
┌──────┬─────────┐
│ year ┆ country │
│ ---  ┆ ---     │
│ i64  ┆ u32     │
╞══════╪═════════╡
│ 1992 ┆ 142     │
└──────┴─────────┘
shape: (1, 2)
┌──────┬──────────┐
│ year ┆ pop      │
│ ---  ┆ ---      │
│ i64  ┆ f64      │
╞══════╪══════════╡
│ 1952 ┆ 1.6950e7 │
└──────┴──────────┘
shape: (1, 2)
┌──────┬──────────┐
│ year ┆ pop      │
│ ---  ┆ ---      │
│ i64  ┆ f64      │
╞══════╪══════════╡
│ 1992 ┆ 1.2450e8 │
└──────┴──────────┘
shape: (1, 2)
┌──────┬────────┐
│ year ┆ pop    │
│ ---  ┆ ---    │
│ i64  ┆ i64    │
╞══════╪════════╡
│ 1997 ┆ 145608 │
└──────┴────────┘
shape: (1, 2)
┌──────┬────────────┐
│ year ┆ pop        │
│ ---  ┆ ---        │
│ i64  ┆ f64        │
╞══════╪════════════╡
│ 1977 ┆ 2.721783e6 │
└──────┴────────────┘
shape: (1, 2)
┌──────┬────────────┐
│ year ┆ pop        │
│ ---  ┆ ---        │
│ i64  ┆ f64        │
╞══════╪════════════╡
│ 1977 ┆ 6.491649e6 │
└──────┴────────────┘
shape: (1, 2)
┌──────┬─────────────┐
│ year ┆ pop         │
│ ---  ┆ 

shape: (1, 2)
┌──────┬────────────┐
│ year ┆ pop        │
│ ---  ┆ ---        │
│ i64  ┆ i64        │
╞══════╪════════════╡
│ 1997 ┆ 5515204472 │
└──────┴────────────┘
shape: (1, 2)
┌──────┬───────────┐
│ year ┆ pop       │
│ ---  ┆ ---       │
│ i64  ┆ f64       │
╞══════╪═══════════╡
│ 1992 ┆ 1.5501e16 │
└──────┴───────────┘
shape: (1, 2)
┌──────┬─────────┐
│ year ┆ pop     │
│ ---  ┆ ---     │
│ i64  ┆ i64     │
╞══════╪═════════╡
│ 1952 ┆ 8425333 │
└──────┴─────────┘
shape: (1, 2)
┌──────┬──────────┐
│ year ┆ pop      │
│ ---  ┆ ---      │
│ i64  ┆ i64      │
╞══════╪══════════╡
│ 2002 ┆ 11926563 │
└──────┴──────────┘


In [44]:
# pandasだとgroupbyメソッドの後に["列名"]を取れるが、polarsではできない
# またdescribeも実装していないので、工夫が必要
#continent_describe = df.group_by(by = "continent")["lifeExp"].describe()
# 
def describe_polars(df_argument = pl.DataFrame({"Fruit": ["apple", "apple", "apple", "lemon", "lemon", "lemon"],
                                                 "Price": [90, 100, 110, 180, 200, 220]}),
                     by_argument = "Fruit",
                       explain_argument = "Price"):
    df_group_by = df_argument.group_by(by = by_argument)

    result = df_group_by\
        .agg( pl.col(explain_argument).count() ).rename(mapping = {explain_argument: "count"})\
            .with_columns([
                df_group_by.agg( pl.col(explain_argument).mean() )[explain_argument].alias("mean"),
                df_group_by.agg( pl.col(explain_argument).std() )[explain_argument].alias("std"),
                df_group_by.agg( pl.col(explain_argument).min() )[explain_argument].alias("min"),
                df_group_by.agg( pl.col(explain_argument).quantile(quantile = 0.25) )[explain_argument].alias("25%"),
                df_group_by.agg( pl.col(explain_argument).quantile(quantile = 0.5) )[explain_argument].alias("50%"),
                df_group_by.agg( pl.col(explain_argument).quantile(quantile = 0.75) )[explain_argument].alias("75%"),
                df_group_by.agg( pl.col(explain_argument).max() )[explain_argument].alias("max")
                ])
    
    return result



continent = df.group_by(by = "continent")

col_name = "lifeExp"

continent_describe = continent.agg( pl.col(col_name).count() ).rename(mapping = {col_name: "Count"}).with_columns([
    continent.agg( pl.col(col_name).mean() )[col_name].alias("mean"),
    continent.agg( pl.col(col_name).std() )[col_name].alias("std"),
    continent.agg( pl.col(col_name).min() )[col_name].alias("min"),
    continent.agg( pl.col(col_name).quantile(quantile = 0.25) )[col_name].alias("25%"),
    continent.agg( pl.col(col_name).quantile(quantile = 0.5) )[col_name].alias("50%"),
    continent.agg( pl.col(col_name).quantile(quantile = 0.75) )[col_name].alias("75%"),
    continent.agg( pl.col(col_name).max() )[col_name].alias("max")
])

continent_describe

continent,Count,mean,std,min,25%,50%,75%,max
str,u32,f64,f64,f64,f64,f64,f64,f64
"""Americas""",300,71.903686,5.433178,23.599,69.58,67.052,69.521,81.235
"""Oceania""",24,64.658737,9.15021,69.12,42.384,47.8,71.752,81.757
"""Asia""",396,60.064903,9.345088,28.801,71.24,73.84,54.425,80.653
"""Africa""",624,74.326208,3.795611,43.585,58.447,72.25,75.467,76.442
"""Europe""",360,48.86533,11.864532,37.579,51.457,61.818,77.56,82.603


In [45]:
describe_polars(df, "continent", "lifeExp")


continent,count,mean,std,min,25%,50%,75%,max
str,u32,f64,f64,f64,f64,f64,f64,f64
"""Americas""",300,60.064903,3.795611,28.801,71.24,67.052,69.521,81.757
"""Europe""",360,74.326208,9.15021,37.579,58.447,47.8,71.752,81.235
"""Oceania""",24,64.658737,9.345088,43.585,69.58,73.84,77.56,82.603
"""Africa""",624,71.903686,11.864532,69.12,51.457,72.25,75.467,80.653
"""Asia""",396,48.86533,5.433178,23.599,42.384,61.818,54.425,76.442


### 8.1.3 集約関数

#### 8.1.3.1 他のライブラリの関数

In [54]:
import numpy as np

# polarsは新しいライブラリのため、他のライブラリとの互換性がないので、to_pandasを行っておく
# 良くわからないエラーが出た
df_pandas = df.to_pandas()
cont_le_agg = df_pandas.groupby("continent")["lifeExp"].agg(np.mean)
print(cont_le_agg)

continent
Africa      48.865330
Americas    64.658737
Asia        60.064903
Europe      71.903686
Oceania     74.326208
Name: lifeExp, dtype: float64


  cont_le_agg = df_pandas.groupby("continent")["lifeExp"].agg(np.mean)


#### 8.1.3.2 カスタムのユーザー関数

In [56]:
def my_mean(values):
    n = len(values)

    sum = 0
    for value in values:
        sum += value
    
    return sum / n

In [57]:
agg_my_mean = df.group_by("year").agg( pl.col("lifeExp").map_elements(my_mean) )
print(agg_my_mean)

shape: (12, 2)
┌──────┬───────────┐
│ year ┆ lifeExp   │
│ ---  ┆ ---       │
│ i64  ┆ f64       │
╞══════╪═══════════╡
│ 1977 ┆ 59.570157 │
│ 1992 ┆ 64.160338 │
│ 2007 ┆ 67.007423 │
│ 1957 ┆ 51.507401 │
│ …    ┆ …         │
│ 1967 ┆ 55.67829  │
│ 1997 ┆ 65.014676 │
│ 1982 ┆ 61.533197 │
│ 1962 ┆ 53.609249 │
└──────┴───────────┘


### 8.1.4 複数の関数を同時に計算する

In [66]:
gdf = df\
    .group_by("year")\
        .agg([pl.col("lifeExp").count().alias("count"),
              pl.col("lifeExp").mean().alias("mean"),
              pl.col("lifeExp").std().alias("std")])
print(gdf)

shape: (12, 4)
┌──────┬───────┬───────────┬───────────┐
│ year ┆ count ┆ mean      ┆ std       │
│ ---  ┆ ---   ┆ ---       ┆ ---       │
│ i64  ┆ u32   ┆ f64       ┆ f64       │
╞══════╪═══════╪═══════════╪═══════════╡
│ 1977 ┆ 142   ┆ 59.570157 ┆ 11.227229 │
│ 1992 ┆ 142   ┆ 64.160338 ┆ 11.22738  │
│ 2007 ┆ 142   ┆ 67.007423 ┆ 12.073021 │
│ 1957 ┆ 142   ┆ 51.507401 ┆ 12.231286 │
│ …    ┆ …     ┆ …         ┆ …         │
│ 1982 ┆ 142   ┆ 61.533197 ┆ 10.770618 │
│ 1997 ┆ 142   ┆ 65.014676 ┆ 11.559439 │
│ 1967 ┆ 142   ┆ 55.67829  ┆ 11.718858 │
│ 1962 ┆ 142   ┆ 53.609249 ┆ 12.097245 │
└──────┴───────┴───────────┴───────────┘


### 8.1.5 .agg/aggregateでdictを使う

#### 8.1.5.1 DataFrameに対するdictの指定

In [68]:
# polarsのaggではdictはつかえないようだ
'''
gdf = df\
    .group_by("year")\
        .agg({
            "lifeExp": "mean",
            "pop": "median",
            "gdpPercap": "median"
        })
print(gdf)
'''

'\ngdf = df    .group_by("year")        .agg({\n            "lifeExp": "mean",\n            "pop": "median",\n            "gdpPercap": "median"\n        })\nprint(gdf)\n'

## 8.2 変換(transform)

### 8.2.1 zスコアの例
polarsにはtransformが無いので省略

### 8.2.2 欠損値の例
この句はpandasで書いている

In [70]:
import seaborn as sns
import numpy as np

np.random.seed(42)

tips_10 = sns.load_dataset("tips").sample(10)

tips_10.loc[
    np.random.permutation(tips_10.index)[:4],
    "total_bill"] = np.NaN

print(tips_10)

     total_bill   tip     sex smoker   day    time  size
24        19.82  3.18    Male     No   Sat  Dinner     2
6          8.77  2.00    Male     No   Sun  Dinner     2
153         NaN  2.00    Male     No   Sun  Dinner     4
211         NaN  5.16    Male    Yes   Sat  Dinner     4
198         NaN  2.00  Female    Yes  Thur   Lunch     2
176         NaN  2.00    Male    Yes   Sun  Dinner     2
192       28.44  2.56    Male    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
9         14.78  3.23    Male     No   Sun  Dinner     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2


In [71]:
count_sex = tips_10.groupby("sex").count()
print(count_sex)

        total_bill  tip  smoker  day  time  size
sex                                             
Male             4    7       7    7     7     7
Female           2    3       3    3     3     3


  count_sex = tips_10.groupby("sex").count()


In [73]:
def fill_na_mean(x):
    avg = x.mean()
    return x.fillna(avg)

total_bill_group_mean = tips_10.groupby("sex").total_bill.transform(fill_na_mean)

tips_10["fill_total_bill"] = total_bill_group_mean

     total_bill   tip     sex smoker   day    time  size  fill_total_bill
24        19.82  3.18    Male     No   Sat  Dinner     2          19.8200
6          8.77  2.00    Male     No   Sun  Dinner     2           8.7700
153         NaN  2.00    Male     No   Sun  Dinner     4          17.9525
211         NaN  5.16    Male    Yes   Sat  Dinner     4          17.9525
198         NaN  2.00  Female    Yes  Thur   Lunch     2          13.9300
176         NaN  2.00    Male    Yes   Sun  Dinner     2          17.9525
192       28.44  2.56    Male    Yes  Thur   Lunch     2          28.4400
124       12.48  2.52  Female     No  Thur   Lunch     2          12.4800
9         14.78  3.23    Male     No   Sun  Dinner     2          14.7800
101       15.38  3.00  Female    Yes   Fri  Dinner     2          15.3800


  total_bill_group_mean = tips_10.groupby("sex").total_bill.transform(fill_na_mean)


In [74]:
print(tips_10[["sex", "total_bill", "fill_total_bill"]])

        sex  total_bill  fill_total_bill
24     Male       19.82          19.8200
6      Male        8.77           8.7700
153    Male         NaN          17.9525
211    Male         NaN          17.9525
198  Female         NaN          13.9300
176    Male         NaN          17.9525
192    Male       28.44          28.4400
124  Female       12.48          12.4800
9      Male       14.78          14.7800
101  Female       15.38          15.3800


## 8.3 フィルタリング
polarsとpandasで少し挙動が異なる

In [75]:
tips = pl.DataFrame( sns.load_dataset("tips") )

print(tips.shape)

print(tips["size"].value_counts())

(244, 7)
shape: (6, 2)
┌──────┬────────┐
│ size ┆ counts │
│ ---  ┆ ---    │
│ i64  ┆ u32    │
╞══════╪════════╡
│ 1    ┆ 4      │
│ 3    ┆ 38     │
│ 4    ┆ 37     │
│ 6    ┆ 4      │
│ 2    ┆ 156    │
│ 5    ┆ 5      │
└──────┴────────┘


In [80]:
tips_filtered = tips.group_by(by = "size").agg( pl.col("size").count().alias("counts") ).filter( pl.col("counts") >= 30 )
print(tips_filtered)
print(tips_filtered.shape)


shape: (3, 2)
┌──────┬────────┐
│ size ┆ counts │
│ ---  ┆ ---    │
│ i64  ┆ u32    │
╞══════╪════════╡
│ 3    ┆ 38     │
│ 2    ┆ 156    │
│ 4    ┆ 37     │
└──────┴────────┘
(3, 2)


## 8.4 DataFrameGroupByオブジェクト

### 8.4.1 group

In [81]:
tips_10 = pl.DataFrame( sns.load_dataset("tips").sample(10, random_state = 42) )
print(tips_10)

shape: (10, 7)
┌────────────┬──────┬────────┬────────┬──────┬────────┬──────┐
│ total_bill ┆ tip  ┆ sex    ┆ smoker ┆ day  ┆ time   ┆ size │
│ ---        ┆ ---  ┆ ---    ┆ ---    ┆ ---  ┆ ---    ┆ ---  │
│ f64        ┆ f64  ┆ cat    ┆ cat    ┆ cat  ┆ cat    ┆ i64  │
╞════════════╪══════╪════════╪════════╪══════╪════════╪══════╡
│ 19.82      ┆ 3.18 ┆ Male   ┆ No     ┆ Sat  ┆ Dinner ┆ 2    │
│ 8.77       ┆ 2.0  ┆ Male   ┆ No     ┆ Sun  ┆ Dinner ┆ 2    │
│ 24.55      ┆ 2.0  ┆ Male   ┆ No     ┆ Sun  ┆ Dinner ┆ 4    │
│ 25.89      ┆ 5.16 ┆ Male   ┆ Yes    ┆ Sat  ┆ Dinner ┆ 4    │
│ …          ┆ …    ┆ …      ┆ …      ┆ …    ┆ …      ┆ …    │
│ 28.44      ┆ 2.56 ┆ Male   ┆ Yes    ┆ Thur ┆ Lunch  ┆ 2    │
│ 12.48      ┆ 2.52 ┆ Female ┆ No     ┆ Thur ┆ Lunch  ┆ 2    │
│ 14.78      ┆ 3.23 ┆ Male   ┆ No     ┆ Sun  ┆ Dinner ┆ 2    │
│ 15.38      ┆ 3.0  ┆ Female ┆ Yes    ┆ Fri  ┆ Dinner ┆ 2    │
└────────────┴──────┴────────┴────────┴──────┴────────┴──────┘


In [82]:
grouped = tips_10.group_by("sex")
print(grouped)

<polars.dataframe.group_by.GroupBy object at 0x000002B22A973010>


In [83]:
print(grouped.all())

shape: (2, 7)
┌────────┬───────────────┬───────────────┬───────────────┬───────────┬───────────────┬─────────────┐
│ sex    ┆ total_bill    ┆ tip           ┆ smoker        ┆ day       ┆ time          ┆ size        │
│ ---    ┆ ---           ┆ ---           ┆ ---           ┆ ---       ┆ ---           ┆ ---         │
│ cat    ┆ list[f64]     ┆ list[f64]     ┆ list[cat]     ┆ list[cat] ┆ list[cat]     ┆ list[i64]   │
╞════════╪═══════════════╪═══════════════╪═══════════════╪═══════════╪═══════════════╪═════════════╡
│ Female ┆ [13.0, 12.48, ┆ [2.0, 2.52,   ┆ ["Yes", "No", ┆ ["Thur",  ┆ ["Lunch",     ┆ [2, 2, 2]   │
│        ┆ 15.38]        ┆ 3.0]          ┆ "Yes"]        ┆ "Thur",   ┆ "Lunch",      ┆             │
│        ┆               ┆               ┆               ┆ "Fri"]    ┆ "Dinner"]     ┆             │
│ Male   ┆ [19.82, 8.77, ┆ [3.18, 2.0, … ┆ ["No", "No",  ┆ ["Sat",   ┆ ["Dinner",    ┆ [2, 2, … 2] │
│        ┆ … 14.78]      ┆ 3.23]         ┆ … "No"]       ┆ "Sun", …  ┆ "Dinne

### 8.4.2 複数の変数にかかわるグループ計算

In [84]:
avgs = grouped.mean()
print(avgs)

shape: (2, 7)
┌────────┬────────────┬──────────┬────────┬──────┬──────┬──────────┐
│ sex    ┆ total_bill ┆ tip      ┆ smoker ┆ day  ┆ time ┆ size     │
│ ---    ┆ ---        ┆ ---      ┆ ---    ┆ ---  ┆ ---  ┆ ---      │
│ cat    ┆ f64        ┆ f64      ┆ cat    ┆ cat  ┆ cat  ┆ f64      │
╞════════╪════════════╪══════════╪════════╪══════╪══════╪══════════╡
│ Female ┆ 13.62      ┆ 2.506667 ┆ null   ┆ null ┆ null ┆ 2.0      │
│ Male   ┆ 20.02      ┆ 2.875714 ┆ null   ┆ null ┆ null ┆ 2.571429 │
└────────┴────────────┴──────────┴────────┴──────┴──────┴──────────┘


In [85]:
print(tips_10.columns)

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']


### 8.4.3 グループの抽出

In [86]:
female = grouped.map_groups(lambda group_df: group_df.filter( pl.col("sex") == "Female" ))
print(female)

shape: (3, 7)
┌────────────┬──────┬────────┬────────┬──────┬────────┬──────┐
│ total_bill ┆ tip  ┆ sex    ┆ smoker ┆ day  ┆ time   ┆ size │
│ ---        ┆ ---  ┆ ---    ┆ ---    ┆ ---  ┆ ---    ┆ ---  │
│ f64        ┆ f64  ┆ cat    ┆ cat    ┆ cat  ┆ cat    ┆ i64  │
╞════════════╪══════╪════════╪════════╪══════╪════════╪══════╡
│ 13.0       ┆ 2.0  ┆ Female ┆ Yes    ┆ Thur ┆ Lunch  ┆ 2    │
│ 12.48      ┆ 2.52 ┆ Female ┆ No     ┆ Thur ┆ Lunch  ┆ 2    │
│ 15.38      ┆ 3.0  ┆ Female ┆ Yes    ┆ Fri  ┆ Dinner ┆ 2    │
└────────────┴──────┴────────┴────────┴──────┴────────┴──────┘


### 8.4.4 グループごとの反復処理

In [87]:
for sex_group in grouped:
    print(sex_group)

('Female', shape: (3, 7)
┌────────────┬──────┬────────┬────────┬──────┬────────┬──────┐
│ total_bill ┆ tip  ┆ sex    ┆ smoker ┆ day  ┆ time   ┆ size │
│ ---        ┆ ---  ┆ ---    ┆ ---    ┆ ---  ┆ ---    ┆ ---  │
│ f64        ┆ f64  ┆ cat    ┆ cat    ┆ cat  ┆ cat    ┆ i64  │
╞════════════╪══════╪════════╪════════╪══════╪════════╪══════╡
│ 13.0       ┆ 2.0  ┆ Female ┆ Yes    ┆ Thur ┆ Lunch  ┆ 2    │
│ 12.48      ┆ 2.52 ┆ Female ┆ No     ┆ Thur ┆ Lunch  ┆ 2    │
│ 15.38      ┆ 3.0  ┆ Female ┆ Yes    ┆ Fri  ┆ Dinner ┆ 2    │
└────────────┴──────┴────────┴────────┴──────┴────────┴──────┘)
('Male', shape: (7, 7)
┌────────────┬──────┬──────┬────────┬──────┬────────┬──────┐
│ total_bill ┆ tip  ┆ sex  ┆ smoker ┆ day  ┆ time   ┆ size │
│ ---        ┆ ---  ┆ ---  ┆ ---    ┆ ---  ┆ ---    ┆ ---  │
│ f64        ┆ f64  ┆ cat  ┆ cat    ┆ cat  ┆ cat    ┆ i64  │
╞════════════╪══════╪══════╪════════╪══════╪════════╪══════╡
│ 19.82      ┆ 3.18 ┆ Male ┆ No     ┆ Sat  ┆ Dinner ┆ 2    │
│ 8.77       ┆ 2.0

### 8.4.5 複数の変数によるグループ化

In [88]:
bill_sex_time = tips_10.group_by(["sex", "time"])
group_avg = bill_sex_time.mean()
group_avg

sex,time,total_bill,tip,smoker,day,size
cat,cat,f64,f64,cat,cat,f64
"""Female""","""Dinner""",15.38,3.0,,,2.0
"""Female""","""Lunch""",12.74,2.26,,,2.0
"""Male""","""Dinner""",18.616667,2.928333,,,2.666667
"""Male""","""Lunch""",28.44,2.56,,,2.0


### 8.4.6 結果を平坦化する(reset_index)
polarsにはindexが無いため、省略

## 8.5 マルチインデックスの使い方
現状のpolarsにはzipファイルを開く方法が無いので、省略

In [90]:
#intv_df = pl.read_csv("../data/epi_sim.zip", truncate_ragged_lines = True)
#print(intv_df)

ComputeError: invalid utf-8 sequence in csv