# はじめに

このノートではPythonのpolarsライブラリの使い方をまとめておく。polarsライブラリはRustで書かれており、高速に動作することやメモリ効率が良いことが特徴での大規模データを処理するのに適しているライブラリ。

- [Polars — DataFrames for the new era](https://pola.rs/)

今回はこちらの内容をなぞっているだけです。

- [pandasから移行する人向け polars使用ガイド #Python - Qiita](https://qiita.com/nkay/items/9cfb2776156dc7e054c8)

## ライブラリの読み込み

In [263]:
# load library
import polars as pl
import numpy as np
import datetime

titanic = pl.read_csv('./data/titanic.csv')
diamonds = pl.read_csv('./data/diamonds.csv')

# display 10 rows
pl.Config.set_tbl_rows(10)

df = pl.DataFrame(
    {
        'Integer': [1, 2, 3, 4],
        'Float': np.array([1, 2, 3, 4], dtype=float),
        'Datetime': [datetime.datetime(2024, 4, 1)] * 4,
        'Date': [datetime.datetime(2024, 1, 1)] * 4,
        'String': ['test', 'train', 'test', 'train'],
        'Boolean': [True, True, False, None]
    }
)

## データフレームの視覚的確認

In [264]:
titanic.glimpse()

Rows: 891
Columns: 12
$ PassengerId <i64> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ Survived    <i64> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1
$ Pclass      <i64> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2
$ Name        <str> 'Braund, Mr. Owen Harris', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'Heikkinen, Miss. Laina', 'Futrelle, Mrs. Jacques Heath (Lily May Peel)', 'Allen, Mr. William Henry', 'Moran, Mr. James', 'McCarthy, Mr. Timothy J', 'Palsson, Master. Gosta Leonard', 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)', 'Nasser, Mrs. Nicholas (Adele Achem)'
$ Sex         <str> 'male', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'female', 'female'
$ Age         <f64> 22.0, 38.0, 26.0, 35.0, 35.0, None, 54.0, 2.0, 27.0, 14.0
$ SibSp       <i64> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1
$ Parch       <i64> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0
$ Ticket      <str> 'A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450', '330877', '17463', '349909', '347742', '237736'
$ Fare        <f64> 7.25, 71.2833, 7

In [265]:
titanic.describe()

statistic,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
"""count""",891.0,891.0,891.0,"""891""","""891""",714.0,891.0,891.0,"""891""",891.0,"""204""","""889"""
"""null_count""",0.0,0.0,0.0,"""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2"""
"""mean""",446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
"""std""",257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
"""min""",1.0,0.0,1.0,"""Abbing, Mr. Anthony""","""female""",0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C"""
"""25%""",224.0,0.0,2.0,,,20.0,0.0,0.0,,7.925,,
"""50%""",446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
"""75%""",669.0,1.0,3.0,,,38.0,1.0,0.0,,31.0,,
"""max""",891.0,1.0,3.0,"""van Melkebeke, Mr. Philemon""","""male""",80.0,8.0,6.0,"""WE/P 5735""",512.3292,"""T""","""S"""


## 行の抽出・フィルタリング・条件選択


In [266]:
titanic.filter(pl.col('Sex') == 'male')

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,3,"""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""
7,0,1,"""McCarthy, Mr. Timothy J""","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
8,0,3,"""Palsson, Master. Gosta Leonard""","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…
884,0,2,"""Banfield, Mr. Frederick James""","""male""",28.0,0,0,"""C.A./SOTON 34068""",10.5,,"""S"""
885,0,3,"""Sutehall, Mr. Henry Jr""","""male""",25.0,0,0,"""SOTON/OQ 392076""",7.05,,"""S"""
887,0,2,"""Montvila, Rev. Juozas""","""male""",27.0,0,0,"""211536""",13.0,,"""S"""
890,1,1,"""Behr, Mr. Karl Howell""","""male""",26.0,0,0,"""111369""",30.0,"""C148""","""C"""


In [267]:
titanic.filter(
  (pl.col('Sex') == 'male') & 
  (pl.col('PassengerId') < 20)
  )

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,3,"""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""
7,0,1,"""McCarthy, Mr. Timothy J""","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
8,0,3,"""Palsson, Master. Gosta Leonard""","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
13,0,3,"""Saundercock, Mr. William Henry""","""male""",20.0,0,0,"""A/5. 2151""",8.05,,"""S"""
14,0,3,"""Andersson, Mr. Anders Johan""","""male""",39.0,1,5,"""347082""",31.275,,"""S"""
17,0,3,"""Rice, Master. Eugene""","""male""",2.0,4,1,"""382652""",29.125,,"""Q"""
18,1,2,"""Williams, Mr. Charles Eugene""","""male""",,0,0,"""244373""",13.0,,"""S"""


In [268]:
titanic\
  .filter(pl.col('Sex') == 'male')\
  .filter(pl.col('PassengerId') < 20)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,3,"""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""
7,0,1,"""McCarthy, Mr. Timothy J""","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
8,0,3,"""Palsson, Master. Gosta Leonard""","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
13,0,3,"""Saundercock, Mr. William Henry""","""male""",20.0,0,0,"""A/5. 2151""",8.05,,"""S"""
14,0,3,"""Andersson, Mr. Anders Johan""","""male""",39.0,1,5,"""347082""",31.275,,"""S"""
17,0,3,"""Rice, Master. Eugene""","""male""",2.0,4,1,"""382652""",29.125,,"""Q"""
18,1,2,"""Williams, Mr. Charles Eugene""","""male""",,0,0,"""244373""",13.0,,"""S"""


In [269]:
# titanic.filter(
#   (pl.col('Sex') == 'male') & 
#   (pl.col('PassengerId') < 20)
#   )
# これと同じ
titanic.filter(pl.col('Sex') == 'male', pl.col('PassengerId') < 20)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,3,"""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""
7,0,1,"""McCarthy, Mr. Timothy J""","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
8,0,3,"""Palsson, Master. Gosta Leonard""","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
13,0,3,"""Saundercock, Mr. William Henry""","""male""",20.0,0,0,"""A/5. 2151""",8.05,,"""S"""
14,0,3,"""Andersson, Mr. Anders Johan""","""male""",39.0,1,5,"""347082""",31.275,,"""S"""
17,0,3,"""Rice, Master. Eugene""","""male""",2.0,4,1,"""382652""",29.125,,"""Q"""
18,1,2,"""Williams, Mr. Charles Eugene""","""male""",,0,0,"""244373""",13.0,,"""S"""


In [270]:
df.filter([False, True, False, True])


Integer,Float,Datetime,Date,String,Boolean
i64,f64,datetime[μs],datetime[μs],str,bool
2,2.0,2024-04-01 00:00:00,2024-01-01 00:00:00,"""train""",True
4,4.0,2024-04-01 00:00:00,2024-01-01 00:00:00,"""train""",


In [271]:
titanic.get_column('Embarked').is_in(['Q'])

Embarked
bool
false
false
false
false
false
…
false
false
false
false


In [272]:
titanic.filter(titanic.get_column('PassengerId').is_in([1,3,5,7,9]))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
7,0,1,"""McCarthy, Mr. Timothy J""","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
9,1,3,"""Johnson, Mrs. Oscar W (Elisabe…","""female""",27.0,0,2,"""347742""",11.1333,,"""S"""


## ユニーク行・重複行


In [273]:
titanic.unique('Pclass')

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
10,1,2,"""Nasser, Mrs. Nicholas (Adele A…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [274]:
titanic.select('Pclass', 'Embarked').unique(subset=['Pclass', 'Embarked'], keep='any').sort('Pclass', 'Embarked')

Pclass,Embarked
i64,str
1,
1,"""C"""
1,"""Q"""
1,"""S"""
2,"""C"""
2,"""Q"""
2,"""S"""
3,"""C"""
3,"""Q"""
3,"""S"""


In [275]:
df_dup = pl.DataFrame(
    {
        'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four', 'four'],
        'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x', 'x']
    }
)
df_dup


a,b
str,str
"""one""","""x"""
"""one""","""y"""
"""two""","""x"""
"""two""","""y"""
"""two""","""x"""
"""three""","""x"""
"""four""","""x"""
"""four""","""x"""


In [276]:
df_dup.is_duplicated()

false
False
True
False
True
False
True
True


In [277]:
df_dup.is_unique()

true
True
False
True
False
True
False
False


## 列の選択と追加


In [278]:
# titanic.select('Sex'): dataframe
# series
titanic.get_column('Sex')


Sex
str
"""male"""
"""female"""
"""female"""
"""female"""
"""male"""
…
"""male"""
"""female"""
"""female"""
"""male"""


In [279]:
# titanic.get_column_index('Ticket'): 8
titanic.get_columns()[titanic.get_column_index('Ticket')]

Ticket
str
"""A/5 21171"""
"""PC 17599"""
"""STON/O2. 3101282"""
"""113803"""
"""373450"""
…
"""211536"""
"""112053"""
"""W./C. 6607"""
"""111369"""


In [280]:
# titanic.with_columns(NewPassengerId = titanic.get_column('PassengerId')* 100)
titanic.with_columns(NewPassengerId = pl.col('PassengerId')* 100)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NewPassengerId
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,i64
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",100
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",200
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",300
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S""",400
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S""",500
…,…,…,…,…,…,…,…,…,…,…,…,…
887,0,2,"""Montvila, Rev. Juozas""","""male""",27.0,0,0,"""211536""",13.0,,"""S""",88700
888,1,1,"""Graham, Miss. Margaret Edith""","""female""",19.0,0,0,"""112053""",30.0,"""B42""","""S""",88800
889,0,3,"""Johnston, Miss. Catherine Hele…","""female""",,1,2,"""W./C. 6607""",23.45,,"""S""",88900
890,1,1,"""Behr, Mr. Karl Howell""","""male""",26.0,0,0,"""111369""",30.0,"""C148""","""C""",89000


In [281]:
df.with_row_count()


  df.with_row_count()


row_nr,Integer,Float,Datetime,Date,String,Boolean
u32,i64,f64,datetime[μs],datetime[μs],str,bool
0,1,1.0,2024-04-01 00:00:00,2024-01-01 00:00:00,"""test""",True
1,2,2.0,2024-04-01 00:00:00,2024-01-01 00:00:00,"""train""",True
2,3,3.0,2024-04-01 00:00:00,2024-01-01 00:00:00,"""test""",False
3,4,4.0,2024-04-01 00:00:00,2024-01-01 00:00:00,"""train""",


## 基礎的な操作と演算

In [282]:
titanic.sum()
titanic.mean()
titanic.quantile(0.5)


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,


In [283]:
titanic['Sex'].value_counts()

Sex,count
str,u32
"""female""",314
"""male""",577


In [284]:
# titanic.select('Sex').value_counts(): dataframe
# series
titanic.get_column('Sex').value_counts()

Sex,count
str,u32
"""female""",314
"""male""",577


## 列操作：エクスプレッションの主な使い方


In [285]:
# titanic.select('Pclass', 'Embarked')でもOK
titanic.select(pl.col('Pclass', 'Embarked'))


Pclass,Embarked
i64,str
3,"""S"""
1,"""C"""
3,"""S"""
1,"""S"""
3,"""S"""
…,…
2,"""S"""
1,"""S"""
3,"""S"""
1,"""C"""


In [286]:
titanic.select(pl.col(pl.Utf8))

Name,Sex,Ticket,Cabin,Embarked
str,str,str,str,str
"""Braund, Mr. Owen Harris""","""male""","""A/5 21171""",,"""S"""
"""Cumings, Mrs. John Bradley (Fl…","""female""","""PC 17599""","""C85""","""C"""
"""Heikkinen, Miss. Laina""","""female""","""STON/O2. 3101282""",,"""S"""
"""Futrelle, Mrs. Jacques Heath (…","""female""","""113803""","""C123""","""S"""
"""Allen, Mr. William Henry""","""male""","""373450""",,"""S"""
…,…,…,…,…
"""Montvila, Rev. Juozas""","""male""","""211536""",,"""S"""
"""Graham, Miss. Margaret Edith""","""female""","""112053""","""B42""","""S"""
"""Johnston, Miss. Catherine Hele…","""female""","""W./C. 6607""",,"""S"""
"""Behr, Mr. Karl Howell""","""male""","""111369""","""C148""","""C"""


In [287]:
# titanic.select(pl.col('Name').sort())
titanic.select(pl.col('Name'), pl.col('PassengerId')).sort('Name')


Name,PassengerId
str,i64
"""Abbing, Mr. Anthony""",846
"""Abbott, Mr. Rossmore Edward""",747
"""Abbott, Mrs. Stanton (Rosa Hun…",280
"""Abelson, Mr. Samuel""",309
"""Abelson, Mrs. Samuel (Hannah W…",875
…,…
"""de Mulder, Mr. Theodore""",287
"""de Pelsmaeker, Mr. Alfons""",283
"""del Carlo, Mr. Sebastiano""",362
"""van Billiard, Mr. Austin Blyle…",154


In [288]:
titanic.select('Sex').with_columns(NewSex = pl.when(pl.col('Sex') == 'male').then(pl.lit('M')).otherwise(pl.lit('F')))

Sex,NewSex
str,str
"""male""","""M"""
"""female""","""F"""
"""female""","""F"""
"""female""","""F"""
"""male""","""M"""
…,…
"""male""","""M"""
"""female""","""F"""
"""female""","""F"""
"""male""","""M"""


## 複数のデータフレームの結合・マージ


In [289]:
df1 = pl.DataFrame(
    {'A': ['A0', 'A1'], 'B': ['B0', 'B1'], 'C': ['C0', 'C1'], 'D': ['D0', 'D1']}
)
df2 = pl.DataFrame(
    {'A': ['A2', 'A3'], 'B': ['B2', 'B3'], 'C': ['C2', 'C3'], 'D': ['D2', 'D3']}
)
df3 = pl.DataFrame(
    {'A': ['A4', 'A5'], 'B': ['B4', 'B5'], 'C': ['C4', 'C5'], 'DD': ['D4', 'D5']}
)

In [290]:
pl.concat([df1, df2, df3], how='diagonal')


A,B,C,D,DD
str,str,str,str,str
"""A0""","""B0""","""C0""","""D0""",
"""A1""","""B1""","""C1""","""D1""",
"""A2""","""B2""","""C2""","""D2""",
"""A3""","""B3""","""C3""","""D3""",
"""A4""","""B4""","""C4""",,"""D4"""
"""A5""","""B5""","""C5""",,"""D5"""


In [291]:
df1 = pl.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']})
df2 = pl.DataFrame({'C': ['C0', 'C1'], 'D': ['D0', 'D1']})
pl.concat([df1, df2], how='horizontal')

A,B,C,D
str,str,str,str
"""A0""","""B0""","""C0""","""D0"""
"""A1""","""B1""","""C1""","""D1"""
"""A2""","""B2""",,


In [292]:
left_t = pl.DataFrame(
    {
        'key1': ['K0', 'K0', 'K1', 'K2'],
        'key2': ['K0', 'K1', 'K0', 'K1'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right_t = pl.DataFrame(
    {
        'key1': ['K0', 'K1', 'K1', 'K2'],
        'key2': ['K0', 'K0', 'K0', 'K0'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
left_t.join(right_t, on=['key1', 'key2'])


key1,key2,A,B,C,D
str,str,str,str,str,str
"""K0""","""K0""","""A0""","""B0""","""C0""","""D0"""
"""K1""","""K0""","""A2""","""B2""","""C1""","""D1"""
"""K1""","""K0""","""A2""","""B2""","""C2""","""D2"""


In [293]:
left_t.join(right_t, on=['key1', 'key2'], how='left')

key1,key2,A,B,C,D
str,str,str,str,str,str
"""K0""","""K0""","""A0""","""B0""","""C0""","""D0"""
"""K0""","""K1""","""A1""","""B1""",,
"""K1""","""K0""","""A2""","""B2""","""C1""","""D1"""
"""K1""","""K0""","""A2""","""B2""","""C2""","""D2"""
"""K2""","""K1""","""A3""","""B3""",,


## GroupBy

In [294]:
titanic.group_by('Sex').sum()

Sex,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,i64,i64,i64,str,f64,i64,i64,str,f64,str,str
"""female""",135343,233,678,,7286.0,218,204,,13966.6628,,
"""male""",262043,109,1379,,13919.17,248,136,,14727.2865,,


In [295]:
titanic.group_by('Sex', 'Embarked').sum().sort('Sex')

Sex,Embarked,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin
str,str,i64,i64,i64,str,f64,i64,i64,str,f64,str
"""female""","""Q""",13469,27,104,,291.5,8,8,,454.8585,
"""female""","""S""",89058,140,446,,5165.5,170,160,,7864.4085,
"""female""","""C""",31924,64,126,,1729.0,40,36,,5487.3958,
"""female""",,892,2,2,,100.0,0,0,,160.0,
"""male""","""S""",200438,77,1068,,11147.25,198,106,,9574.9903,
"""male""","""C""",42896,29,191,,2276.92,25,25,,4584.9004,
"""male""","""Q""",18709,3,120,,495.0,25,5,,567.3958,


In [296]:
titanic.group_by('Sex', 'Embarked').len().sort('Sex')

Sex,Embarked,len
str,str,u32
"""female""","""C""",73
"""female""",,2
"""female""","""Q""",36
"""female""","""S""",203
"""male""","""Q""",41
"""male""","""C""",95
"""male""","""S""",441


In [297]:
df = pl.DataFrame(
    {
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': [1,2,3,4,5,6,7,8]
    }
)
df

A,B,C
str,str,i64
"""foo""","""one""",1
"""bar""","""one""",2
"""foo""","""two""",3
"""bar""","""three""",4
"""foo""","""two""",5
"""bar""","""two""",6
"""foo""","""one""",7
"""foo""","""three""",8


In [298]:

df.pivot(
    values='C',
    index='A',
    columns='B',
    aggregate_function='sum'
)

A,one,two,three
str,i64,i64,i64
"""foo""",8,8,8
"""bar""",2,6,4


In [299]:
df.pivot(
    values='C',
    index='A',
    columns='B',
    aggregate_function='sum'
).melt(
  'A',
  ['one', 'two', 'three']
)

A,variable,value
str,str,i64
"""foo""","""one""",8
"""bar""","""one""",2
"""foo""","""two""",8
"""bar""","""two""",6
"""foo""","""three""",8
"""bar""","""three""",4


## 欠損値

In [300]:
df_missing = pl.DataFrame(
    {'A': [1, 2, None, 4, None, None], 'B': [np.nan, 4, 3, 2, np.nan, None]}
)

df_missing

A,B
i64,f64
1.0,
2.0,4.0
,3.0
4.0,2.0
,
,


In [301]:
df_missing.with_columns(
    A_is_null=pl.col('A').is_null(),
    B_is_null=pl.col('B').is_null(),
    B_is_nan=pl.col('B').is_nan(),
)

A,B,A_is_null,B_is_null,B_is_nan
i64,f64,bool,bool,bool
1.0,,False,False,True
2.0,4.0,False,False,False
,3.0,True,False,False
4.0,2.0,False,False,False
,,True,False,True
,,True,True,


In [302]:
df_missing.with_columns(
    A_is_null=pl.col('A').is_null(),
    B_is_null=pl.col('B').is_null(),
    B_is_nan=pl.col('B').is_nan(),
).sum()

A,B,A_is_null,B_is_null,B_is_nan
i64,f64,u32,u32,u32
7,,3,1,2


In [303]:
df_missing.null_count()


A,B
u32,u32
3,1


In [304]:
df_missing.fill_null(9999999).fill_nan(111111111)


A,B
i64,f64
1,111111111.0
2,4.0
9999999,3.0
4,2.0
9999999,111111111.0
9999999,9999999.0


In [305]:
df_missing.fill_nan(pl.col('A'))



A,B
i64,f64
1.0,1.0
2.0,4.0
,3.0
4.0,2.0
,
,


## テキストデータを扱う


In [306]:
df_str = pl.DataFrame(
    {'orig': ['A', 'B', 'AabaAllpeallpe', 'Baca', None, 'CABA', 'dog', 'cat']}
)
df_str

orig
str
"""A"""
"""B"""
"""AabaAllpeallpe"""
"""Baca"""
""
"""CABA"""
"""dog"""
"""cat"""


In [307]:
df_str.with_columns(
    len1 = df_str.get_column('orig').str.len_bytes(),
    len2 = df_str.get_column('orig').str.len_chars(),
    lower = df_str.get_column('orig').str.to_lowercase()
)

orig,len1,len2,lower
str,u32,u32,str
"""A""",1.0,1.0,"""a"""
"""B""",1.0,1.0,"""b"""
"""AabaAllpeallpe""",14.0,14.0,"""aabaallpeallpe"""
"""Baca""",4.0,4.0,"""baca"""
,,,
"""CABA""",4.0,4.0,"""caba"""
"""dog""",3.0,3.0,"""dog"""
"""cat""",3.0,3.0,"""cat"""


In [308]:
df_str.select(pl.col('orig') + '_' + pl.col('orig'))


orig
str
"""A_A"""
"""B_B"""
"""AabaAllpeallpe_AabaAllpeallpe"""
"""Baca_Baca"""
""
"""CABA_CABA"""
"""dog_dog"""
"""cat_cat"""


## 時系列データを扱う


In [309]:
pl.datetime_range(
    datetime.datetime(2024, 4, 1),
    datetime.datetime(2024, 4, 10),
    '1d',
    eager=True, # seriesを返す
)

literal
datetime[μs]
2024-04-01 00:00:00
2024-04-02 00:00:00
2024-04-03 00:00:00
2024-04-04 00:00:00
2024-04-05 00:00:00
2024-04-06 00:00:00
2024-04-07 00:00:00
2024-04-08 00:00:00
2024-04-09 00:00:00
2024-04-10 00:00:00


In [310]:
pl.datetime_range(
    datetime.datetime(2024, 1, 1),
    datetime.datetime(2024, 1, 2),
    datetime.timedelta(days=0, hours=2),
    eager=True,
)

literal
datetime[μs]
2024-01-01 00:00:00
2024-01-01 02:00:00
2024-01-01 04:00:00
2024-01-01 06:00:00
2024-01-01 08:00:00
…
2024-01-01 16:00:00
2024-01-01 18:00:00
2024-01-01 20:00:00
2024-01-01 22:00:00


In [311]:
df_drange = pl.DataFrame(
    {
        'drange_str': [
            '2022-01-01 00:01:00',
            '2022-01-02 12:00:00',
            '2022-02-04 13:03:00',
            '2022-02-05 14:00:04',
            '2023-04-07 15:00:00',
            '2024-02-08 16:00:02',
            '2025-02-10 22:01:00',
        ]
    }
)
df_drange

drange_str
str
"""2022-01-01 00:01:00"""
"""2022-01-02 12:00:00"""
"""2022-02-04 13:03:00"""
"""2022-02-05 14:00:04"""
"""2023-04-07 15:00:00"""
"""2024-02-08 16:00:02"""
"""2025-02-10 22:01:00"""


In [312]:
new_column = pl.col('drange_str').str.strptime(pl.Datetime).alias('drange')
df_drange.with_columns(new_column)


drange_str,drange
str,datetime[μs]
"""2022-01-01 00:01:00""",2022-01-01 00:01:00
"""2022-01-02 12:00:00""",2022-01-02 12:00:00
"""2022-02-04 13:03:00""",2022-02-04 13:03:00
"""2022-02-05 14:00:04""",2022-02-05 14:00:04
"""2023-04-07 15:00:00""",2023-04-07 15:00:00
"""2024-02-08 16:00:02""",2024-02-08 16:00:02
"""2025-02-10 22:01:00""",2025-02-10 22:01:00


In [313]:
df_drange = df_drange.select(new_column)

df_drange.select(
    pl.col('drange'),
    pl.col('drange').dt.year().alias('year'),
    pl.col('drange').dt.month().alias('month'),
    pl.col('drange').dt.day().alias('day'),
    pl.col('drange').dt.hour().alias('hour'),
    pl.col('drange').dt.minute().alias('minute'),
    pl.col('drange').dt.second().alias('second'),
    pl.col('drange').dt.epoch().alias('epoch'),
)

drange,year,month,day,hour,minute,second,epoch
datetime[μs],i32,i8,i8,i8,i8,i8,i64
2022-01-01 00:01:00,2022,1,1,0,1,0,1640995260000000
2022-01-02 12:00:00,2022,1,2,12,0,0,1641124800000000
2022-02-04 13:03:00,2022,2,4,13,3,0,1643979780000000
2022-02-05 14:00:04,2022,2,5,14,0,4,1644069604000000
2023-04-07 15:00:00,2023,4,7,15,0,0,1680879600000000
2024-02-08 16:00:02,2024,2,8,16,0,2,1707408002000000
2025-02-10 22:01:00,2025,2,10,22,1,0,1739224860000000
