In [2]:
import polars as pl
import numpy as np

In [3]:
# Synthetic building inventory; the fixed seed keeps the generated values reproducible.
num_rows = 5_000
rng = np.random.default_rng(7)

# The columns are drawn in the order sqft -> year -> building_type, which fixes
# how the generator's random stream is consumed.
building_data = dict(
    sqft=rng.exponential(1000, num_rows),
    year=rng.integers(1995, 2023, num_rows),  # upper bound is exclusive
    building_type=rng.choice(["A", "B", "C"], num_rows),
)

buildings = pl.DataFrame(building_data)
buildings

sqft,year,building_type
f64,i64,str
707.529256,1996,"""C"""
1025.203348,2020,"""C"""
568.548657,2012,"""A"""
895.109864,2000,"""A"""
206.532754,2011,"""A"""
…,…,…
710.435755,2003,"""C"""
408.872783,2009,"""C"""
57.562059,2019,"""C"""
3728.088949,2020,"""C"""


In [4]:
# Inspect the column names and the Polars dtype assigned to each one.
buildings.schema

Schema([('sqft', Float64), ('year', Int64), ('building_type', String)])

In [5]:
# Preview the first five rows (five is also the default).
buildings.head(5)

sqft,year,building_type
f64,i64,str
707.529256,1996,"""C"""
1025.203348,2020,"""C"""
568.548657,2012,"""A"""
895.109864,2000,"""A"""
206.532754,2011,"""A"""


In [6]:
# Summary statistics (count, nulls, mean, std, quantiles, min/max) per column.
buildings.describe()

statistic,sqft,year,building_type
str,f64,f64,str
"""count""",5000.0,5000.0,"""5000"""
"""null_count""",0.0,0.0,"""0"""
"""mean""",994.094456,2008.5258,
"""std""",1016.641569,8.062353,
"""min""",1.133256,1995.0,"""A"""
"""25%""",286.807549,2001.0,
"""50%""",669.406964,2009.0,
"""75%""",1342.909782,2015.0,
"""max""",9307.793917,2022.0,"""C"""


### Contexts (`select`, `filter`, `group_by`/`agg`) → the verbs
### Expressions → the nouns

In [10]:
# Select context, column referenced by its name as a plain string.
(
    buildings
    .select("sqft")
    .head()
)

sqft
f64
707.529256
1025.203348
568.548657
895.109864
206.532754


In [9]:
# Same selection, this time with an explicit column expression.
sqft_expr = pl.col("sqft")
buildings.select(sqft_expr).head()

sqft
f64
707.529256
1025.203348
568.548657
895.109864
206.532754


In [14]:
# Expressions compose: sort the column, then divide every value by 1000,
# all inside a single select context.
(
    buildings
    .select(pl.col("sqft").sort() / 1000)
    .head()
)

sqft
f64
0.001133
0.001152
0.001429
0.001439
0.001505


In [17]:
# Alternative: select the column first, then sort the resulting frame.
(
    buildings
    .select("sqft")
    .sort("sqft")
    .head()
)

sqft
f64
1.133256
1.152109
1.42909
1.438589
1.504648


In [18]:
# Filter context: keep only rows whose year is strictly greater than 2015.
built_after_2015 = pl.col("year") > 2015
after_2015 = buildings.filter(built_after_2015)
after_2015.shape

(1230, 3)

In [19]:
# Sanity check on the filter: the smallest remaining year should be 2016.
earliest_year = pl.col("year").min()
after_2015.select(earliest_year)

year
i64
2016


In [23]:
# Per-building-type summary: mean of `sqft`, median of `year`, and row count.
# maintain_order=True returns the groups in order of first appearance in
# `buildings`; without it Polars does not guarantee any group order, so the
# displayed table could come out shuffled from one run to the next.
buildings.group_by("building_type", maintain_order=True).agg(
    pl.mean("sqft").alias("mean_sqft"),
    pl.median("year").alias("median_year"),
    pl.len(),
)

building_type,mean_sqft,median_year,len
str,f64,f64,u32
"""C""",999.854722,2009.0,1692
"""A""",989.539918,2009.0,1653
"""B""",992.754444,2009.0,1655


In [24]:
# Record the Polars version this notebook was executed with.
pl.__version__

'1.22.0'

### LAZY API