In [5]:
import polars as pl
import numpy as np
import pandas as pd

In [3]:
num_rows = 5000

rng = np.random.default_rng(seed=7)

building_data = {
    'sqft': rng.exponential(scale=1000, size=num_rows),
    'year': rng.integers(low=1995, high=2023, size=num_rows),
    'building_type': rng.choice(['A', 'B', 'C'], size=num_rows)
}

buildings = pl.DataFrame(building_data)
buildings

sqft,year,building_type
f64,i64,str
707.529256,1996,"""C"""
1025.203348,2020,"""C"""
568.548657,2012,"""A"""
895.109864,2000,"""A"""
206.532754,2011,"""A"""
…,…,…
710.435755,2003,"""C"""
408.872783,2009,"""C"""
57.562059,2019,"""C"""
3728.088949,2020,"""C"""


In [4]:
buildings.schema

Schema([('sqft', Float64), ('year', Int64), ('building_type', String)])

In [5]:
buildings.head()

sqft,year,building_type
f64,i64,str
707.529256,1996,"""C"""
1025.203348,2020,"""C"""
568.548657,2012,"""A"""
895.109864,2000,"""A"""
206.532754,2011,"""A"""


In [6]:
buildings.describe()

statistic,sqft,year,building_type
str,f64,f64,str
"""count""",5000.0,5000.0,"""5000"""
"""null_count""",0.0,0.0,"""0"""
"""mean""",994.094456,2008.5258,
"""std""",1016.641569,8.062353,
"""min""",1.133256,1995.0,"""A"""
"""25%""",286.807549,2001.0,
"""50%""",669.406964,2009.0,
"""75%""",1342.909782,2015.0,
"""max""",9307.793917,2022.0,"""C"""


### Context - Select, Filter and Groupby/Aggregation --> Verb
### Expressions --> noun

In [10]:
buildings.select('sqft').head()

sqft
f64
707.529256
1025.203348
568.548657
895.109864
206.532754


In [9]:
buildings.select(pl.col('sqft')).head()

sqft
f64
707.529256
1025.203348
568.548657
895.109864
206.532754


In [14]:
buildings.select(pl.col('sqft').sort()/1000).head()

sqft
f64
0.001133
0.001152
0.001429
0.001439
0.001505


In [17]:
buildings.select('sqft').sort(by='sqft').head()

sqft
f64
1.133256
1.152109
1.42909
1.438589
1.504648


In [18]:
after_2015 =buildings.filter(pl.col('year')>2015)
after_2015.shape

(1230, 3)

In [19]:
after_2015.select(pl.col('year').min())

year
i64
2016


In [23]:
buildings.group_by("building_type").agg(
      [
          pl.mean("sqft").alias("mean_sqft"),
          pl.median("year").alias("median_year"),
          pl.len(),
      ]
  )

building_type,mean_sqft,median_year,len
str,f64,f64,u32
"""C""",999.854722,2009.0,1692
"""A""",989.539918,2009.0,1653
"""B""",992.754444,2009.0,1655


In [24]:
pl.__version__

'1.22.0'

### LAZY API

In [2]:
num_rows = 5000
rng = np.random.default_rng(seed=7)

buildings = {
      "sqft": rng.exponential(scale=1000, size=num_rows),
      "price": rng.exponential(scale=100_000, size=num_rows),
      "year": rng.integers(low=1995, high=2023, size=num_rows),
      "building_type": rng.choice(["A", "B", "C"], size=num_rows),
   }

buildings_lazy = pl.LazyFrame(buildings)
buildings_lazy

In [6]:
df = pd.DataFrame(buildings)
pldf = pl.LazyFrame(df)

In [9]:
lazy_query = (
    buildings_lazy.with_columns((pl.col('price')/pl.col('sqft')).alias('price_per_sqft'))
    .filter(pl.col('price_per_sqft')>100)
    .filter(pl.col('year')< 2010)
)

lazy_query

In [12]:
buildings_lazy.with_columns((pl.col('price')/pl.col('sqft')).alias('price_per_sqft')).filter(pl.col('price_per_sqft')>100).filter(pl.col('year')< 2010)

In [13]:
lazy_query.collect()

sqft,price,year,building_type,price_per_sqft
f64,f64,i64,str,f64
9.753627,31876.709467,1996,"""B""",3268.19045
575.332756,157836.308297,2000,"""A""",274.339166
541.135894,160706.384529,2004,"""C""",296.979717
312.145612,118260.959791,2002,"""C""",378.86472
1223.566418,155763.172528,1995,"""B""",127.302589
…,…,…,…,…
300.07268,79730.200062,2008,"""B""",265.702962
45.167911,112098.176874,2009,"""B""",2481.810063
99.773084,69661.948942,2008,"""C""",698.203822
197.842124,107045.996211,2005,"""A""",541.067767


In [15]:
lazy_query.collect().select(pl.col(['price_per_sqft', 'year'])).describe()

statistic,price_per_sqft,year
str,f64,f64
"""count""",1317.0,1317.0
"""null_count""",0.0,0.0
"""mean""",1400.622815,2002.003037
"""std""",5755.888716,4.324595
"""min""",100.02061,1995.0
"""25%""",166.351274,1998.0
"""50%""",296.71958,2002.0
"""75%""",744.552161,2006.0
"""max""",90314.966163,2009.0


In [2]:
import requests
import pathlib

def download_file(file_url: str, local_file_path: pathlib.Path) -> None:
    """Download a file and save it with the specified file name."""
    response = requests.get(file_url)
    if response:
        local_file_path.write_bytes(response.content)
        print(f"File successfully downloaded and stored at: {local_file_path}")
    else:
        raise requests.exceptions.RequestException(
            f"Failed to download the file. Status code: {response.status_code}"
        )

In [None]:
url = "https://data.wa.gov/api/views/f6w7-q2d2/rows.csv?accessType=DOWNLOAD"
local_file_path = pathlib.Path("electric_cars.csv")
#download_file(url, local_file_path)

File successfully downloaded and stored at: electric_cars.csv


In [7]:
lazy_car_data = pl.scan_csv('electric_cars.csv')
lazy_car_data

In [8]:
lazy_car_data.schema

  lazy_car_data.schema


Schema([('VIN (1-10)', String),
        ('County', String),
        ('City', String),
        ('State', String),
        ('Postal Code', Int64),
        ('Model Year', Int64),
        ('Make', String),
        ('Model', String),
        ('Electric Vehicle Type', String),
        ('Clean Alternative Fuel Vehicle (CAFV) Eligibility', String),
        ('Electric Range', Int64),
        ('Base MSRP', Int64),
        ('Legislative District', Int64),
        ('DOL Vehicle ID', Int64),
        ('Vehicle Location', String),
        ('Electric Utility', String),
        ('2020 Census Tract', Int64)])

In [9]:
lazy_car_query = (
    lazy_car_data.filter(pl.col('Model Year') >= 2018)
    .filter(pl.col('Electric Vehicle Type') == 'Battery Electric Vehicle (BEV)')
    .group_by(['State', "Make"])
    .agg(
        pl.mean('Electric Range').alias('Average Electric Range'),
        pl.min('Model Year').alias('Olderst Model Year'),
        pl.len().alias('Number of Cars')
    ).filter(pl.col('Average Electric Range') > 0)
    .filter(pl.col('Number of Cars') > 5)
    .sort(pl.col('Number of Cars'), descending=True)
)

In [10]:
lazy_car_query.collect()

State,Make,Average Electric Range,Olderst Model Year,Number of Cars
str,str,f64,i64,u32
"""WA""","""TESLA""",53.335737,2018,90297
"""WA""","""CHEVROLET""",81.730205,2018,9952
"""WA""","""NISSAN""",61.468657,2018,8072
"""WA""","""FORD""",0.087631,2018,7988
"""WA""","""KIA""",31.267383,2018,7162
…,…,…,…,…
"""MD""","""TESLA""",31.625,2018,16
"""TX""","""TESLA""",62.933333,2018,15
"""NC""","""TESLA""",37.0,2018,13
"""FL""","""TESLA""",93.625,2018,8


#### Working with various data sources

In [13]:
import polars as pl
data = pl.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "B": [6, 7, 8, 9, 10],
})

data.write_csv('data.csv')
data.write_ndjson('data.json')
data.write_parquet('data.parquet')

In [18]:
df = pd.read_csv('data.csv')
lf = pl.from_dataframe(df).lazy()

In [25]:
lf.collect_schema()

Schema([('A', Int64), ('B', Int64)])