# Polars Basics
- Reading the CSV with Polars In-Memory Mode
- Reading the CSv with Polars Lazy Mode

In [32]:
import polars as pl

csv_file = './data/yellow_tripdata_2023-02.csv'
parquet_file = './data/yellow_tripdata_2023-02.parquet'

In [34]:
df = pl.read_parquet(parquet_file)
df.write_csv(f'./data/yellow_tripdata_2023-02.csv')
print(pl.read_csv(csv_file))

shape: (2_913_955, 19)
┌──────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬──────────────┬──────────────┬─────────────┐
│ VendorID ┆ tpep_pickup_ ┆ tpep_dropoff ┆ passenger_co ┆ … ┆ improvement_ ┆ total_amount ┆ congestion_s ┆ Airport_fee │
│ ---      ┆ datetime     ┆ _datetime    ┆ unt          ┆   ┆ surcharge    ┆ ---          ┆ urcharge     ┆ ---         │
│ i64      ┆ ---          ┆ ---          ┆ ---          ┆   ┆ ---          ┆ f64          ┆ ---          ┆ f64         │
│          ┆ str          ┆ str          ┆ i64          ┆   ┆ f64          ┆              ┆ f64          ┆             │
╞══════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪══════════════╪══════════════╪═════════════╡
│ 1        ┆ 2023-02-01T0 ┆ 2023-02-01T0 ┆ 2            ┆ … ┆ 1.0          ┆ 9.4          ┆ 2.5          ┆ 0.0         │
│          ┆ 0:32:53.0000 ┆ 0:34:34.0000 ┆              ┆   ┆              ┆              ┆              ┆             │
│        

In [35]:
lf = pl.scan_csv(csv_file, try_parse_dates = True)
lf.head()

lf.collect()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
i64,datetime[μs],datetime[μs],i64,f64,i64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2023-02-01 00:32:53,2023-02-01 00:34:34,2,0.3,1,"""N""",142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,0.0,1,"""N""",71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,0.0,1,"""N""",71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
1,2023-02-01 00:29:33,2023-02-01 01:01:38,0,18.8,1,"""N""",132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
2,2023-02-01 00:12:28,2023-02-01 00:25:46,1,3.22,1,"""N""",161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2,2023-02-28 23:46:00,2023-03-01 00:05:00,,4.65,,,249,140,0,20.22,0.0,0.5,4.84,0.0,1.0,29.06,,
2,2023-02-28 23:26:02,2023-02-28 23:37:10,,2.47,,,186,79,0,13.66,0.0,0.5,2.65,0.0,1.0,20.31,,
2,2023-02-28 23:24:00,2023-02-28 23:38:00,,3.49,,,158,143,0,17.64,0.0,0.5,0.0,0.0,1.0,21.64,,
2,2023-02-28 23:03:00,2023-02-28 23:10:00,,2.13,,,79,162,0,13.56,0.0,0.5,2.63,0.0,1.0,20.19,,


# Selecting Data: In-Memory vs Lazy Mode

In [30]:
# In-memory mode data selecting
(
    df
    .select(["tpep_pickup_datetime", "tpep_dropoff_datetime"])
    .head()
)

tpep_pickup_datetime,tpep_dropoff_datetime
datetime[ns],datetime[ns]
2023-02-01 00:32:53,2023-02-01 00:34:34
2023-02-01 00:35:16,2023-02-01 00:35:30
2023-02-01 00:35:16,2023-02-01 00:35:30
2023-02-01 00:29:33,2023-02-01 01:01:38
2023-02-01 00:12:28,2023-02-01 00:25:46


In [31]:
# data selecting from lazyframe
(
    lf
    .select(["tpep_pickup_datetime", "tpep_dropoff_datetime"])
    .head()
    .collect()
    # .explain(optimized=True)
)

tpep_pickup_datetime,tpep_dropoff_datetime
datetime[μs],datetime[μs]
2023-02-01 00:32:53,2023-02-01 00:34:34
2023-02-01 00:35:16,2023-02-01 00:35:30
2023-02-01 00:35:16,2023-02-01 00:35:30
2023-02-01 00:29:33,2023-02-01 01:01:38
2023-02-01 00:12:28,2023-02-01 00:25:46
