In [1]:
import polars as pl
# Locations of the February 2023 yellow-taxi trip data, relative to this
# notebook: the same dataset name in CSV and in Parquet form.
csv_file = "../data/yellow_tripdata_2023-02.csv"
parquet_file = "../data/yellow_tripdata_2023-02.parquet"

In [2]:
# Read the Parquet file into an eager DataFrame; every later cell starts
# from this `df` and never reassigns it.
df = pl.read_parquet(source=parquet_file)
# Show just the first two rows as a quick look at the columns and dtypes.
df.head(n=2)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
i32,datetime[ns],datetime[ns],i64,f64,i64,str,i32,i32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2023-02-01 00:32:53,2023-02-01 00:34:34,2,0.3,1,"""N""",142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,0.0,1,"""N""",71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0


# Operating on multiple columns at the same time

In [3]:
# Naive method: write one expression per currency column by hand, first for
# the USD-labelled copy and then again for the EUR conversion.
eur_per_usd = 0.92  # As of 2024-05-27.
(
    df
    .select(
        # Payment amounts in USD: values unchanged, names gain a "_usd" suffix.
        pl.col("fare_amount").name.suffix("_usd"),
        pl.col("extra").name.suffix("_usd"),
        pl.col("mta_tax").name.suffix("_usd"),
        pl.col("tip_amount").name.suffix("_usd"),
        pl.col("tolls_amount").name.suffix("_usd"),
        pl.col("improvement_surcharge").name.suffix("_usd"),
        pl.col("total_amount").name.suffix("_usd"),
        pl.col("congestion_surcharge").name.suffix("_usd"),
        pl.col("Airport_fee").name.suffix("_usd"),

        # Payment amounts in Euros: each value is multiplied by the exchange
        # rate and the name gains an "_eur" suffix.
        (pl.col("fare_amount") * eur_per_usd).name.suffix("_eur"),
        (pl.col("extra") * eur_per_usd).name.suffix("_eur"),
        (pl.col("mta_tax") * eur_per_usd).name.suffix("_eur"),
        (pl.col("tip_amount") * eur_per_usd).name.suffix("_eur"),
        (pl.col("tolls_amount") * eur_per_usd).name.suffix("_eur"),
        (pl.col("improvement_surcharge") * eur_per_usd).name.suffix("_eur"),
        (pl.col("total_amount") * eur_per_usd).name.suffix("_eur"),
        (pl.col("congestion_surcharge") * eur_per_usd).name.suffix("_eur"),
        (pl.col("Airport_fee") * eur_per_usd).name.suffix("_eur"),
    )
    .head()
)

fare_amount_usd,extra_usd,mta_tax_usd,tip_amount_usd,tolls_amount_usd,improvement_surcharge_usd,total_amount_usd,congestion_surcharge_usd,Airport_fee_usd,fare_amount_eur,extra_eur,mta_tax_eur,tip_amount_eur,tolls_amount_eur,improvement_surcharge_eur,total_amount_eur,congestion_surcharge_eur,Airport_fee_eur
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0,4.048,3.22,0.46,0.0,0.0,0.92,8.648,2.3,0.0
-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0,-2.76,-0.92,-0.46,0.0,0.0,-0.92,-5.06,0.0,0.0
3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0,2.76,0.92,0.46,0.0,0.0,0.92,5.06,0.0,0.0
70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25,65.228,2.07,0.46,0.0,0.0,0.92,68.678,0.0,1.15
17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0,15.64,0.92,0.46,3.036,0.0,0.92,23.276,2.3,0.0


In [4]:
# Polars way of operating on multiple cols: hand pl.col() a list of names and
# the expression is applied to each listed column.
eur_per_usd = 0.92
currency_columns = [
    "fare_amount", "extra", "mta_tax",
    "tip_amount", "tolls_amount", "improvement_surcharge",
    "total_amount", "congestion_surcharge", "Airport_fee",
]
(
    df
    .select(
        # Unchanged values, renamed with a "_usd" suffix.
        pl.col(currency_columns).name.suffix("_usd"),
        # Values scaled by the exchange rate, renamed with an "_eur" suffix.
        pl.col(currency_columns).mul(eur_per_usd).name.suffix("_eur"),
    )
    .head()
)

fare_amount_usd,extra_usd,mta_tax_usd,tip_amount_usd,tolls_amount_usd,improvement_surcharge_usd,total_amount_usd,congestion_surcharge_usd,Airport_fee_usd,fare_amount_eur,extra_eur,mta_tax_eur,tip_amount_eur,tolls_amount_eur,improvement_surcharge_eur,total_amount_eur,congestion_surcharge_eur,Airport_fee_eur
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0,4.048,3.22,0.46,0.0,0.0,0.92,8.648,2.3,0.0
-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0,-2.76,-0.92,-0.46,0.0,0.0,-0.92,-5.06,0.0,0.0
3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0,2.76,0.92,0.46,0.0,0.0,0.92,5.06,0.0,0.0
70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25,65.228,2.07,0.46,0.0,0.0,0.92,68.678,0.0,1.15
17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0,15.64,0.92,0.46,3.036,0.0,0.92,23.276,2.3,0.0


# Selecting columns by data type

In [5]:
# Passing a dtype to pl.col() selects every column of that dtype; here that
# keeps only the Float64 columns.
df.select(pl.col(pl.Float64))

trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.3,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
0.0,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
0.0,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
18.8,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
3.22,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0
…,…,…,…,…,…,…,…,…,…
4.65,20.22,0.0,0.5,4.84,0.0,1.0,29.06,,
2.47,13.66,0.0,0.5,2.65,0.0,1.0,20.31,,
3.49,17.64,0.0,0.5,0.0,0.0,1.0,21.64,,
2.13,13.56,0.0,0.5,2.63,0.0,1.0,20.19,,


In [33]:
# Share of rows in which at least one Float64 column holds a negative value.
#   1. pl.col(pl.Float64).lt(0)  -> one boolean column per Float64 column
#   2. pl.any_horizontal(...)    -> a single boolean per row (any column < 0)
#   3. .mean()                   -> fraction of rows where that boolean is true
# The result is named on the outer expression. A suffix applied to the inner
# per-column expressions would leave the single output column named after the
# first Float64 column, which misdescribes what the number means.
(
    df
    .select(
        pl.any_horizontal(pl.col(pl.Float64).lt(0))
        .mean()
        .alias("negative_float_row_share")
    )
)

trip_distance_mean
f64
0.00878


# Adding new columns with **.with_columns()**
- Using `with_columns()` to add a new column
- Overwriting an existing column with new values
- Adding a new column, named via a keyword argument

In [34]:
# Append a column named "new_col" holding trip_distance multiplied by 100.
# with_columns() returns a new frame, so `df` itself is left as it was.
df.with_columns(
    pl.col("trip_distance").mul(100).alias("new_col"),
)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,new_col
i32,datetime[ns],datetime[ns],i64,f64,i64,str,i32,i32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2023-02-01 00:32:53,2023-02-01 00:34:34,2,0.3,1,"""N""",142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0,30.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,0.0,1,"""N""",71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,0.0,1,"""N""",71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0,0.0
1,2023-02-01 00:29:33,2023-02-01 01:01:38,0,18.8,1,"""N""",132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25,1880.0
2,2023-02-01 00:12:28,2023-02-01 00:25:46,1,3.22,1,"""N""",161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0,322.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2,2023-02-28 23:46:00,2023-03-01 00:05:00,,4.65,,,249,140,0,20.22,0.0,0.5,4.84,0.0,1.0,29.06,,,465.0
2,2023-02-28 23:26:02,2023-02-28 23:37:10,,2.47,,,186,79,0,13.66,0.0,0.5,2.65,0.0,1.0,20.31,,,247.0
2,2023-02-28 23:24:00,2023-02-28 23:38:00,,3.49,,,158,143,0,17.64,0.0,0.5,0.0,0.0,1.0,21.64,,,349.0
2,2023-02-28 23:03:00,2023-02-28 23:10:00,,2.13,,,79,162,0,13.56,0.0,0.5,2.63,0.0,1.0,20.19,,,213.0


In [35]:
# Add a new column whose name comes from the keyword argument: the keyword
# (new_col_1) becomes the column name, so no .alias() call is needed.
df.with_columns(
    new_col_1=pl.col("trip_distance").mul(10),
)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,new_col_1
i32,datetime[ns],datetime[ns],i64,f64,i64,str,i32,i32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2023-02-01 00:32:53,2023-02-01 00:34:34,2,0.3,1,"""N""",142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0,3.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,0.0,1,"""N""",71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,0.0,1,"""N""",71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0,0.0
1,2023-02-01 00:29:33,2023-02-01 01:01:38,0,18.8,1,"""N""",132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25,188.0
2,2023-02-01 00:12:28,2023-02-01 00:25:46,1,3.22,1,"""N""",161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0,32.2
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2,2023-02-28 23:46:00,2023-03-01 00:05:00,,4.65,,,249,140,0,20.22,0.0,0.5,4.84,0.0,1.0,29.06,,,46.5
2,2023-02-28 23:26:02,2023-02-28 23:37:10,,2.47,,,186,79,0,13.66,0.0,0.5,2.65,0.0,1.0,20.31,,,24.7
2,2023-02-28 23:24:00,2023-02-28 23:38:00,,3.49,,,158,143,0,17.64,0.0,0.5,0.0,0.0,1.0,21.64,,,34.9
2,2023-02-28 23:03:00,2023-02-28 23:10:00,,2.13,,,79,162,0,13.56,0.0,0.5,2.63,0.0,1.0,20.19,,,21.3


In [38]:
# Overwrite an existing column: the expression keeps the name
# "passenger_count", so with_columns() replaces that column in the result
# instead of appending a new one.
df.with_columns(
    pl.col("passenger_count").mul(10),
)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
i32,datetime[ns],datetime[ns],i64,f64,i64,str,i32,i32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2023-02-01 00:32:53,2023-02-01 00:34:34,20,0.3,1,"""N""",142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,10,0.0,1,"""N""",71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,10,0.0,1,"""N""",71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
1,2023-02-01 00:29:33,2023-02-01 01:01:38,0,18.8,1,"""N""",132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
2,2023-02-01 00:12:28,2023-02-01 00:25:46,10,3.22,1,"""N""",161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2,2023-02-28 23:46:00,2023-03-01 00:05:00,,4.65,,,249,140,0,20.22,0.0,0.5,4.84,0.0,1.0,29.06,,
2,2023-02-28 23:26:02,2023-02-28 23:37:10,,2.47,,,186,79,0,13.66,0.0,0.5,2.65,0.0,1.0,20.31,,
2,2023-02-28 23:24:00,2023-02-28 23:38:00,,3.49,,,158,143,0,17.64,0.0,0.5,0.0,0.0,1.0,21.64,,
2,2023-02-28 23:03:00,2023-02-28 23:10:00,,2.13,,,79,162,0,13.56,0.0,0.5,2.63,0.0,1.0,20.19,,


# Using **.drop()** to remove columns from a DataFrame

In [42]:
# Print the (rows, columns) shape of the full frame first, for comparison
# with the frame displayed below.
print(df.shape)
# drop() returns a new frame without "trip_distance"; `df` is not modified.
df.drop("trip_distance")

(2913955, 19)


VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
i32,datetime[ns],datetime[ns],i64,i64,str,i32,i32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2023-02-01 00:32:53,2023-02-01 00:34:34,2,1,"""N""",142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,1,"""N""",71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,1,"""N""",71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
1,2023-02-01 00:29:33,2023-02-01 01:01:38,0,1,"""N""",132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
2,2023-02-01 00:12:28,2023-02-01 00:25:46,1,1,"""N""",161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2,2023-02-28 23:46:00,2023-03-01 00:05:00,,,,249,140,0,20.22,0.0,0.5,4.84,0.0,1.0,29.06,,
2,2023-02-28 23:26:02,2023-02-28 23:37:10,,,,186,79,0,13.66,0.0,0.5,2.65,0.0,1.0,20.31,,
2,2023-02-28 23:24:00,2023-02-28 23:38:00,,,,158,143,0,17.64,0.0,0.5,0.0,0.0,1.0,21.64,,
2,2023-02-28 23:03:00,2023-02-28 23:10:00,,,,79,162,0,13.56,0.0,0.5,2.63,0.0,1.0,20.19,,


# Using **.rename()** to rename columns

In [43]:
# Old name -> new name. Only the mixed-case columns are listed, each mapped
# to a snake_case replacement; columns not listed keep their names.
column_rename_mapping = {
    # Identifier columns.
    "VendorID": "vendor_id",
    "RatecodeID": "ratecode_id",
    # Pickup / drop-off location columns.
    "PULocationID": "pu_location_id",
    "DOLocationID": "do_location_id",
    # Fee column with a capitalised first letter.
    "Airport_fee": "airport_fee",
}
# rename() returns a new frame; preview its first rows.
df.rename(column_rename_mapping).head()

vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecode_id,store_and_fwd_flag,pu_location_id,do_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i32,datetime[ns],datetime[ns],i64,f64,i64,str,i32,i32,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,2023-02-01 00:32:53,2023-02-01 00:34:34,2,0.3,1,"""N""",142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,0.0,1,"""N""",71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2023-02-01 00:35:16,2023-02-01 00:35:30,1,0.0,1,"""N""",71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
1,2023-02-01 00:29:33,2023-02-01 01:01:38,0,18.8,1,"""N""",132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
2,2023-02-01 00:12:28,2023-02-01 00:25:46,1,3.22,1,"""N""",161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0
