# Chapter 1: Getting Started with Python Polars 

### Importing the Polars library

In [2]:
import polars as pl

### The Polars DataFrame

In [3]:
df = pl.DataFrame({
    'nums': [1,2,3,4,5],
    'letters': ['a','b','c','d','e']
})
df.head()

nums,letters
i64,str
1,"""a"""
2,"""b"""
3,"""c"""
4,"""d"""
5,"""e"""


In [48]:
df = pl.read_csv('../data/titanic_dataset.csv')
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [14]:
df.schema

{'PassengerId': Int64,
 'Survived': Int64,
 'Pclass': Int64,
 'Name': Utf8,
 'Sex': Utf8,
 'Age': Float64,
 'SibSp': Int64,
 'Parch': Int64,
 'Ticket': Utf8,
 'Fare': Float64,
 'Cabin': Utf8,
 'Embarked': Utf8}

In [15]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [8]:
df.dtypes

[Int64,
 Int64,
 Int64,
 Utf8,
 Utf8,
 Float64,
 Int64,
 Int64,
 Utf8,
 Float64,
 Utf8,
 Utf8]

In [11]:
df.shape

(891, 12)

In [16]:
df.height

891

In [17]:
df.width

12

In [12]:
df.flags

{'PassengerId': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Survived': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Pclass': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Name': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Sex': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Age': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'SibSp': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Parch': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Ticket': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Fare': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Cabin': {'SORTED_ASC': False, 'SORTED_DESC': False},
 'Embarked': {'SORTED_ASC': False, 'SORTED_DESC': False}}

In [49]:
df = df.sort(['PassengerId'], descending=True)
df['PassengerId']['SORTED_ASC']=True
df.flags

TypeError: cannot use "'SORTED_ASC'" for indexing

### Processing datasets larger than RAM

In [2]:
import polars as pl 

In [10]:
taxi_trips = (
    pl.scan_csv('~/Downloads/Taxi_Trips.csv')
    .collect(streaming=True)
)
taxi_trips.head()

Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,Fare,Tips,Tolls,Extras,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location
str,str,str,str,i64,f64,i64,i64,i64,i64,f64,f64,f64,f64,f64,str,str,f64,f64,str,f64,f64,str
"""2c4c7c96c03236…","""f68fe1183e6d87…","""10/23/2016 10:…","""10/23/2016 10:…",360,1.1,17031320100,17031320100,32,32,6.75,0.0,0.0,1.5,8.25,"""Cash""","""Medallion Leas…",41.884987,-87.620993,"""POINT (-87.620…",41.884987,-87.620993,"""POINT (-87.620…"
"""d3a30f8612e040…","""97ca0053c70788…","""10/23/2016 07:…","""10/23/2016 07:…",300,1.0,17031320100,17031081700,32,8,6.25,0.0,0.0,0.0,6.25,"""Cash""","""Medallion Leas…",41.884987,-87.620993,"""POINT (-87.620…",41.892042,-87.631864,"""POINT (-87.631…"
"""95178b9e52c980…","""f0b96329a7e390…","""10/27/2016 08:…","""10/27/2016 09:…",636,2.23,17031833100,17031081403,28,8,9.75,0.0,0.0,0.0,9.75,"""Cash""","""City Service""",41.879067,-87.657005,"""POINT (-87.657…",41.890922,-87.618868,"""POINT (-87.618…"
"""b221f753f4fce4…","""11b7c42b4edd22…","""10/27/2016 08:…","""10/27/2016 08:…",1000,2.13,17031320600,17031330100,32,33,10.75,0.0,0.0,1.5,12.25,"""Cash""","""Chicago Carria…",41.870607,-87.622173,"""POINT (-87.622…",41.85935,-87.617358,"""POINT (-87.617…"
"""d6d73a306f0e46…","""d49495206048d4…","""10/27/2016 05:…","""10/27/2016 05:…",540,1.5,17031081900,17031833000,8,28,7.5,0.0,0.0,0.0,7.5,"""Cash""","""Taxi Affiliati…",41.897984,-87.641492,"""POINT (-87.641…",41.885281,-87.657233,"""POINT (-87.657…"


In [7]:
%%time
trip_total_by_pay_type = (
    pl.scan_csv('~/Downloads/Taxi_Trips.csv')
    .group_by('Payment Type')
    .agg(pl.col('Trip Total').sum())
    .collect()
)
trip_total_by_pay_type.head()

CPU times: user 57.7 s, sys: 1min 16s, total: 2min 13s
Wall time: 1min 23s


Payment Type,Trip Total
str,f64
"""Unknown""",25932000.0
"""Dispute""",1388400.0
"""Cash""",1460600000.0
"""No Charge""",13089000.0
"""Split""",64668.43
