# Session 7

## Speed up Pandas

In [17]:
import pandas as pd
import numpy as np

### Create dataset

Random data about fictitious cats

In [2]:
def get_data(size = 20_000):
    """Build a DataFrame describing `size` randomly generated cats.

    Columns:
        age: random integer in [0, 30)
        time_laying_down: random integer in [5, 23)
        pct_sleeping: random float in [0, 1)
        favorite_food: one of 'fish', 'poultry', 'cheese', 'mice'
        favorite_toy: one of 'ball', 'mouse', 'stick'
    """
    foods = ['fish', 'poultry', 'cheese', 'mice']
    toys = ['ball', 'mouse', 'stick']
    # Dict values are evaluated top to bottom, so the random draws
    # happen in the same order as the columns appear.
    columns = {
        'age': np.random.randint(0, 30, size),
        'time_laying_down': np.random.randint(5,23, size),
        'pct_sleeping': np.random.rand(size),
        'favorite_food': np.random.choice(foods, size),
        'favorite_toy': np.random.choice(toys, size),
    }
    return pd.DataFrame(columns)

### The problem

Reward calculation:

- If the cat is lying down for more than 20 hours AND sleeping for more than 50% of the time, it gets its favorite toy
- Otherwise, it gets its favorite food
- If it is 20 years or older, it gets its favorite food regardless (this rule takes precedence over the toy rule)

In [3]:
def reward_calc(row):
    """Return the reward for a single cat.

    `row` is any mapping-like object (for example a DataFrame row) with the
    keys 'age', 'time_laying_down', 'pct_sleeping', 'favorite_food' and
    'favorite_toy'.

    A cat younger than 20 that lies down for more than 20 hours and sleeps
    more than 50% of the time gets its favorite toy; every other cat gets
    its favorite food.
    """
    is_senior = row['age'] >= 20
    if not is_senior:
        lies_down_a_lot = row['time_laying_down'] > 20
        sleeps_a_lot = row['pct_sleeping'] > .5
        if lies_down_a_lot & sleeps_a_lot:
            return row['favorite_toy']
    # Seniors and all cats that do not meet both conditions end up here.
    return row['favorite_food']

### Very Slow - Standard Loop

In [4]:
# Build the default-size random cat dataset used by the first timing below
df = get_data()

In [5]:
%%timeit
# Deliberately naive baseline: one label-based row lookup and one
# label-based cell write per row, driven by a plain Python loop.
for ind in range(0, len(df)):
    df.loc[ind, 'reward'] = reward_calc(df.loc[ind,:])

2.06 s ± 15.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Slow - Iterrows

In [6]:
# Fresh random dataset so this timing starts without a 'reward' column
df = get_data()

In [7]:
%%timeit

# iterrows() yields (index, row) pairs, so the per-row lookup of the plain
# loop is avoided; the result is still written back one cell at a time.
for index, row in df.iterrows():
    df.loc[index, 'reward'] = reward_calc(row)

1.11 s ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Medium - Apply

In [8]:
# Fresh random dataset so this timing starts without a 'reward' column
df = get_data()

In [9]:
%%timeit

# axis=1 passes each row to reward_calc; the whole column is assigned in one step
df['reward'] = df.apply(reward_calc, axis=1)

172 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Fast - Pandas Vectorized

In [10]:
# Element-wise boolean mask: True for cats that lie down more than 20 hours
# AND sleep more than 50% of the time (the age rule is not part of this mask)
(df['time_laying_down'] > 20) & (df['pct_sleeping'] > .5)

0        False
1         True
2        False
3        False
4        False
         ...  
19995    False
19996    False
19997    False
19998    False
19999    False
Length: 20000, dtype: bool

In [11]:
# Fresh random dataset so this timing starts without a 'reward' column
df = get_data()

In [12]:
%%timeit

# Default reward is the favorite food; this also covers cats aged 20 or older.
df['reward'] = df['favorite_food']
# Only cats younger than 20 that are both lazy and sleepy get the toy, so the
# result matches reward_calc (which returns the food for age >= 20).
gets_toy = (df['age'] < 20) & (df['time_laying_down'] > 20) & (df['pct_sleeping'] > .5)
df.loc[gets_toy, 'reward'] = df['favorite_toy']

464 µs ± 5.03 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Fastest - Numpy Vectorization

In [13]:
# Fresh random dataset so this timing starts without a 'reward' column
df = get_data()

In [14]:
%%timeit

# Default reward is the favorite food; this also covers cats aged 20 or older.
df['reward'] = df['favorite_food']
# Same rule as reward_calc, but the comparisons run on the raw NumPy arrays
# (.values) instead of on pandas Series. Cats aged 20 or older never get the toy.
gets_toy = (
    (df['age'].values < 20)
    & (df['time_laying_down'].values > 20)
    & (df['pct_sleeping'].values > .5)
)
df.loc[gets_toy, 'reward'] = df['favorite_toy']

343 µs ± 4.27 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Efficient memory use

### Create Data

Random athletes in sports teams

In [15]:
def get_dataset(size = 10_000):
    """Build a DataFrame describing `size` randomly generated athletes.

    Columns:
        position: one of 'left', 'middle', 'right'
        age: random integer in [5, 50)
        team: one of 'red', 'yellow', 'blue', 'green'
        win: 'yes' or 'no'
        prob: random float drawn uniformly between 0 and 1
    """
    df = pd.DataFrame()
    # 'left' was previously misspelled as 'lef'
    df['position'] = np.random.choice(['left', 'middle', 'right'], size)
    df['age'] = np.random.randint(5, 50, size)
    df['team'] = np.random.choice(['red', 'yellow', 'blue', 'green'], size)
    df['win'] = np.random.choice(['yes', 'no'], size)
    df['prob'] = np.random.uniform(0, 1, size)
    return df

In [16]:
# Baseline: two million rows with the dtypes pandas picks by default
df = get_dataset(2_000_000)
# info() lists the dtype of every column and the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 5 columns):
 #   Column    Dtype  
---  ------    -----  
 0   position  object 
 1   age       int64  
 2   team      object 
 3   win       object 
 4   prob      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 76.3+ MB


In [17]:
# Baseline timings: rank age and prob within each (team, position) group
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()

1.01 s ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.12 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### More efficient data types - categorical

In [18]:
df = get_dataset(2_000_000)
# position only takes a handful of distinct strings, so store it as a categorical
df['position'] = df['position'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 5 columns):
 #   Column    Dtype   
---  ------    -----   
 0   position  category
 1   age       int64   
 2   team      object  
 3   win       object  
 4   prob      float64 
dtypes: category(1), float64(1), int64(1), object(2)
memory usage: 62.9+ MB


In [19]:
# Same conversion for team, which also has only a few distinct values
df['team'] = df['team'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 5 columns):
 #   Column    Dtype   
---  ------    -----   
 0   position  category
 1   age       int64   
 2   team      category
 3   win       object  
 4   prob      float64 
dtypes: category(2), float64(1), int64(1), object(1)
memory usage: 49.6+ MB


In [20]:
# Same rankings as before, now grouping on categorical team/position columns
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()

862 ms ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
948 ms ± 46.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### More efficient data types - numerical

- int8 can store integers from -128 to 127
- int16 can store integers from -32,768 to 32,767
- int32 can store integers from -2,147,483,648 to 2,147,483,647
- int64 can store integers from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807

In [21]:
df = get_dataset(2_000_000)
df['position'] = df['position'].astype('category')
df['team'] = df['team'].astype('category')
# get_dataset draws ages in [5, 50), which fits comfortably in int8 (-128 to 127)
df['age'] = df['age'].astype('int8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 5 columns):
 #   Column    Dtype   
---  ------    -----   
 0   position  category
 1   age       int8    
 2   team      category
 3   win       object  
 4   prob      float64 
dtypes: category(2), float64(1), int8(1), object(1)
memory usage: 36.2+ MB


In [22]:
# float32 halves the storage of float64 at the cost of reduced precision
df['prob'] = df['prob'].astype('float32')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 5 columns):
 #   Column    Dtype   
---  ------    -----   
 0   position  category
 1   age       int8    
 2   team      category
 3   win       object  
 4   prob      float32 
dtypes: category(2), float32(1), int8(1), object(1)
memory usage: 28.6+ MB


### More efficient data types - bool

In [23]:
# Replace the 'yes'/'no' strings with real booleans
df['win'] = df['win'].map({'yes':True, 'no':False})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 5 columns):
 #   Column    Dtype   
---  ------    -----   
 0   position  category
 1   age       int8    
 2   team      category
 3   win       bool    
 4   prob      float32 
dtypes: bool(1), category(2), float32(1), int8(1)
memory usage: 15.3 MB


### Speed comparison

In [24]:
def set_dtypes(df):
    """Convert the athlete columns of `df` to compact dtypes.

    The frame is modified in place and also returned. position and team
    become categoricals, age becomes int8, prob becomes float32, and the
    'yes'/'no' strings in win are mapped to True/False.
    """
    compact_dtypes = {
        'position': 'category',
        'team': 'category',
        'age': 'int8',
        'prob': 'float32',
    }
    for column, dtype in compact_dtypes.items():
        df[column] = df[column].astype(dtype)

    yes_no_to_bool = {'yes':True, 'no':False}
    df['win'] = df['win'].map(yes_no_to_bool)
    return df

In [25]:
df = get_dataset(2_000_000)

# Reference timings on the default dtypes, before set_dtypes is applied
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank()
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank()
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank()

1.01 s ± 20.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.19 s ± 94.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.36 s ± 84.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
df = get_dataset(2_000_000)
df = set_dtypes(df)

# Same rankings on the compact dtypes, with the result stored as int32.
# NOTE(review): rank() defaults to method='average', so tied rows get fractional
# ranks (x.5) that .astype('int32') truncates; these timings also include the
# cost of that cast, unlike the timings on the default dtypes.
%timeit df['age_rank'] = df.groupby(['team', 'position'])['age'].rank().astype('int32')
%timeit df['prob_rank'] = df.groupby(['team', 'position'])['prob'].rank().astype('int32')
%timeit df['win_prob_rank'] = df.groupby(['team', 'position', 'win'])['prob'].rank().astype('int32')

875 ms ± 16.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
934 ms ± 35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
955 ms ± 21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
# Final dtypes and memory usage, including the three rank columns added above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 8 columns):
 #   Column         Dtype   
---  ------         -----   
 0   position       category
 1   age            int8    
 2   team           category
 3   win            bool    
 4   prob           float32 
 5   age_rank       int32   
 6   prob_rank      int32   
 7   win_prob_rank  int32   
dtypes: bool(1), category(2), float32(1), int32(3), int8(1)
memory usage: 38.1 MB


# Outside of Pandas

There are many other libraries that perform faster than Pandas - PySpark, Vaex, Modin, Dask and Polars are examples.

## Polars

Looking at an example dataset from Kaggle (it is 9 GB across 4 files - therefore no upload to myCourses): https://www.kaggle.com/datasets/new-york-city/nyc-parking-tickets?resource=download



In [3]:
!pip install polars

Collecting polars
  Obtaining dependency information for polars from https://files.pythonhosted.org/packages/41/6e/38d062077df6d898afcc661420a83f77f0369beb38fc56c9535036d27768/polars-0.20.7-cp38-abi3-macosx_11_0_arm64.whl.metadata
  Downloading polars-0.20.7-cp38-abi3-macosx_11_0_arm64.whl.metadata (15 kB)
Downloading polars-0.20.7-cp38-abi3-macosx_11_0_arm64.whl (25.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: polars
Successfully installed polars-0.20.7


In [5]:
import polars as pl
from pathlib import Path

In [9]:
%%timeit

# Reading the file: scan_csv only sets up a lazy query, collect() then loads the whole CSV
df = pl.scan_csv(Path('data').joinpath(Path('Parking_Violations_Issued_-_Fiscal_Year_2017.csv')), ignore_errors = True).collect()

# NOTE(review): inside a %%timeit cell this bare expression is evaluated but not displayed
df

2.55 s ± 83.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# Scanning before reading: this only builds a lazy query, no rows are loaded yet
temp_df = pl.scan_csv(Path('data').joinpath(Path('Parking_Violations_Issued_-_Fiscal_Year_2017.csv')), ignore_errors = True)

# Filtering for rows registered in New York; collect() executes the query
result_df = temp_df.filter(pl.col(['Registration State'])=="NY").collect()

result_df

Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,Street Code2,Street Code3,Vehicle Expiration Date,Violation Location,Violation Precinct,Issuer Precinct,Issuer Code,Issuer Command,Issuer Squad,Violation Time,Time First Observed,Violation County,Violation In Front Of Or Opposite,House Number,Street Name,Intersecting Street,Date First Observed,Law Section,Sub Division,Violation Legal Code,Days Parking In Effect,From Hours In Effect,To Hours In Effect,Vehicle Color,Unregistered Vehicle?,Vehicle Year,Meter Number,Feet From Curb,Violation Post Code,Violation Description,No Standing or Stopping Violation,Hydrant Violation,Double Parking Violation
i64,str,str,str,str,i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,str,i64,i64,str,i64,str,str,str,str,str
5092469481,"""GZH7067""","""NY""","""PAS""","""07/10/2016""",7,"""SUBN""","""TOYOT""","""V""",0,0,0,0,,0,0,0,,,"""0143A""",,"""BX""",,,"""ALLERTON AVE (…","""BARNES AVE""",0,1111,"""D""","""T""",,,,"""GY""",,2001,,0,,"""FAILURE TO STO…",,,
5092451658,"""GZH7067""","""NY""","""PAS""","""07/08/2016""",7,"""SUBN""","""TOYOT""","""V""",0,0,0,0,,0,0,0,,,"""0400P""",,"""BX""",,,"""ALLERTON AVE (…","""BARNES AVE""",0,1111,"""D""","""T""",,,,"""GY""",,2001,,0,,"""FAILURE TO STO…",,,
4006265037,"""FZX9232""","""NY""","""PAS""","""08/23/2016""",5,"""SUBN""","""FORD""","""V""",0,0,0,0,,0,0,0,,,"""0233P""",,"""BX""",,,"""SB WEBSTER AVE…","""94TH ST""",0,1111,"""C""","""T""",,,,"""BK""",,2004,,0,,"""BUS LANE VIOLA…",,,
8478629828,"""66623ME""","""NY""","""COM""","""06/14/2017""",47,"""REFG""","""MITSU""","""T""",10610,34330,34350,20180630,14,14,14,359594,"""T102""","""J""","""1120A""",,"""NY""","""O""","""330""","""7th Ave""",,0,408,"""l2""",,"""Y""","""0700A""","""0700P""","""WH""",,2007,,0,"""04""","""47-Double PKG-…",,,
7868300310,"""37033JV""","""NY""","""COM""","""11/21/2016""",69,"""DELV""","""INTER""","""T""",10510,34310,34330,20170228,13,13,13,364832,"""T102""","""M""","""0555P""",,"""NY""","""F""","""799""","""6th Ave""",,0,408,"""h1""",,"""Y""","""0700A""","""0700P""","""WHITE""",,2007,,0,"""31 6""","""69-Failure to …",,,
5096917368,"""FZD8593""","""NY""","""PAS""","""06/13/2017""",7,"""SUBN""","""ME/BE""","""V""",0,0,0,0,,0,0,0,,,"""0852P""",,"""QN""",,,"""NORTHERN BLVD …","""@ MARATHON PKW…",0,1111,"""D""","""T""",,,,"""WH""",,2012,,0,,"""FAILURE TO STO…",,,
4627113330,"""HCA5464""","""NY""","""OMS""","""11/21/2016""",36,"""SUBN""","""DODGE""","""V""",0,0,0,0,,0,0,0,,,"""1005A""",,"""QN""",,,"""NB BAISLEY BLV…","""5TH ST""",0,1180,"""B""","""T""",,,,"""BK""",,2016,,0,,"""PHTO SCHOOL ZN…",,,
1407740258,"""2513JMG""","""NY""","""COM""","""01/11/2017""",78,"""DELV""","""FRUEH""","""P""",0,40404,40404,20161130,106,106,106,960979,"""0106""","""0000""","""0015A""",,"""Q""",,"""126""","""ST 115 AVE""",,0,408,"""E2""",,"""BBBBBBB""","""ALL""","""ALL""","""WHITE""",0,2015,"""-""",0,,,,,
8009901763,"""13657MD""","""NY""","""COM""","""09/27/2016""",19,"""DELV""","""KENWO""","""T""",10510,34870,34890,20170331,18,18,18,357355,"""T400""","""A""","""0707A""",,"""NY""","""O""","""1365""","""6th Ave""",,0,408,"""c3""",,"""YYYYYYY""",,,"""RD""",,2013,,0,"""01 9""","""19-No Stand (b…",,,
4625926610,"""N102911C""","""NY""","""OMT""","""10/27/2016""",36,"""VAN""","""FORD""","""V""",0,0,0,0,,0,0,0,,,"""1022A""",,"""QN""",,,"""EB HORACE HARD…","""PRESSWAY @ PEC…",0,1180,"""B""","""T""",,,,"""WH""",,2008,,0,,"""PHTO SCHOOL ZN…",,,


In [16]:
# (rows, columns) of the filtered result
result_df.shape

(8481061, 43)

In [20]:
%%timeit

# For comparison: pandas reading the same CSV file
df = pd.read_csv(Path('data').joinpath(Path('Parking_Violations_Issued_-_Fiscal_Year_2017.csv')))



22.3 s ± 934 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# More complex filtering using a regular expression
# The path must be built from Path objects: str has no joinpath() method.
csv_path = Path('data') / 'Parking_Violations_Issued_-_Fiscal_Year_2017.csv'
temp_df = pl.scan_csv(csv_path, ignore_errors = True)

# Keep the rows whose plate contains a lowercase "a" or the digit "1"
result_df = temp_df.filter(pl.col("Plate ID").str.contains(r"[a1]")).collect()

# Polars DataFrames have no pandas-style .info(); glimpse() prints each
# column with its dtype and its first values.
result_df.glimpse()

With Polars we can choose between lazy and eager execution

In [None]:
%%timeit

# The path must be built from Path objects: str has no joinpath() method.
csv_path = Path('data') / 'Parking_Violations_Issued_-_Fiscal_Year_2017.csv'
temp_df = pl.scan_csv(csv_path, ignore_errors = True)

# Lazy selection of a particular column: nothing is read until collect()
result_df = temp_df.select(['Plate ID']).collect()

result_df

In [None]:
%%timeit

# Eager counterpart of the lazy cell: read_csv loads the file immediately.
# (A LazyFrame returned by scan_csv cannot be indexed with [...].)
csv_path = Path('data') / 'Parking_Violations_Issued_-_Fiscal_Year_2017.csv'
temp_df = pl.read_csv(csv_path, ignore_errors = True)

# Selecting the Plate ID column
result_df = temp_df['Plate ID']

In [None]:
# Adding a column

# The path must be built from Path objects: str has no joinpath() method.
csv_path = Path('data') / 'Parking_Violations_Issued_-_Fiscal_Year_2017.csv'
temp_df = pl.scan_csv(csv_path, ignore_errors = True)

# with_columns() replaces the removed with_column(); str.len_chars() replaces
# the deprecated str.lengths() and counts the characters of each plate.
result_df = temp_df.with_columns(pl.col("Plate ID").str.len_chars().alias("plate_id_letter_count")).collect()

result_df

In [None]:
# Group by
# The path must be built from Path objects: str has no joinpath() method.
csv_path = Path('data') / 'Parking_Violations_Issued_-_Fiscal_Year_2017.csv'
temp_df = pl.scan_csv(csv_path, ignore_errors = True)

# group_by() is the current name of groupby(). Per registration state:
# - pl.len() counts the rows (it replaces the deprecated pl.count())
# - a bare column expression inside agg() collects the group's values into a list
result_df = temp_df.group_by("Registration State").agg(
    [
        pl.len(),
        pl.col("Violation Code"),

    ]
).sort('Registration State').collect()

result_df

In [None]:
# Convert "Issue Date" into a date column,
# then group by Registration State and perform some aggregation

# Scan the file in this cell so it does not depend on a variable from an earlier cell
csv_path = Path('data') / 'Parking_Violations_Issued_-_Fiscal_Year_2017.csv'
temp_df = pl.scan_csv(csv_path, ignore_errors = True)

# with_columns(), the `format` keyword and group_by() are the current names of
# with_column(), `fmt` and groupby().
result_df = (temp_df
             .with_columns(pl
                           .col("Issue Date")
                           .str.strptime(pl.Date, format="%m/%d/%Y"))
             .group_by("Registration State")
             .agg(
                [pl.first("Issue Date")]
                )
             .sort('Registration State'))


# The query is still lazy at this point; collect() executes it
result_df.collect()

# Object-oriented programming in Python



## Classes

A class is a blueprint that helps to define a data-structure together with actions to be performed on this data.

Let's build a class for cats as an example.

In [1]:
class Cat:
    """Empty placeholder class: it defines no attributes or methods yet."""
    pass

To define the attributes all cats must have, use the `__init__()` method. It runs every time an object is created, and it always needs `self` as its first parameter.

In [None]:
class Cat:
    """A cat with a name and an age."""

    def __init__(self, name, age):
        """Store the given name and age on the new instance."""
        self.name = name
        self.age = age
    

The attributes defined in `__init__()` are specific to the individual instance of a cat. You can also define attributes that are the same for all instances.

In [1]:
class Cat:
    """A cat with a name and an age; all cats share the same species."""

    # class attribute
    species = "Felis catus"

    def __init__(self, name, age):
        """Store the given name and age on the new instance."""
        self.name = name
        self.age = age

Let's create some cats:

In [5]:
# Two independent instances, each with its own name and age
a = Cat(name="Kitty", age=5)
b = Cat(name="Mieze", age=13)

print(f"Cat names: {a.name}, {b.name}")

Cat names: Kitty, Mieze


Let's now add some actions our cats can perform.

In [6]:
class Cat:
    """A cat with a name and an age that can describe itself and speak."""

    # class attribute
    species = "Felis catus"

    def __init__(self, name, age):
        """Store the given name and age on the new instance."""
        self.name = name
        self.age = age
    
    def description(self):
        """Return a sentence stating the cat's name and age."""
        return f"{self.name} is {self.age} years old"

    def speak(self, sound):
        """Return a sentence in which the cat says `sound`."""
        return f"{self.name} says {sound}"

In [8]:
a = Cat(name="Pussy", age=5)
b = Cat(name="Mieze", age=13)

# Methods are called on the instance; self is passed automatically
print(a.description())
print(b.speak("Miau"))

Pussy is 5 years old
Mieze says Miau


Tailoring our cat class by using inheritance

In [9]:
class Siamese(Cat):
    """Cat subclass whose speak() has a default sound of "Maaauuuu"."""

    def speak(self, sound="Maaauuuu"):
        """Return a sentence in which the cat says `sound`."""
        return f"{self.name} says {sound}"

class Persian(Cat):
    """Cat subclass whose speak() has a default sound of "Miiauhhhh"."""

    def speak(self, sound="Miiauhhhh"):
        """Return a sentence in which the cat says `sound`."""
        return f"{self.name} says {sound}"

In [10]:
a = Siamese(name="Pussy", age=5)
b = Persian(name="Mieze", age=13)

# description() is inherited from Cat; speak() uses the subclass's default sound
print(a.description())
print(b.speak())

Pussy is 5 years old
Mieze says Miiauhhhh


You can also call the parent class's methods

In [11]:
class BritishShorthair(Cat):
    """Cat subclass whose speak() defaults to "MauMau" and delegates to the parent class."""

    def speak(self, sound="MauMau"):
        """Return the parent class's speak() result for `sound`."""
        return super().speak(sound)

In [13]:
# name and age are passed positionally here
c = BritishShorthair("Lena", 2)
c.speak()

'Lena says MauMau'