In [1]:
import pandas as pd
import gc

## Read fines.csv

In [2]:
# Load the fines table from the ex04 directory (the path is relative to the working directory).
fines = pd.read_csv('../ex04/fines.csv')

In [3]:
# Show the loaded frame; the rendered output elides the middle rows.
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,1.0,200.0,Ford,Focus,1981
921,T6418M116RUS,1.0,500.0,Ford,Focus,1992
922,E42377152RUS,2.0,4000.0,Ford,Focus,2007
923,C514X938RUS,2.0,1000.0,Ford,Focus,2005


## Iterations

In [4]:
def loop(df):
    """Add a 'Calculated' column equal to Fines / Refund * Year, one row at a time.

    Each row is fetched by position with ``df.iloc``; the per-row results are
    collected into a list and assigned to ``df['Calculated']``. ``df`` is
    modified in place and nothing is returned.
    """
    def metric_at(position):
        # One positional lookup per row, then plain scalar arithmetic.
        record = df.iloc[position]
        return record['Fines'] / record['Refund'] * record['Year']

    df['Calculated'] = [metric_at(position) for position in range(len(df))]


In [5]:
# Benchmark the positional-index loop; every timed call writes 'Calculated' onto `fines`.
%timeit loop(fines)
# drop() is not in-place here: this only displays a copy without 'Calculated',
# while `fines` itself keeps the column.
fines.drop('Calculated', axis=1)

71.5 ms ± 15.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,1.0,200.0,Ford,Focus,1981
921,T6418M116RUS,1.0,500.0,Ford,Focus,1992
922,E42377152RUS,2.0,4000.0,Ford,Focus,2007
923,C514X938RUS,2.0,1000.0,Ford,Focus,2005


In [28]:
def with_iterrows(df):
    """Add a 'Calculated' column equal to Fines / Refund * Year using ``iterrows``.

    Rows are visited with ``DataFrame.iterrows`` (the index label is ignored);
    the per-row results are assigned to ``df['Calculated']``. ``df`` is
    modified in place and nothing is returned.
    """
    df['Calculated'] = [
        record['Fines'] / record['Refund'] * record['Year']
        for _, record in df.iterrows()
    ]

In [41]:
# Benchmark the iterrows() variant; every timed call rewrites fines['Calculated'].
%timeit with_iterrows(fines)
# drop() returns a new frame, so this only displays a copy without 'Calculated';
# `fines` keeps the column.
fines.drop('Calculated', axis=1)

82.4 ms ± 28.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,1.0,200.0,Ford,Focus,1981
921,T6418M116RUS,1.0,500.0,Ford,Focus,1992
922,E42377152RUS,2.0,4000.0,Ford,Focus,2007
923,C514X938RUS,2.0,1000.0,Ford,Focus,2005


In [32]:
def with_apply(df):
    """Return Fines / Refund * Year for every row, computed with ``DataFrame.apply``.

    The function is applied row-wise (``axis=1``). ``df`` itself is not
    modified; the caller decides where to store the returned Series.
    """
    def row_metric(record):
        return record['Fines'] / record['Refund'] * record['Year']

    return df.apply(row_metric, axis=1)

In [42]:
# Benchmark row-wise DataFrame.apply; the timed statement also stores the result in fines['Calculated'].
%timeit fines['Calculated'] = with_apply(fines)
# drop() returns a new frame, so this only displays a copy without 'Calculated';
# `fines` keeps the column.
fines.drop('Calculated', axis=1)

29.4 ms ± 5.31 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,1.0,200.0,Ford,Focus,1981
921,T6418M116RUS,1.0,500.0,Ford,Focus,1992
922,E42377152RUS,2.0,4000.0,Ford,Focus,2007
923,C514X938RUS,2.0,1000.0,Ford,Focus,2005


In [None]:
def with_vectorized(df):
    """Add a 'Calculated' column equal to Fines / Refund * Year using Series arithmetic.

    The whole columns are divided and multiplied at once. ``df`` is modified
    in place and nothing is returned.
    """
    fines_per_refund = df['Fines'] / df['Refund']
    df['Calculated'] = fines_per_refund * df['Year']

In [43]:
# Benchmark the Series-arithmetic variant; every timed call rewrites fines['Calculated'].
%timeit with_vectorized(fines)
# drop() returns a new frame, so this only displays a copy without 'Calculated';
# `fines` keeps the column.
fines.drop('Calculated', axis=1)

1.06 ms ± 308 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,1.0,200.0,Ford,Focus,1981
921,T6418M116RUS,1.0,500.0,Ford,Focus,1992
922,E42377152RUS,2.0,4000.0,Ford,Focus,2007
923,C514X938RUS,2.0,1000.0,Ford,Focus,2005


In [49]:
def with_values(df):
    """Add a 'Calculated' column equal to Fines / Refund * Year using the raw arrays.

    The arithmetic is done on each column's ``.values`` array rather than on
    the Series objects. ``df`` is modified in place and nothing is returned.
    """
    fines_arr = df['Fines'].values
    refund_arr = df['Refund'].values
    year_arr = df['Year'].values
    df['Calculated'] = fines_arr / refund_arr * year_arr

In [50]:
# Benchmark the raw-array (.values) variant; every timed call rewrites fines['Calculated'].
%timeit with_values(fines)
# drop() returns a new frame, so this only displays a copy without 'Calculated';
# `fines` keeps the column.
fines.drop('Calculated', axis=1)

407 μs ± 95.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,1.0,200.0,Ford,Focus,1981
921,T6418M116RUS,1.0,500.0,Ford,Focus,1992
922,E42377152RUS,2.0,4000.0,Ford,Focus,2007
923,C514X938RUS,2.0,1000.0,Ford,Focus,2005


## Indexing

In [7]:
# Baseline lookup: boolean mask built by comparing the 'CarNumber' column to one plate.
%timeit fines[fines['CarNumber'] == 'O136HO197RUS']

458 μs ± 132 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
fines.set_index('CarNumber', inplace=True)

In [12]:
%timeit fines[fines.index == 'O136HO197RUS']

322 μs ± 66.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Downcasting

In [6]:
# Baseline memory footprint; 'deep' asks pandas to introspect object columns
# instead of estimating their size.
fines.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925 entries, 0 to 924
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   925 non-null    object 
 1   Refund      925 non-null    float64
 2   Fines       925 non-null    float64
 3   Make        925 non-null    object 
 4   Model       912 non-null    object 
 5   Year        925 non-null    int64  
 6   Calculated  925 non-null    float64
dtypes: float64(3), int64(1), object(3)
memory usage: 202.7 KB


In [7]:
# Work on a copy so the dtype conversions below are applied to `optimized`, not to `fines`.
optimized = fines.copy()

In [11]:
# Downcast each float64 column with pd.to_numeric(downcast='float'); a column is
# left unchanged when pandas cannot downcast it.
float_cols = optimized.select_dtypes(include='float64').columns
optimized[float_cols] = optimized[float_cols].apply(
    lambda column: pd.to_numeric(column, downcast='float')
)

In [12]:
# Downcast each int64 column with pd.to_numeric(downcast='integer').
int_cols = optimized.select_dtypes(include='int64').columns
optimized[int_cols] = optimized[int_cols].apply(
    lambda column: pd.to_numeric(column, downcast='integer')
)

In [13]:
# Memory footprint after the numeric downcasts, for comparison with the baseline.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925 entries, 0 to 924
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   925 non-null    object 
 1   Refund      925 non-null    float32
 2   Fines       925 non-null    float32
 3   Make        925 non-null    object 
 4   Model       912 non-null    object 
 5   Year        925 non-null    int16  
 6   Calculated  925 non-null    float64
dtypes: float32(2), float64(1), int16(1), object(3)
memory usage: 190.1 KB


## Categories

In [14]:
# Convert every object-dtype column to the category dtype.
# The column list gets its own name so it no longer overwrites `float_cols`,
# which holds the float columns selected in the Downcasting section.
object_cols = optimized.select_dtypes(include='object').columns
optimized[object_cols] = optimized[object_cols].astype('category')

In [15]:
# Memory footprint after converting the object columns to category.
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 925 entries, 0 to 924
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   CarNumber   925 non-null    category
 1   Refund      925 non-null    float32 
 2   Fines       925 non-null    float32 
 3   Make        925 non-null    category
 4   Model       912 non-null    category
 5   Year        925 non-null    int16   
 6   Calculated  925 non-null    float64 
dtypes: category(3), float32(2), float64(1), int16(1)
memory usage: 73.4 KB


## Memory cleanup

In [16]:
# Remove `fines` from the interactive namespace without asking for confirmation (-f).
# The argument is a regular expression, so any other user variable whose name
# matches 'fines' is removed as well.
%reset_selective -f fines

In [17]:
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

0