## Imports

In [1]:
import pandas as pd
import gc

## Read fines.csv

In [2]:
df = pd.read_csv('../data/fines.csv')
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,2015
1,E432XX77RUS,1,6500.0,Toyota,Camry,1987
2,7184TT36RUS,1,2100.0,Ford,Focus,1991
3,X582HE161RUS,2,2000.0,Ford,Focus,2017
4,92918M178RUS,1,5700.0,Ford,Focus,1991
...,...,...,...,...,...,...
925,A123HKRUS,1,5600.0,Ford,Focus,2019
926,E456TYRUS,2,100.0,Toyota,Corolla,1984
927,X789CBRUS,1,560.0,Skoda,Octavia,2012
928,O987PMRUS,2,1200.0,Ford,Focus,1995


## Iterations: 
In all the following subtasks, you need to calculate fines/refund*year for each row and create a new column with the calculated data and measure the time using the magic command %%timeit in the cell

### Loop

In [3]:
def loopper(df):
    res = []
    for i in range(0, len(df)):
        res.append(df.iloc[i].Fines / df.iloc[i].Refund * df.iloc[i].Year)
    df['Processed_data'] = res

In [4]:
%%timeit
loopper(df)

398 ms ± 6.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Processed_data
0,Y163O8161RUS,2,3200.0,Ford,Focus,2015,3.224000e+06
1,E432XX77RUS,1,6500.0,Toyota,Camry,1987,1.291550e+07
2,7184TT36RUS,1,2100.0,Ford,Focus,1991,4.181100e+06
3,X582HE161RUS,2,2000.0,Ford,Focus,2017,2.017000e+06
4,92918M178RUS,1,5700.0,Ford,Focus,1991,1.134870e+07
...,...,...,...,...,...,...,...
925,A123HKRUS,1,5600.0,Ford,Focus,2019,1.130640e+07
926,E456TYRUS,2,100.0,Toyota,Corolla,1984,9.920000e+04
927,X789CBRUS,1,560.0,Skoda,Octavia,2012,1.126720e+06
928,O987PMRUS,2,1200.0,Ford,Focus,1995,1.197000e+06


### Iterrows

In [6]:
def iterrowser(df):
    res = []
    for r in df.iterrows():
        res.append(r[1].Fines / r[1].Refund * r[1].Year)
    df['Processed_data'] = res

In [7]:
%%timeit
iterrowser(df)

122 ms ± 30.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Processed_data
0,Y163O8161RUS,2,3200.0,Ford,Focus,2015,3.224000e+06
1,E432XX77RUS,1,6500.0,Toyota,Camry,1987,1.291550e+07
2,7184TT36RUS,1,2100.0,Ford,Focus,1991,4.181100e+06
3,X582HE161RUS,2,2000.0,Ford,Focus,2017,2.017000e+06
4,92918M178RUS,1,5700.0,Ford,Focus,1991,1.134870e+07
...,...,...,...,...,...,...,...
925,A123HKRUS,1,5600.0,Ford,Focus,2019,1.130640e+07
926,E456TYRUS,2,100.0,Toyota,Corolla,1984,9.920000e+04
927,X789CBRUS,1,560.0,Skoda,Octavia,2012,1.126720e+06
928,O987PMRUS,2,1200.0,Ford,Focus,1995,1.197000e+06


### Apply

In [9]:
def applyer(df):
    df['Processed_data'] = df.apply(lambda r: r.Fines / r.Refund * r.Year, axis='columns')

In [10]:
%%timeit
applyer(df)

46.8 ms ± 15.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Processed_data
0,Y163O8161RUS,2,3200.0,Ford,Focus,2015,3.224000e+06
1,E432XX77RUS,1,6500.0,Toyota,Camry,1987,1.291550e+07
2,7184TT36RUS,1,2100.0,Ford,Focus,1991,4.181100e+06
3,X582HE161RUS,2,2000.0,Ford,Focus,2017,2.017000e+06
4,92918M178RUS,1,5700.0,Ford,Focus,1991,1.134870e+07
...,...,...,...,...,...,...,...
925,A123HKRUS,1,5600.0,Ford,Focus,2019,1.130640e+07
926,E456TYRUS,2,100.0,Toyota,Corolla,1984,9.920000e+04
927,X789CBRUS,1,560.0,Skoda,Octavia,2012,1.126720e+06
928,O987PMRUS,2,1200.0,Ford,Focus,1995,1.197000e+06


### With series

In [12]:
def serieser(df):
    df['Processed_data'] = df.Fines / df.Refund * df.Year

In [13]:
%%timeit
serieser(df)

767 µs ± 135 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Processed_data
0,Y163O8161RUS,2,3200.0,Ford,Focus,2015,3.224000e+06
1,E432XX77RUS,1,6500.0,Toyota,Camry,1987,1.291550e+07
2,7184TT36RUS,1,2100.0,Ford,Focus,1991,4.181100e+06
3,X582HE161RUS,2,2000.0,Ford,Focus,2017,2.017000e+06
4,92918M178RUS,1,5700.0,Ford,Focus,1991,1.134870e+07
...,...,...,...,...,...,...,...
925,A123HKRUS,1,5600.0,Ford,Focus,2019,1.130640e+07
926,E456TYRUS,2,100.0,Toyota,Corolla,1984,9.920000e+04
927,X789CBRUS,1,560.0,Skoda,Octavia,2012,1.126720e+06
928,O987PMRUS,2,1200.0,Ford,Focus,1995,1.197000e+06


### With series and .values

In [15]:
def seriesvaluer(df):
    df['Processed_data'] = df.Fines.values / df.Refund.values * df.Year.values

In [16]:
%%timeit
seriesvaluer(df)

383 µs ± 119 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [17]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Processed_data
0,Y163O8161RUS,2,3200.0,Ford,Focus,2015,3.224000e+06
1,E432XX77RUS,1,6500.0,Toyota,Camry,1987,1.291550e+07
2,7184TT36RUS,1,2100.0,Ford,Focus,1991,4.181100e+06
3,X582HE161RUS,2,2000.0,Ford,Focus,2017,2.017000e+06
4,92918M178RUS,1,5700.0,Ford,Focus,1991,1.134870e+07
...,...,...,...,...,...,...,...
925,A123HKRUS,1,5600.0,Ford,Focus,2019,1.130640e+07
926,E456TYRUS,2,100.0,Toyota,Corolla,1984,9.920000e+04
927,X789CBRUS,1,560.0,Skoda,Octavia,2012,1.126720e+06
928,O987PMRUS,2,1200.0,Ford,Focus,1995,1.197000e+06


## Indexing: 
measure the time using the magic command %%timeit in the cell

get row for 'O136HO197RUS'

In [18]:
%%timeit
df.loc[df['CarNumber'] == 'O136HO197RUS']

544 µs ± 205 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


set index

In [19]:
df.set_index('CarNumber', inplace=True)

get same row again

In [20]:
%%timeit
df.loc[df.index == 'O136HO197RUS']

256 µs ± 12.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Downcasting:

Run df.info(memory_usage=’deep’), pay attention to the Dtype and the memory usage

In [21]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to P654MBRUS
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Refund          930 non-null    int64  
 1   Fines           930 non-null    float64
 2   Make            930 non-null    object 
 3   Model           919 non-null    object 
 4   Year            930 non-null    int64  
 5   Processed_data  930 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 203.7 KB


Make a copy() of your initial dataframe into another dataframe optimized

In [22]:
optimized = df.copy()

Downcast from float64 to float32 for all the columns
Downcast from int64 to the smallest numerical dtype possible

In [23]:
optimized

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Processed_data
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,2015,3.224000e+06
E432XX77RUS,1,6500.0,Toyota,Camry,1987,1.291550e+07
7184TT36RUS,1,2100.0,Ford,Focus,1991,4.181100e+06
X582HE161RUS,2,2000.0,Ford,Focus,2017,2.017000e+06
92918M178RUS,1,5700.0,Ford,Focus,1991,1.134870e+07
...,...,...,...,...,...,...
A123HKRUS,1,5600.0,Ford,Focus,2019,1.130640e+07
E456TYRUS,2,100.0,Toyota,Corolla,1984,9.920000e+04
X789CBRUS,1,560.0,Skoda,Octavia,2012,1.126720e+06
O987PMRUS,2,1200.0,Ford,Focus,1995,1.197000e+06


In [24]:
floats = optimized.select_dtypes('float').columns
floats

Index(['Fines', 'Processed_data'], dtype='object')

In [25]:
optimized[floats] = optimized[floats].apply(pd.to_numeric, downcast='float')

In [26]:
optimized

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Processed_data
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,2015,3.224000e+06
E432XX77RUS,1,6500.0,Toyota,Camry,1987,1.291550e+07
7184TT36RUS,1,2100.0,Ford,Focus,1991,4.181100e+06
X582HE161RUS,2,2000.0,Ford,Focus,2017,2.017000e+06
92918M178RUS,1,5700.0,Ford,Focus,1991,1.134870e+07
...,...,...,...,...,...,...
A123HKRUS,1,5600.0,Ford,Focus,2019,1.130640e+07
E456TYRUS,2,100.0,Toyota,Corolla,1984,9.920000e+04
X789CBRUS,1,560.0,Skoda,Octavia,2012,1.126720e+06
O987PMRUS,2,1200.0,Ford,Focus,1995,1.197000e+06


In [27]:
ints = optimized.select_dtypes('integer').columns
ints

Index(['Refund', 'Year'], dtype='object')

In [28]:
optimized[ints] = optimized[ints].apply(pd.to_numeric, downcast='integer')
optimized

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,Processed_data
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,2015,3.224000e+06
E432XX77RUS,1,6500.0,Toyota,Camry,1987,1.291550e+07
7184TT36RUS,1,2100.0,Ford,Focus,1991,4.181100e+06
X582HE161RUS,2,2000.0,Ford,Focus,2017,2.017000e+06
92918M178RUS,1,5700.0,Ford,Focus,1991,1.134870e+07
...,...,...,...,...,...,...
A123HKRUS,1,5600.0,Ford,Focus,2019,1.130640e+07
E456TYRUS,2,100.0,Toyota,Corolla,1984,9.920000e+04
X789CBRUS,1,560.0,Skoda,Octavia,2012,1.126720e+06
O987PMRUS,2,1200.0,Ford,Focus,1995,1.197000e+06


un info(memory_usage=’deep’) for your new dataframe, pay attention to the
Dtype and the memory usage

In [29]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to P654MBRUS
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Refund          930 non-null    int8   
 1   Fines           930 non-null    float32
 2   Make            930 non-null    object 
 3   Model           919 non-null    object 
 4   Year            930 non-null    int16  
 5   Processed_data  930 non-null    float32
dtypes: float32(2), int16(1), int8(1), object(2)
memory usage: 184.6 KB


## Categories:
Change the object type columns to the type category

In [30]:
objs = optimized.select_dtypes(['object']).columns
objs

Index(['Make', 'Model'], dtype='object')

In [31]:
optimized[objs] = optimized[objs].apply(lambda x: x.astype('category'))

In [32]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to P654MBRUS
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Refund          930 non-null    int8    
 1   Fines           930 non-null    float32 
 2   Make            930 non-null    category
 3   Model           919 non-null    category
 4   Year            930 non-null    int16   
 5   Processed_data  930 non-null    float32 
dtypes: category(2), float32(2), int16(1), int8(1)
memory usage: 75.9 KB


## Memory clean
using %reset_selective and the library gc clean the memory of your initial
dataframe only

In [33]:
gc.is_tracked(df)

True

In [34]:
who_ls

['applyer',
 'df',
 'floats',
 'gc',
 'ints',
 'iterrowser',
 'loopper',
 'objs',
 'optimized',
 'os',
 'pd',
 'serieser',
 'seriesvaluer',
 'sys']

In [35]:
%reset_selective -f df

In [36]:
who_ls

['applyer',
 'floats',
 'gc',
 'ints',
 'iterrowser',
 'loopper',
 'objs',
 'optimized',
 'os',
 'pd',
 'serieser',
 'seriesvaluer',
 'sys']

In [37]:
gc.collect()

0

In [38]:
who_ls

['applyer',
 'floats',
 'gc',
 'ints',
 'iterrowser',
 'loopper',
 'objs',
 'optimized',
 'os',
 'pd',
 'serieser',
 'seriesvaluer',
 'sys']

In [41]:
gc.is_tracked(optimized)

True

In [39]:
gc.get_objects()

[<weakref at 0x7faab2aa1f90; dead>,
 ['SPACE',
  'EXCLAMATION MARK',
  'QUOTATION MARK',
  'NUMBER SIGN',
  'DOLLAR SIGN',
  'PERCENT SIGN',
  'AMPERSAND',
  'APOSTROPHE',
  'LEFT PARENTHESIS',
  'RIGHT PARENTHESIS',
  'ASTERISK',
  'PLUS SIGN',
  'COMMA',
  'HYPHEN-MINUS',
  'FULL STOP',
  'SOLIDUS',
  'DIGIT ZERO',
  'DIGIT ONE',
  'DIGIT TWO',
  'DIGIT THREE',
  'DIGIT FOUR',
  'DIGIT FIVE',
  'DIGIT SIX',
  'DIGIT SEVEN',
  'DIGIT EIGHT',
  'DIGIT NINE',
  'COLON',
  'SEMICOLON',
  'LESS-THAN SIGN',
  'EQUALS SIGN',
  'GREATER-THAN SIGN',
  'QUESTION MARK',
  'COMMERCIAL AT',
  'LATIN CAPITAL LETTER A',
  'LATIN CAPITAL LETTER B',
  'LATIN CAPITAL LETTER C',
  'LATIN CAPITAL LETTER D',
  'LATIN CAPITAL LETTER E',
  'LATIN CAPITAL LETTER F',
  'LATIN CAPITAL LETTER G',
  'LATIN CAPITAL LETTER H',
  'LATIN CAPITAL LETTER I',
  'LATIN CAPITAL LETTER J',
  'LATIN CAPITAL LETTER K',
  'LATIN CAPITAL LETTER L',
  'LATIN CAPITAL LETTER M',
  'LATIN CAPITAL LETTER N',
  'LATIN CAPITAL LETT