In [1]:
import pandas as pd
import numpy as np
import sys

## Foundations for efficiencies

Built-in functions

In [3]:
# Range
# Range from 0-5
print(list(range(6)))

# List of odd numbers from 1 to 11
print([*range(1,12,2)])

[0, 1, 2, 3, 4, 5]
[1, 3, 5, 7, 9, 11]


In [5]:
# Enumerate
names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']
indexed_names = []

# Simple example
for i,name in enumerate(names):
    index_name = (i,name)
    indexed_names.append(index_name) 
print(indexed_names)

# List comprehension
print([(i,name) for i,name in enumerate(names)])

# list starting with one
print([*enumerate(names, 1)])

[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(1, 'Jerry'), (2, 'Kramer'), (3, 'Elaine'), (4, 'George'), (5, 'Newman')]


In [8]:
# Map
names_map  = map(str.upper, names)
print([*names_map])

# Lambda
print(list(map(lambda x: x.lower(), ["A", "B", "C"])))

['JERRY', 'KRAMER', 'ELAINE', 'GEORGE', 'NEWMAN']
['a', 'b', 'c']


In [14]:
# Numpy
nums = [[1, 2, 3, 4, 5],[6, 7, 8, 9, 10]]
nums = np.array(nums)

#Numpy array is faster than python List because of npArray homogenity (Same type)
print(nums.dtype)

#Numpy has special indexing
# Example Print second row of nums
print(nums[:,1])

#Numpy array has boolean indexing
# Example Print all values large then 6
print(nums[nums > 6])

#Numpy allow to do math operations easier
print(nums + 1)

int32
[2 7]
[ 7  8  9 10]
[[ 2  3  4  5  6]
 [ 7  8  9 10 11]]


## Timing and profiling code

Comparer times with timeit

In [21]:
# Compare Unpack with Copr
%timeit nums_list_comp = [num for num in range(51)]

%timeit nums_list_unpack = [*range(51)]

3.92 µs ± 688 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.12 µs ± 57.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [30]:
%%timeit -r5 -n1

l = []
for i in range(51):
    if i % 2 == 0:
        l.append(i)
print(l)

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50]
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50]
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50]
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50]
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50]
The slowest run took 11.78 times longer than the fastest. This could mean that an intermediate result is being cached.
70.4 µs ± 93.2 µs per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [20]:
# Specify runTimes and Loops
heroes = ['A','B','C','D','E','F']
%timeit -r5 -n25 set(heroes)

902 ns ± 98.8 ns per loop (mean ± std. dev. of 5 runs, 25 loops each)


Profiling Using Python line_profiling (Remember to PIP)

In [22]:
%load_ext line_profiler

In [24]:
# Random Function
def convert_units(heroes, heights, weights):

    new_hts = [ht * 0.39370  for ht in heights]
    new_wts = [wt * 2.20462  for wt in weights]

    hero_data = {}

    for i,hero in enumerate(heroes):
        hero_data[hero] = (new_hts[i], new_wts[i])

    return hero_data

heroes = ['A', 'B', 'C']
heights = [1.7, 1.8, 1.9]
weights = [80, 87, 130]

In [25]:
%lprun -f convert_units convert_units(heroes, heights, weights)

Timer unit: 1e-07 s

Total time: 3.14e-05 s
File: <ipython-input-24-36b8cb35486a>
Function: convert_units at line 2

Line #      Hits         Time  Per Hit   % Time  Line Contents
     2                                           def convert_units(heroes, heights, weights):
     3                                           
     4         1        102.0    102.0     32.5      new_hts = [ht * 0.39370  for ht in heights]
     5         1         69.0     69.0     22.0      new_wts = [wt * 2.20462  for wt in weights]
     6                                           
     7         1         11.0     11.0      3.5      hero_data = {}
     8                                           
     9         4         75.0     18.8     23.9      for i,hero in enumerate(heroes):
    10         3         49.0     16.3     15.6          hero_data[hero] = (new_hts[i], new_wts[i])
    11                                           
    12         1          8.0      8.0      2.5      return hero_data

Memory Profiling Python

In [27]:
# DirtyWay
x = [*range(1000)]
sys.getsizeof(x)

9104

Memory Profiling with memory_profiler (Remember to pip)

In [28]:
%load_ext memory_profiler

In [29]:
# Import function from file 
from bmi_lists import calc_bmi_lists

In [None]:
%mprun -f calc_bmi_lists calc_bmi_lists(sample_indices,hts,wts)

## Gaining efficiencies

Zip builting function

In [6]:
# Dataset Pokemons
names = ['Abomasnow', 'Abra', 'Absol', 'Accelgor', 'Aerodactyl', 'Aggron', 'Aipom', 'Alakazam', 'Alomomola', 'Altaria', 'Amaura', 'Ambipom', 'Amoonguss', 'Ampharos', 'Anorith']
primary_types = ['Grass', 'Psychic', 'Dark', 'Bug', 'Rock', 'Steel', 'Normal', 'Psychic', 'Water', 'Dragon', 'Rock', 'Normal', 'Grass', 'Electric', 'Rock']
secondary_types = ['Ice', None, None, None, 'Flying', 'Rock', None, None, None, 'Flying', 'Ice', None, 'Poison', None, 'Bug']

names_types = zip(names, primary_types, secondary_types)
names_types_list = [*names_types]
print(*names_types_list, sep='\n')

('Abomasnow', 'Grass', 'Ice')
('Abra', 'Psychic', None)
('Absol', 'Dark', None)
('Accelgor', 'Bug', None)
('Aerodactyl', 'Rock', 'Flying')
('Aggron', 'Steel', 'Rock')
('Aipom', 'Normal', None)
('Alakazam', 'Psychic', None)
('Alomomola', 'Water', None)
('Altaria', 'Dragon', 'Flying')
('Amaura', 'Rock', 'Ice')
('Ambipom', 'Normal', None)
('Amoonguss', 'Grass', 'Poison')
('Ampharos', 'Electric', None)
('Anorith', 'Rock', 'Bug')


Counter

In [11]:
from collections import Counter
count_types = Counter(primary_types)
print(count_types)

Counter({'Rock': 3, 'Grass': 2, 'Psychic': 2, 'Normal': 2, 'Dark': 1, 'Bug': 1, 'Steel': 1, 'Water': 1, 'Dragon': 1, 'Electric': 1})


Combinations

In [19]:
from itertools import combinations
possible_combinations = [*combinations(names,2)]
print(possible_combinations)

[('Abomasnow', 'Abra'), ('Abomasnow', 'Absol'), ('Abomasnow', 'Accelgor'), ('Abomasnow', 'Aerodactyl'), ('Abomasnow', 'Aggron'), ('Abomasnow', 'Aipom'), ('Abomasnow', 'Alakazam'), ('Abomasnow', 'Alomomola'), ('Abomasnow', 'Altaria'), ('Abomasnow', 'Amaura'), ('Abomasnow', 'Ambipom'), ('Abomasnow', 'Amoonguss'), ('Abomasnow', 'Ampharos'), ('Abomasnow', 'Anorith'), ('Abra', 'Absol'), ('Abra', 'Accelgor'), ('Abra', 'Aerodactyl'), ('Abra', 'Aggron'), ('Abra', 'Aipom'), ('Abra', 'Alakazam'), ('Abra', 'Alomomola'), ('Abra', 'Altaria'), ('Abra', 'Amaura'), ('Abra', 'Ambipom'), ('Abra', 'Amoonguss'), ('Abra', 'Ampharos'), ('Abra', 'Anorith'), ('Absol', 'Accelgor'), ('Absol', 'Aerodactyl'), ('Absol', 'Aggron'), ('Absol', 'Aipom'), ('Absol', 'Alakazam'), ('Absol', 'Alomomola'), ('Absol', 'Altaria'), ('Absol', 'Amaura'), ('Absol', 'Ambipom'), ('Absol', 'Amoonguss'), ('Absol', 'Ampharos'), ('Absol', 'Anorith'), ('Accelgor', 'Aerodactyl'), ('Accelgor', 'Aggron'), ('Accelgor', 'Aipom'), ('Accelgor',

Sets (DataType with distinct instances)

In [22]:
# Dataset
ash_pokedex = ['Pikachu', 'Bulbasaur', 'Koffing', 'Spearow', 'Vulpix', 'Wigglytuff', 'Zubat', 'Rattata', 'Psyduck', 'Squirtle'] 
misty_pokedex = ['Krabby', 'Horsea', 'Slowbro', 'Tentacool', 'Vaporeon', 'Magikarp', 'Poliwag', 'Starmie', 'Psyduck', 'Squirtle']

ash_set = set(ash_pokedex)
misty_set = set(misty_pokedex)

# Find the Pokémon that exist in both sets
both = ash_set.intersection(misty_set)
print(both)

# Find the Pokémon that Ash has and Misty does not have
ash_only = ash_set.difference(misty_set)
print(ash_only)

# Find the Pokémon that are in only one set (not both)
unique_to_set = ash_set.symmetric_difference(misty_set)
print(unique_to_set)

# Find element in set is faster than lists
%timeit 'Pikachu' in misty_set
%timeit 'Pikachu' in misty_pokedex



{'Psyduck', 'Squirtle'}
{'Koffing', 'Pikachu', 'Rattata', 'Vulpix', 'Zubat', 'Bulbasaur', 'Spearow', 'Wigglytuff'}
{'Magikarp', 'Koffing', 'Pikachu', 'Poliwag', 'Rattata', 'Krabby', 'Vulpix', 'Slowbro', 'Vaporeon', 'Zubat', 'Bulbasaur', 'Spearow', 'Tentacool', 'Horsea', 'Wigglytuff', 'Starmie'}
64.1 ns ± 1.61 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
247 ns ± 11.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


## Basic pandas optimizations

In [2]:
# Dataframe 
df = {'Team': {0: 'PIT', 1: 'PIT', 2: 'PIT', 3: 'PIT', 4: 'PIT'},
      'League': {0: 'NL', 1: 'NL', 2: 'NL', 3: 'NL', 4: 'NL'},
      'Year': {0: 2012, 1: 2011, 2: 2010, 3: 2009, 4: 2008},
      'RS': {0: 651, 1: 610, 2: 587, 3: 636, 4: 735},
      'RA': {0: 674, 1: 712, 2: 866, 3: 768, 4: 884},
      'W': {0: 79, 1: 72, 2: 57, 3: 62, 4: 67},
      'G': {0: 162, 1: 162, 2: 162, 3: 161, 4: 162},
      'Playoffs': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}}
df = pd.DataFrame(df)
df.head()

Unnamed: 0,Team,League,Year,RS,RA,W,G,Playoffs
0,PIT,NL,2012,651,674,79,162,0
1,PIT,NL,2011,610,712,72,162,0
2,PIT,NL,2010,587,866,57,162,0
3,PIT,NL,2009,636,768,62,161,0
4,PIT,NL,2008,735,884,67,162,0


Iterrows

In [5]:
for index, row in df.iterrows():
    print(index)
    print(row)
    print(type(row), '\n')

0
Team         PIT
League        NL
Year        2012
RS           651
RA           674
W             79
G            162
Playoffs       0
Name: 0, dtype: object
<class 'pandas.core.series.Series'> 

1
Team         PIT
League        NL
Year        2011
RS           610
RA           712
W             72
G            162
Playoffs       0
Name: 1, dtype: object
<class 'pandas.core.series.Series'> 

2
Team         PIT
League        NL
Year        2010
RS           587
RA           866
W             57
G            162
Playoffs       0
Name: 2, dtype: object
<class 'pandas.core.series.Series'> 

3
Team         PIT
League        NL
Year        2009
RS           636
RA           768
W             62
G            161
Playoffs       0
Name: 3, dtype: object
<class 'pandas.core.series.Series'> 

4
Team         PIT
League        NL
Year        2008
RS           735
RA           884
W             67
G            162
Playoffs       0
Name: 4, dtype: object
<class 'pandas.core.series.Series'> 



In [9]:
for i,row in df.iterrows():
    print(row['Year'])
    print('RS ->',row['RS'])
    print('RA ->',row['RA'], '\n')

2012
RS -> 651
RA -> 674 

2011
RS -> 610
RA -> 712 

2010
RS -> 587
RA -> 866 

2009
RS -> 636
RA -> 768 

2008
RS -> 735
RA -> 884 



.Itertuples (Itertuples is faster than Iterrows)

In [10]:
for row in df.itertuples():
  i = row.Index
  year = row.Year
  wins = row.W
  print(i, year, wins)

0 2012 79
1 2011 72
2 2010 57
3 2009 62
4 2008 67


.Apply (Better then loops)

In [12]:
df2 = {'RS': {2012: 697, 2011: 707, 2010: 802, 2009: 803, 2008: 774},
 'RA': {2012: 577, 2011: 614, 2010: 649, 2009: 754, 2008: 671},
 'W': {2012: 90, 2011: 91, 2010: 96, 2009: 84, 2008: 97},
 'Playoffs': {2012: 0, 2011: 1, 2010: 1, 2009: 0, 2008: 1}}
df2 = pd.DataFrame(df2)

# Sum all cols
stat_totals = df2.apply(sum, axis=0)
print(stat_totals)

RS          3783
RA          3265
W            458
Playoffs       3
dtype: int64


In [16]:
# Custom Function
def text_playoffs(num_playoffs): 
    if num_playoffs == 1:
        return 'Yes'
    else:
        return 'No' 

# Apply
textual_playoffs = df2.apply(lambda row: text_playoffs(row['Playoffs']), axis=1)
print(textual_playoffs)

2012     No
2011    Yes
2010    Yes
2009     No
2008    Yes
dtype: object


In [19]:
# Custom Function
def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2)

# Create a win percentage Series 
win_percs = df.apply(lambda row: calc_win_perc(row['W'], row['G']), axis=1)

# Append a new column to dbacks_df
df["WP"] = win_percs
print(df, '\n')

  Team League  Year   RS   RA   W    G  Playoffs    WP
0  PIT     NL  2012  651  674  79  162         0  0.49
1  PIT     NL  2011  610  712  72  162         0  0.44
2  PIT     NL  2010  587  866  57  162         0  0.35
3  PIT     NL  2009  636  768  62  161         0  0.39
4  PIT     NL  2008  735  884  67  162         0  0.41 



Pandas is built on top of Numpy and NumpyArrays broadcasting is faster then other loop methods

In [20]:
# Faster method with numpyArray broadcasting
win_percs_np = calc_win_perc(df['W'].values, df['G'].values)
df["WP"] = win_percs_np
print(df, '\n')

  Team League  Year   RS   RA   W    G  Playoffs    WP
0  PIT     NL  2012  651  674  79  162         0  0.49
1  PIT     NL  2011  610  712  72  162         0  0.44
2  PIT     NL  2010  587  866  57  162         0  0.35
3  PIT     NL  2009  636  768  62  161         0  0.39
4  PIT     NL  2008  735  884  67  162         0  0.41 



Comparison Itertuples / Apply Lambda / NpArray

In [22]:
win_perc_preds_loop = []

# Custom Function
def predict_win_perc(RS, RA):
    prediction = RS ** 2 / (RS ** 2 + RA ** 2)
    return np.round(prediction, 2)

# Use a loop and .itertuples() to collect each row's predicted win percentage
for row in df.itertuples():
    runs_scored = row.RS
    runs_allowed = row.RA
    win_perc_pred = predict_win_perc(runs_scored, runs_allowed)
    win_perc_preds_loop.append(win_perc_pred)

# Apply predict_win_perc to each row of the DataFrame
win_perc_preds_apply = df.apply(lambda row: predict_win_perc(row['RS'], row['RA']), axis=1)

# Calculate the win percentage predictions using NumPy arrays
win_perc_preds_np = predict_win_perc(df['RS'].values, df['RA'].values)
df['WP_preds'] = win_perc_preds_np
print(df.head())

  Team League  Year   RS   RA   W    G  Playoffs    WP  WP_preds
0  PIT     NL  2012  651  674  79  162         0  0.49      0.48
1  PIT     NL  2011  610  712  72  162         0  0.44      0.42
2  PIT     NL  2010  587  866  57  162         0  0.35      0.31
3  PIT     NL  2009  636  768  62  161         0  0.39      0.41
4  PIT     NL  2008  735  884  67  162         0  0.41      0.41


In [23]:
%%timeit
for row in df.itertuples():
    runs_scored = row.RS
    runs_allowed = row.RA
    win_perc_pred = predict_win_perc(runs_scored, runs_allowed)
    win_perc_preds_loop.append(win_perc_pred)

1.93 ms ± 23.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [24]:
%%timeit
win_perc_preds_apply = df.apply(lambda row: predict_win_perc(row['RS'], row['RA']), axis=1)

1.99 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [25]:
%%timeit
win_perc_preds_np = predict_win_perc(df['RS'].values, df['RA'].values)
df['WP_preds'] = win_perc_preds_np

376 µs ± 58.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
