In [1]:
import time
import zlib

import numpy as np
from numpy import random
import pandas as pd

from tqdm import tqdm
from faker import Faker

In [2]:
# Number of company names generated as candidate index labels.
UNIQUE_VALUES = 40_000
# Number of rows in the synthetic DataFrame.
ROW_COUNT = 5_000_000

# Seed every random source used below so that "Restart & Run All" rebuilds
# the same synthetic dataset instead of a different one on each run.
SEED = 42
random.seed(SEED)  # `random` is numpy.random here: drives choice/uniform/randint
Faker.seed(SEED)  # class-level seed for Faker's shared random generator

In [3]:
# Single Faker instance reused for the fake company names and zipcodes below.
faker_gen = Faker()

In [4]:
# Generate the company names that the DataFrame index is sampled from.
# tqdm wraps the sized `range` (not a bare generator), so it knows the total
# and renders a real progress bar instead of a plain iteration counter.
company_names = [
    faker_gen.company()
    for _ in tqdm(range(UNIQUE_VALUES), desc="company names")
]

40000it [00:04, 8383.68it/s]


In [5]:
# Draw one company name per row; the draws are random, so the resulting
# index is unsorted and (with ROW_COUNT > UNIQUE_VALUES) non-unique.
index_values = random.choice(company_names, size=ROW_COUNT)

In [6]:
# One fake zipcode (a string) per row.
# tqdm wraps the sized `range` (not a bare generator), so it knows the total
# and renders a real progress bar instead of a plain iteration counter.
zipcodes = [
    faker_gen.zipcode()
    for _ in tqdm(range(ROW_COUNT), desc="zipcodes")
]

5000000it [00:19, 262369.96it/s]


In [7]:
# One uniform draw per row from [-5, 20), rounded to a single decimal place.
headcount_changes = np.around(
    random.uniform(low=-5, high=20, size=ROW_COUNT),
    decimals=1,
)

In [8]:
# One random integer per row, drawn from [0, 2**63).
# The dtype is pinned to int64 because the platform default integer can be
# 32-bit (e.g. Windows with NumPy < 2), where a bound of 2**63 is rejected.
storage_sizes = random.randint(2**63, size=ROW_COUNT, dtype=np.int64)

In [9]:
# Assemble the demo frame: company names form the (unsorted, non-unique)
# index, and `sort_id` records each row's original position.
df = pd.DataFrame(
    {
        "zipcode": zipcodes,
        "storage_size": storage_sizes,
        "headcount_change": headcount_changes,
        "sort_id": list(range(ROW_COUNT)),
    },
    index=index_values,
)

In [10]:
# Inspect the inferred column dtypes (zipcodes are strings, hence `object`).
df.dtypes

zipcode              object
storage_size          int64
headcount_change    float64
sort_id               int64
dtype: object

In [11]:
# Preview the frame; the index holds the randomly drawn company names.
df

Unnamed: 0,zipcode,storage_size,headcount_change,sort_id
"Miller, Massey and Dominguez",98642,1309403095887968886,7.6,0
Jackson-Miller,30848,7527013096493984921,5.3,1
Joyce Ltd,72583,8724801935611110034,16.1,2
"Benson, Vargas and Robinson",84575,895200032852436667,1.5,3
King-Martinez,23547,1025412557692161869,-4.1,4
...,...,...,...,...
Calderon Group,67400,7835946918283624313,16.5,4999995
"Saunders, Nguyen and Kelley",88077,1651052092089589948,1.6,4999996
Williams-Espinoza,46322,1567068918233516510,3.6,4999997
Thomas and Sons,01607,5521337671676790891,5.2,4999998


In [12]:
# Pick an arbitrary row position and take its index label (a company name)
# to use as the lookup key in the timing cells below.
RANDOM_ILOC = 4907
RANDOM_IDX = df.index[RANDOM_ILOC]
print(f"The random index is: `{RANDOM_IDX}`")

The random index is: `Mccoy LLC`


In [13]:
# Baseline: label lookup on the frame whose index is unsorted.
%timeit df.loc[RANDOM_IDX]

68 ms ± 2.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Sort rows by index label, using `sort_id` (the original row position) as
# the tie-breaker so rows sharing a label keep their original relative order.
# `reset_index()` exposes the unnamed index as a column called "index".
%time sorted_df = df.reset_index().sort_values(["index", "sort_id"]).set_index("index")  # Stable sort

CPU times: user 6.62 s, sys: 741 ms, total: 7.36 s
Wall time: 7.4 s


In [15]:
# Same label lookup, now against the frame sorted by index label.
%timeit sorted_df.loc[RANDOM_IDX]

229 µs ± 64.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
def custom_func(row) -> int:
    """Return the CRC-32 checksum of a row's zipcode followed by its storage size.

    Args:
        row: Row-like mapping (e.g. a ``pd.Series``) providing a string under
            ``"zipcode"`` and a value under ``"storage_size"``.

    Returns:
        CRC-32 of the ASCII-encoded zipcode immediately followed by the
        ASCII-encoded ``str()`` of the storage size.
    """
    zipcode_bytes = row["zipcode"].encode("ascii")
    size_bytes = str(row["storage_size"]).encode("ascii")
    return zlib.crc32(zipcode_bytes + size_bytes)


def ops_per_slice(mini_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``mini_df`` extended with ``hash`` and ``prev_change``.

    The input frame is left untouched: the columns are attached with
    ``DataFrame.assign``, which returns a new object, so a slice handed in by
    the caller is never modified behind its back.

    Args:
        mini_df: Frame with ``zipcode``, ``storage_size`` and
            ``headcount_change`` columns.

    Returns:
        A new frame with all original columns plus:
        ``hash`` - ``custom_func`` applied to each row;
        ``prev_change`` - ``headcount_change`` of the previous row
        (missing for the first row).
    """
    return mini_df.assign(
        hash=mini_df.apply(custom_func, axis=1),
        prev_change=mini_df["headcount_change"].shift(),
    )

In [17]:
def operations_on_df(input_df: pd.DataFrame, index_values, percent: int = 1) -> pd.DataFrame:
    """Run ``ops_per_slice`` label by label over a leading share of ``index_values``.

    Args:
        input_df: Frame whose index contains the labels in ``index_values``.
        index_values: Sized, sliceable collection of index labels.
        percent: Share of ``index_values`` (taken from the start) to process;
            must satisfy ``0 < percent <= 100``.

    Returns:
        The per-label results concatenated in processing order.
    """
    assert percent > 0
    assert percent <= 100

    # Only a leading fraction of the labels is processed to keep the demo short.
    cutoff = int(percent * len(index_values) / 100)
    selected_labels = index_values[:cutoff]

    # The list form `.loc[[label]]` yields a DataFrame even for a single match.
    per_label_frames = [
        ops_per_slice(input_df.loc[[label]])
        for label in tqdm(selected_labels)
    ]
    return pd.concat(per_label_frames)

In [18]:
# Distinct company names present in the index, in order of first appearance.
unique_index_values = df.index.unique()

In [19]:
# Time the per-label pipeline on the unsorted frame (default: first 1% of labels).
%time operations_on_df(df, unique_index_values)

100%|██████████| 303/303 [00:31<00:00,  9.56it/s]


CPU times: user 31.3 s, sys: 453 ms, total: 31.8 s
Wall time: 31.9 s


Unnamed: 0,zipcode,storage_size,headcount_change,sort_id,hash,prev_change
"Miller, Massey and Dominguez",98642,1309403095887968886,7.6,0,1269636874,
"Miller, Massey and Dominguez",63775,9045136122858270551,15.7,165408,2957381248,7.6
"Miller, Massey and Dominguez",71754,2309013742917052620,14.1,174504,1229437809,15.7
"Miller, Massey and Dominguez",66870,7227176668219853390,-4.0,235664,105493545,14.1
"Miller, Massey and Dominguez",96288,8854011211758293512,10.7,243327,1060640964,-4.0
...,...,...,...,...,...,...
Simmons Inc,81968,7971629664862632961,15.2,4969760,318594056,-3.5
Simmons Inc,77207,8135773933147396136,19.6,4978862,3354772453,15.2
Simmons Inc,24521,1005463141032933360,18.4,4985335,2648020905,19.6
Simmons Inc,83026,1874800644985251847,18.0,4991952,2593587407,18.4


In [20]:
# Same pipeline and labels, now against the frame sorted by index label.
%time operations_on_df(sorted_df, unique_index_values)

100%|██████████| 303/303 [00:03<00:00, 82.14it/s] 


CPU times: user 3.93 s, sys: 76.5 ms, total: 4 s
Wall time: 4 s


Unnamed: 0_level_0,zipcode,storage_size,headcount_change,sort_id,hash,prev_change
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Miller, Massey and Dominguez",98642,1309403095887968886,7.6,0,1269636874,
"Miller, Massey and Dominguez",63775,9045136122858270551,15.7,165408,2957381248,7.6
"Miller, Massey and Dominguez",71754,2309013742917052620,14.1,174504,1229437809,15.7
"Miller, Massey and Dominguez",66870,7227176668219853390,-4.0,235664,105493545,14.1
"Miller, Massey and Dominguez",96288,8854011211758293512,10.7,243327,1060640964,-4.0
...,...,...,...,...,...,...
Simmons Inc,81968,7971629664862632961,15.2,4969760,318594056,-3.5
Simmons Inc,77207,8135773933147396136,19.6,4978862,3354772453,15.2
Simmons Inc,24521,1005463141032933360,18.4,4985335,2648020905,19.6
Simmons Inc,83026,1874800644985251847,18.0,4991952,2593587407,18.4
