In [9]:
import pandas as pd
import numpy as np
import datetime
import time
import multiprocessing as mp

In [4]:
x = datetime.datetime(1993, 12, 9, 0, 0)

simulated_dates = []

for i in range(2000000):
    x += datetime.timedelta(minutes=60)
    simulated_dates.append(x)

df = pd.DataFrame(simulated_dates, columns=['Dates'])

print(df.shape)

(2000000, 1)


Unnamed: 0,Dates
0,1993-12-09 01:00:00
1,1993-12-09 02:00:00
2,1993-12-09 03:00:00
3,1993-12-09 04:00:00
4,1993-12-09 05:00:00


In [12]:
# Function to add an hour to a given chunk of data
def add_hour(dates):
    return dates + pd.Timedelta(hours=1)

# Without multiprocessing
start_time = time.time()
df['Dates'] = df['Dates'].apply(add_hour)
end_time = time.time()

print("Without multiprocessing:", end_time - start_time, "seconds")

Without multiprocessing: 15.934876680374146 seconds


In [13]:
# With multiprocessing
def process_chunk(chunk):
    return add_hour(chunk)

start_time = time.time()
num_cores = mp.cpu_count()
chunk_size = len(df) // num_cores
chunks = [df['Dates'][i:i+chunk_size] for i in range(0, len(df), chunk_size)]

with mp.Pool(processes=num_cores) as pool:
    df['Dates'] = pd.concat(pool.map(process_chunk, chunks))
end_time = time.time()

print("With multiprocessing:", end_time - start_time, "seconds")


With multiprocessing: 0.1811535358428955 seconds
