In [3]:
import pandas as pd  # Import the Pandas library
import numpy as np  # Import the NumPy library
import time  # Import the time module to measure execution time

In [4]:
# Create a large DataFrame with random integers
np.random.seed(0)  # Set seed for reproducibility
data = np.random.randint(1, 100, size=(1000000, 1))  # Generate random data
df = pd.DataFrame(data, columns=['Values'])  # Create a DataFrame

# Define a custom function to apply
def custom_function(x):
    return x * 2 + 3

# Measure the time taken to apply the custom function using apply
start_time = time.time()  # Record the start time
df['Apply_Result'] = df['Values'].apply(custom_function)  # Apply the custom function using apply
time_apply = time.time() - start_time  # Calculate the time taken

# Measure the time taken to apply the custom function using vectorized operations
start_time = time.time()  # Record the start time
df['Vectorized_Result'] = custom_function(df['Values'])  # Apply the custom function using vectorized operations
time_vectorized = time.time() - start_time  # Calculate the time taken

# Print the time taken for both methods
print("Time taken using apply:", time_apply, "seconds")
print("Time taken using vectorized operations:", time_vectorized, "seconds")

Time taken using apply: 0.20052480697631836 seconds
Time taken using vectorized operations: 0.0 seconds


In [6]:
# Create a sample DataFrame
np.random.seed(0)  # Set seed for reproducibility
data = {
    'A': np.random.randint(1, 100, size=100000),
    'B': np.random.randint(1, 100, size=100000)
}
df = pd.DataFrame(data)

# Define the condition
condition = 50

# Filter rows using a for loop
start_time = time.time()  # Record the start time
filtered_rows_loop = []
for index, row in df.iterrows():
    if row['A'] > condition:
        filtered_rows_loop.append(row)
filtered_df_loop = pd.DataFrame(filtered_rows_loop)
time_for_loop = time.time() - start_time  # Calculate the time taken

# Filter rows using boolean indexing
start_time = time.time()  # Record the start time
filtered_df_bool = df[df['A'] > condition]
time_boolean_indexing = time.time() - start_time  # Calculate the time taken

# Print the time taken for both methods
print("Time taken using for loop:", time_for_loop, "seconds")
print("Time taken using boolean indexing:", time_boolean_indexing, "seconds")

Time taken using for loop: 3.429757595062256 seconds
Time taken using boolean indexing: 0.0053768157958984375 seconds
