In [1]:
import pandas as pd
import numpy as np
import time

In [4]:
# Function to create a list of DataFrames
def create_dataframes(num_dfs, num_rows, num_cols):
    return [pd.DataFrame(np.random.randn(num_rows, num_cols)) for _ in range(num_dfs)]

# Number of DataFrames, rows, and columns
num_dfs = 100
num_rows = 1000
num_cols = 10

# Create DataFrames
dfs = create_dataframes(num_dfs, num_rows, num_cols)

# Measure time for pd.concat method
start_time = time.time()
result_concat = pd.concat(dfs, axis=0)
end_time = time.time()
concat_time = end_time - start_time

# Measure time for for-loop method
start_time = time.time()
result_for_loop = dfs[0]
for df in dfs[1:]:
    result_for_loop = pd.concat([result_for_loop, df], axis=0)
end_time = time.time()
for_loop_time = end_time - start_time

# Print the time taken for each method
print(f"Time taken using pd.concat: {concat_time:.6f} seconds")
print(f"Time taken using for loop: {for_loop_time:.6f} seconds")

Time taken using pd.concat: 0.013920 seconds
Time taken using for loop: 0.216248 seconds


In [5]:
# Create a sample DataFrame
num_rows = 1000000
df = pd.DataFrame({
    'A': np.random.randint(0, 100, size=num_rows),
    'B': np.random.randn(num_rows),
    'C': np.random.rand(num_rows)
})

# Define the condition
condition = 'A > 50 and B < 0'

# Measure time for query method
start_time = time.time()
result_query = df.query(condition)
end_time = time.time()
query_time = end_time - start_time

# Measure time for boolean indexing
start_time = time.time()
result_boolean_indexing = df[(df['A'] > 50) & (df['B'] < 0)]
end_time = time.time()
boolean_indexing_time = end_time - start_time

# Print the time taken for each method
print(f"Time taken using query method: {query_time:.6f} seconds")
print(f"Time taken using boolean indexing: {boolean_indexing_time:.6f} seconds")

Time taken using query method: 0.075012 seconds
Time taken using boolean indexing: 0.033163 seconds


In [6]:
# Create a time series DataFrame
num_rows = 1000000
date_range = pd.date_range(start='1/1/2020', periods=num_rows, freq='T')
df = pd.DataFrame({'value': np.random.randn(num_rows)}, index=date_range)

# Resampling frequency
resample_freq = 'H'

# Measure time for resample method
start_time = time.time()
resampled_df = df.resample(resample_freq).mean()
end_time = time.time()
resample_time = end_time - start_time

# Measure time for manual resampling
start_time = time.time()
manual_resampled_df = df.groupby(pd.Grouper(freq=resample_freq)).mean()
end_time = time.time()
manual_resample_time = end_time - start_time

# Print the time taken for each method
print(f"Time taken using resample method: {resample_time:.6f} seconds")
print(f"Time taken using manual resampling: {manual_resample_time:.6f} seconds")

Time taken using resample method: 0.059435 seconds
Time taken using manual resampling: 0.026432 seconds


In [7]:
# Create a sample DataFrame
num_rows = 1000000
df = pd.DataFrame({'value': np.random.randn(num_rows)})

# Measure time for cumsum method
start_time = time.time()
cumsum_result = df['value'].cumsum()
end_time = time.time()
cumsum_time = end_time - start_time

# Measure time for for loop method
start_time = time.time()
cumsum_for_loop = np.zeros(num_rows)
cumsum_for_loop[0] = df['value'].iloc[0]
for i in range(1, num_rows):
    cumsum_for_loop[i] = cumsum_for_loop[i-1] + df['value'].iloc[i]
end_time = time.time()
for_loop_time = end_time - start_time

# Print the time taken for each method
print(f"Time taken using cumsum method: {cumsum_time:.6f} seconds")
print(f"Time taken using for loop: {for_loop_time:.6f} seconds")

Time taken using cumsum method: 0.017408 seconds
Time taken using for loop: 13.826880 seconds


In [8]:
# Create a sample DataFrame
num_rows = 1000000
df = pd.DataFrame({
    'text': ['example_string'] * num_rows
})

# Measure time for str accessor method
start_time = time.time()
str_accessor_result = df['text'].str.upper()
end_time = time.time()
str_accessor_time = end_time - start_time

# Define a custom function to apply
def to_upper(text):
    return text.upper()

# Measure time for apply method
start_time = time.time()
apply_result = df['text'].apply(to_upper)
end_time = time.time()
apply_time = end_time - start_time

# Print the time taken for each method
print(f"Time taken using str accessor: {str_accessor_time:.6f} seconds")
print(f"Time taken using apply method: {apply_time:.6f} seconds")

Time taken using str accessor: 0.465502 seconds
Time taken using apply method: 0.400365 seconds


In [9]:
# Create a sample DataFrame
num_rows = 1000000
df = pd.DataFrame({
    'A': np.random.choice(['foo', 'bar', 'baz'], size=num_rows),
    'B': np.random.choice(['one', 'two', 'three'], size=num_rows),
    'values': np.random.randn(num_rows)
})

# Measure time for pivot_table method
start_time = time.time()
pivot_table_result = df.pivot_table(index='A', columns='B', values='values', aggfunc='mean')
end_time = time.time()
pivot_table_time = end_time - start_time

# Measure time for manual reshaping using for loops
start_time = time.time()
result = {}
for a in df['A'].unique():
    result[a] = {}
    for b in df['B'].unique():
        result[a][b] = df[(df['A'] == a) & (df['B'] == b)]['values'].mean()

manual_reshape_result = pd.DataFrame(result).T
end_time = time.time()
manual_reshape_time = end_time - start_time

# Print the time taken for each method
print(f"Time taken using pivot_table method: {pivot_table_time:.6f} seconds")
print(f"Time taken using manual reshaping: {manual_reshape_time:.6f} seconds")

Time taken using pivot_table method: 0.412065 seconds
Time taken using manual reshaping: 2.536218 seconds


In [10]:
# Create a sample DataFrame
num_rows = 1000000
df = pd.DataFrame({
    'A': np.random.randint(0, 100, size=num_rows),
    'B': np.random.randn(num_rows)
})

# Measure time for sort_values method
start_time = time.time()
sorted_df = df.sort_values(by='A')
end_time = time.time()
sort_values_time = end_time - start_time

# Define a custom sorting function
def sort_custom(df):
    return df.sort_values(by='A')

# Measure time for apply method
start_time = time.time()
apply_sorted_df = df.apply(lambda x: x).sort_values(by='A')
end_time = time.time()
apply_time = end_time - start_time

# Print the time taken for each method
print(f"Time taken using sort_values method: {sort_values_time:.6f} seconds")
print(f"Time taken using apply method: {apply_time:.6f} seconds")

Time taken using sort_values method: 0.188197 seconds
Time taken using apply method: 0.245891 seconds


In [11]:
# Create a time series DataFrame
num_rows = 1000000
date_range = pd.date_range(start='1/1/2020', periods=num_rows, freq='T')
df = pd.DataFrame({'value': np.random.randn(num_rows)}, index=date_range)

# Define the window size
window_size = 60

# Measure time for rolling method
start_time = time.time()
rolling_mean = df['value'].rolling(window=window_size).mean()
end_time = time.time()
rolling_time = end_time - start_time

# Measure time for manual rolling calculation
start_time = time.time()
manual_rolling_mean = df['value'].copy()
for i in range(window_size, num_rows):
    manual_rolling_mean.iloc[i] = df['value'].iloc[i-window_size:i].mean()
end_time = time.time()
manual_rolling_time = end_time - start_time

# Print the time taken for each method
print(f"Time taken using rolling method: {rolling_time:.6f} seconds")
print(f"Time taken using manual calculation: {manual_rolling_time:.6f} seconds")

Time taken using rolling method: 0.085488 seconds
Time taken using manual calculation: 251.220275 seconds


In [16]:
# Create a sample DataFrame
np.random.seed(0)
df = pd.DataFrame({
    'A': np.random.randint(1, 100, 1000),
    'B': np.random.rand(1000),
    'C': np.random.randint(1, 100, 1000)
})

# Define aggregation functions
aggregations = {
    'A': ['sum', 'mean', 'std'],
    'B': ['sum', 'mean', 'std'],
    'C': ['sum', 'mean', 'std']
}

# Timing the agg method
start_time_agg = time.time()
df_agg = df.agg(aggregations)
time_agg = time.time() - start_time_agg

# Timing the individual application of functions
start_time_individual = time.time()
results_individual = {
    'A_sum': df['A'].sum(),
    'A_mean': df['A'].mean(),
    'A_std': df['A'].std(),
    'B_sum': df['B'].sum(),
    'B_mean': df['B'].mean(),
    'B_std': df['B'].std(),
    'C_sum': df['C'].sum(),
    'C_mean': df['C'].mean(),
    'C_std': df['C'].std()
}
time_individual = time.time() - start_time_individual

# Print results
print(f"Time using agg method: {time_agg:.6f} seconds")
print(f"Time applying functions individually: {time_individual:.6f} seconds")
print("Aggregated results using agg method:")
print(df_agg)
print("Results applying functions individually:")
print(results_individual)

Time using agg method: 0.002640 seconds
Time applying functions individually: 0.000000 seconds
Aggregated results using agg method:
                 A           B             C
sum   49723.000000  509.199400  48276.000000
mean     49.723000    0.509199     48.276000
std      28.857183    0.296208     28.470799
Results applying functions individually:
{'A_sum': 49723, 'A_mean': 49.723, 'A_std': 28.857182953434812, 'B_sum': 509.19940043113445, 'B_mean': 0.5091994004311343, 'B_std': 0.2962083809189193, 'C_sum': 48276, 'C_mean': 48.276, 'C_std': 28.47079925837016}
