In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Create a sample DataFrame
np.random.seed(0)  # Set seed for reproducibility
data = {
    'Category': np.random.choice(['A', 'B', 'C', 'D'], size=1000000),
    'Values': np.random.randint(1, 100, size=1000000)
}
df = pd.DataFrame(data)

# Define a custom aggregation function
def custom_aggregation(data):
    result = {}
    for category in data['Category'].unique():
        result[category] = data[data['Category'] == category]['Values'].sum()
    return result

# Aggregate data using the groupby method
start_time = time.time()  # Record the start time
groupby_result = df.groupby('Category')['Values'].sum()
time_groupby = time.time() - start_time  # Calculate the time taken

# Aggregate data using manual iteration
start_time = time.time()  # Record the start time
manual_result = custom_aggregation(df)
time_manual = time.time() - start_time  # Calculate the time taken

# Print the results
print("Aggregation result using groupby:")
print(groupby_result)
print("\nTime taken using groupby:", time_groupby, "seconds")

print("\nAggregation result using manual iteration:")
print(manual_result)
print("\nTime taken using manual iteration:", time_manual, "seconds")

Aggregation result using groupby:
Category
A    12541392
B    12440541
C    12477135
D    12502875
Name: Values, dtype: int32

Time taken using groupby: 0.08902144432067871 seconds

Aggregation result using manual iteration:
{'A': 12541392, 'D': 12502875, 'B': 12440541, 'C': 12477135}

Time taken using manual iteration: 0.2288367748260498 seconds


In [3]:
# Create two large DataFrames
np.random.seed(0)  # Set seed for reproducibility
data1 = {
    'Key': np.random.randint(1, 1000, size=1000),
    'Value1': np.random.randint(1, 100, size=1000)
}
data2 = {
    'Key': np.random.randint(1, 1000, size=1000),
    'Value2': np.random.randint(1, 100, size=1000)
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Perform merge using the merge method
start_time = time.time()  # Record the start time
merged_df = pd.merge(df1, df2, on='Key')
time_merge = time.time() - start_time  # Calculate the time taken

# Perform merge using a nested for loop
start_time = time.time()  # Record the start time
merged_data = []
for index1, row1 in df1.iterrows():
    for index2, row2 in df2.iterrows():
        if row1['Key'] == row2['Key']:
            merged_data.append({**row1, **row2})
merged_df_loop = pd.DataFrame(merged_data)
time_nested_loop = time.time() - start_time  # Calculate the time taken

# Print the time taken for both methods
print("Time taken using merge method:", time_merge, "seconds")
print("Time taken using nested for loop:", time_nested_loop, "seconds") 

Time taken using merge method: 0.021688222885131836 seconds
Time taken using nested for loop: 24.314834117889404 seconds


In [4]:
# Create a sample DataFrame with categorical data
np.random.seed(0)  # Set seed for reproducibility
data = {
    'Category': np.random.choice(['A', 'B', 'C', 'D'], size=1000000),
    'Values': np.random.randint(1, 100, size=1000000)
}
df = pd.DataFrame(data)

# Print memory usage before optimization
print("Memory usage before optimization:")
print(df.info(memory_usage='deep'))

# Convert the 'Category' column to the category data type
df['Category'] = df['Category'].astype('category')

# Print memory usage after optimization
print("\nMemory usage after optimization:")
print(df.info(memory_usage='deep'))

Memory usage before optimization:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   Category  1000000 non-null  object
 1   Values    1000000 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 59.1 MB
None

Memory usage after optimization:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype   
---  ------    --------------    -----   
 0   Category  1000000 non-null  category
 1   Values    1000000 non-null  int32   
dtypes: category(1), int32(1)
memory usage: 4.8 MB
None


In [5]:
data = {
    'A': np.random.randint(1, 100, size=1000000),
    'B': np.random.randint(1, 100, size=1000000)
}
df = pd.DataFrame(data)

# Perform element-wise multiplication using a for loop
start_time = time.time()  # Record the start time
result_for_loop = []
for index, row in df.iterrows():
    result_for_loop.append(row['A'] * row['B'])
result_for_loop = pd.Series(result_for_loop)
time_for_loop = time.time() - start_time  # Calculate the time taken

# Perform element-wise multiplication using the * operator
start_time = time.time()  # Record the start time
result_vectorized = df['A'] * df['B']
time_vectorized = time.time() - start_time  # Calculate the time taken

# Print the time taken for both methods
print("Time taken using for loop:", time_for_loop, "seconds")
print("Time taken using * operator:", time_vectorized, "seconds")

Time taken using for loop: 24.76099681854248 seconds
Time taken using * operator: 0.0 seconds


In [6]:
# Create a sample DataFrame
np.random.seed(0)  # Set seed for reproducibility
data = {
    'A': np.random.randint(1, 100, size=1000000),
    'B': np.random.randint(1, 100, size=1000000),
    'C': np.random.randint(1, 100, size=1000000),
    'D': np.random.randint(1, 100, size=1000000)
}
df = pd.DataFrame(data)

# Perform arithmetic operations using standard operations
start_time = time.time()  # Record the start time
df['Result_standard'] = df['A'] + df['B'] - df['C'] * df['D'] / df['A']
time_standard = time.time() - start_time  # Calculate the time taken

# Perform arithmetic operations using the eval method
start_time = time.time()  # Record the start time
df['Result_eval'] = df.eval('A + B - C * D / A')
time_eval = time.time() - start_time  # Calculate the time taken

# Print the time taken for both methods
print("Time taken using standard operations:", time_standard, "seconds")
print("Time taken using eval method:", time_eval, "seconds")

Time taken using standard operations: 0.028382062911987305 seconds
Time taken using eval method: 0.13286519050598145 seconds
