# Exercise 05: Pandas Optimizations

In [None]:
import pandas as pd
import numpy as np
import time
import gc

In [None]:
# 1. Load data
# Read the fines dataset produced by the previous exercise and preview the
# first five rows (5 is the default of DataFrame.head).
df = pd.read_csv(filepath_or_buffer='../ex04/fines.csv')
df.head(n=5)

In [None]:
# 2. Iterations
# Compute Fines / Refund * Year for every row using five different
# techniques and print how long each one takes. Every variant overwrites the
# same 'calculated' column, so the column ends up holding the result of the
# last variant.
#
# Timing uses time.perf_counter() instead of time.time(): it is monotonic
# and uses the highest-resolution clock available, so short durations are
# not rounded to zero or skewed by system clock adjustments.

# Loop: positional access with .iloc, one row (as a Series) per iteration.
# This is intentionally the naive baseline of the comparison.
start = time.perf_counter()
res = []
for i in range(len(df)):
    row = df.iloc[i]
    res.append(row['Fines'] / row['Refund'] * row['Year'])
df['calculated'] = res
end = time.perf_counter()
print(f"Loop time: {end - start:.4f} seconds")

# iterrows: pandas yields (index label, row Series) pairs; the label is not
# needed for the calculation.
start = time.perf_counter()
res = []
for _, row in df.iterrows():
    res.append(row['Fines'] / row['Refund'] * row['Year'])
df['calculated'] = res
end = time.perf_counter()
print(f"Iterrows time: {end - start:.4f} seconds")

# apply: axis=1 calls the lambda once per row.
start = time.perf_counter()
df['calculated'] = df.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)
end = time.perf_counter()
print(f"Apply time: {end - start:.4f} seconds")

# Series: whole-column arithmetic on pandas Series objects.
start = time.perf_counter()
df['calculated'] = df['Fines'] / df['Refund'] * df['Year']
end = time.perf_counter()
print(f"Series time: {end - start:.4f} seconds")

# Values: the same arithmetic on the underlying arrays exposed by .values,
# bypassing the Series wrapper.
start = time.perf_counter()
df['calculated'] = df['Fines'].values / df['Refund'].values * df['Year'].values
end = time.perf_counter()
print(f"Values time: {end - start:.4f} seconds")

In [None]:
# 3. Indexing
# Compare two ways of fetching the row(s) for one CarNumber: a boolean mask
# over the column versus a label lookup on a frame indexed by CarNumber.
#
# Timing uses time.perf_counter() instead of time.time(): a single lookup is
# a short operation, and perf_counter() is monotonic and uses the
# highest-resolution clock available.

# Use the CarNumber of the first row as the lookup key.
target_car = df['CarNumber'].iloc[0]

# Lookup without an index: compare every value in the column to the key.
start = time.perf_counter()
df[df['CarNumber'] == target_car]
end = time.perf_counter()
print(f"Indexing (no index) time: {end - start:.4f} seconds")

# The index is built outside the timed region, so only the lookup itself is
# measured below.
df_indexed = df.set_index('CarNumber')

# Lookup with an index: select by label through .loc.
start = time.perf_counter()
df_indexed.loc[target_car]
end = time.perf_counter()
print(f"Indexing (with index) time: {end - start:.4f} seconds")

In [None]:
# 4. Downcasting
# Report memory usage, then shrink numeric columns on a copy of the frame
# so that the original `df` stays untouched.
print("Before optimization:")
df.info(memory_usage='deep')

optimized_df = df.copy()

# Floating-point columns: let pd.to_numeric pick the smallest float dtype
# that can hold each column.
float_part = optimized_df.select_dtypes(include=['float'])
cols_float = float_part.columns
optimized_df[cols_float] = float_part.apply(pd.to_numeric, downcast='float')

# Integer columns: same idea with the smallest suitable integer dtype.
int_part = optimized_df.select_dtypes(include=['int'])
cols_int = int_part.columns
optimized_df[cols_int] = int_part.apply(pd.to_numeric, downcast='integer')

print("\nAfter downcasting:")
optimized_df.info(memory_usage='deep')

In [None]:
# 5. Categories
# Convert every object-dtype column to the 'category' dtype in one astype
# call (each column gets its own set of categories), then report memory.
cols_obj = optimized_df.select_dtypes(include=['object']).columns
optimized_df = optimized_df.astype(dict.fromkeys(cols_obj, 'category'))

print("\nAfter categories:")
optimized_df.info(memory_usage='deep')

In [None]:
# 6. Memory clean
# Drop the names bound to the original and optimized frames, then run a
# garbage-collection pass.
del df, optimized_df
gc.collect()
print("Memory cleaned.")