# 🧩 Notebook 10: Diagnose and Optimize Performance in pandas Pipelines

In [1]:
# Notebook import setup.
# The local `scripts` package is imported below, so the parent of the current
# working directory is registered on sys.path (once) before that import runs.
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    # Appended (not inserted) so already-listed locations keep precedence.
    sys.path.append(_project_root_str)

from scripts import utils_io

In [2]:
# Import Statements
import pandas as pd
import numpy as np
from scripts import utils_io
from scripts.optimize_memory import optimize_dataframe

## 📂 Load Data

In [3]:
# Read both input CSVs with the shared project loader; the generator is
# consumed left to right, so superstore is loaded first, then loan.
superstore, loan = (
    utils_io.load_csv(csv_path)
    for csv_path in (
        "../assets/superstore_final.csv",
        "../assets/loan_final_all_regions.csv",
    )
)

## 🔍 Before Optimization: Memory Usage

In [4]:
# Baseline report: print a banner and the deep memory footprint of each frame
# before any dtype changes are applied.
for banner, frame in (
    ("Before Optimization ‚Äì Superstore:", superstore),
    ("\nBefore Optimization ‚Äì Loan:", loan),
):
    print(banner)
    # memory_usage="deep" asks pandas to inspect object columns for real usage.
    frame.info(memory_usage="deep")

Before Optimization ‚Äì Superstore:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       10000 non-null  object 
 1   customer_id    10000 non-null  object 
 2   customer_name  10000 non-null  object 
 3   segment        10000 non-null  object 
 4   region         10000 non-null  object 
 5   order_date     10000 non-null  object 
 6   ship_date      10000 non-null  object 
 7   category       10000 non-null  object 
 8   sub_category   10000 non-null  object 
 9   product_name   10000 non-null  object 
 10  sales          10000 non-null  float64
 11  quantity       10000 non-null  int64  
 12  discount       10000 non-null  float64
 13  profit         10000 non-null  float64
dtypes: float64(3), int64(1), object(10)
memory usage: 6.7 MB

Before Optimization ‚Äì Loan:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 ent

## 🧠 Optimize Memory Usage

In [5]:
# Category candidates must be spelled exactly like the loaded columns. The
# .info() listing for superstore shows lower-case snake_case names (segment,
# region, category, sub_category), so Title-Case names such as "Sub-Category"
# never match a real column. Each candidate list is filtered against the
# frame's actual columns so an absent column (e.g. ship_mode, which is not in
# the 14 listed superstore columns) is skipped rather than passed through.
superstore_category_cols = [
    col
    for col in ("segment", "region", "category", "sub_category", "ship_mode")
    if col in superstore.columns
]

# NOTE(review): the loan schema is not fully visible in this notebook; these
# names assume the same snake_case convention as loan_amount / income - confirm.
loan_category_cols = [
    col for col in ("loan_type", "customer_gender") if col in loan.columns
]

# optimize_dataframe is the project helper imported from
# scripts.optimize_memory; the names are rebound so later cells work on the
# optimized frames.
superstore = optimize_dataframe(
    superstore,
    category_cols=superstore_category_cols,
)

loan = optimize_dataframe(
    loan,
    category_cols=loan_category_cols,
)

# After optimization: report the deep memory footprint again for comparison
# with the baseline printed earlier.
print("\nAfter Optimization ‚Äì Superstore:")
superstore.info(memory_usage="deep")
print("\nAfter Optimization ‚Äì Loan:")
loan.info(memory_usage="deep")

üì¶ Memory usage BEFORE optimization:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       10000 non-null  object 
 1   customer_id    10000 non-null  object 
 2   customer_name  10000 non-null  object 
 3   segment        10000 non-null  object 
 4   region         10000 non-null  object 
 5   order_date     10000 non-null  object 
 6   ship_date      10000 non-null  object 
 7   category       10000 non-null  object 
 8   sub_category   10000 non-null  object 
 9   product_name   10000 non-null  object 
 10  sales          10000 non-null  float64
 11  quantity       10000 non-null  int64  
 12  discount       10000 non-null  float64
 13  profit         10000 non-null  float64
dtypes: float64(3), int64(1), object(10)
memory usage: 6.7 MB

‚úÖ Memory usage AFTER optimization:
<class 'pandas.core.frame.DataFrame'>
RangeIndex:

## 🧮 Use `query()` and `eval()` for Performance

In [6]:
# query(): filter rows with a string expression instead of a boolean mask.
high_sales = superstore.query("sales > 500")
print(f"\nüîπ Rows with Sales > 500: {high_sales.shape[0]}")

# eval(): compute a derived column from a string expression. With
# inplace=True the new loan_to_income_ratio column is added to `loan` itself.
loan.eval("loan_to_income_ratio = loan_amount / income", inplace=True)
print("\nüîπ LoanToIncomeRatio head:")
# Show the two inputs next to the derived ratio for the first rows.
print(loan.head()[["loan_amount", "income", "loan_to_income_ratio"]])


üîπ Rows with Sales > 500: 7528

üîπ LoanToIncomeRatio head:
   loan_amount  income  loan_to_income_ratio
0        36890   96964              0.380450
1        56517   59846              0.944374
2        42857   43879              0.976709
3        51149   27261              1.876270
4        17337   53340              0.325028


## 🐍 Demonstrate Dask

In [7]:
# Optional Dask demo. Only the import is guarded: the rest of the workflow
# lives in the `else` branch so that an ImportError raised while reading or
# computing is not misreported as "Dask not installed".
try:
    import dask.dataframe as dd
except ImportError:
    print("\n‚ö†Ô∏è Dask not installed. Run: pip install dask[complete]")
else:
    # Read the same superstore CSV as a Dask DataFrame.
    ddf = dd.read_csv("../assets/superstore_final.csv")
    print("\n‚úÖ Dask DFrame Loaded. Preview:")
    print(ddf.head())

    # Mean profit per region; .compute() executes the groupby and returns the
    # concrete result.
    result = ddf.groupby("region")["profit"].mean().compute()
    print("\n‚úÖ Dask Aggregation Result:")
    print(result)

# ------------------------------------------------
# Export final optimized data (optional)
# ------------------------------------------------
# Runs whether or not Dask is available. Note that CSV is a plain-text format,
# so dtype choices such as `category` are not stored in the exported files.
utils_io.export_csv(superstore, "../exports/superstore_optimized.csv")
utils_io.export_csv(loan, "../exports/loan_optimized.csv")


‚úÖ Dask DFrame Loaded. Preview:
    order_id customer_id      customer_name      segment   region  order_date  \
0  ord-10000   cust-9476  mr. michael lopez  home office  central  2020-01-01   
1  ord-10001   cust-9162         robert liu  home office    south  2020-01-02   
2  ord-10002   cust-3824      nicole bowman     consumer    south  2020-01-03   
3  ord-10003   cust-8888     stephen flores     consumer  central  2020-01-04   
4  ord-10004   cust-9980  stephen rodriguez    corporate    south  2020-01-05   

    ship_date   category sub_category       product_name    sales  quantity  \
0  2020-01-03  furniture    bookcases  bookcases model 1  1292.63         5   
1  2020-01-04  furniture    bookcases  bookcases model 2  1947.16         2   
2  2020-01-05  furniture    bookcases  bookcases model 3  1774.42         3   
3  2020-01-06  furniture    bookcases  bookcases model 4   591.01         8   
4  2020-01-07  furniture    bookcases  bookcases model 5  1969.55         4   

   d

## 📘 Summary

In this notebook, we diagnosed and improved the performance of our final Superstore and Loan datasets by:

- Measuring memory usage with `.info(memory_usage="deep")`
- Reducing memory footprint using `category` and numeric downcasting
- Using `.query()` for readable row filtering and `.eval()` for derived-column expressions
- (Suggested follow-up, not run in this notebook) Timing operations with `%timeit` and `%memit` in Jupyter; `%memit` requires the `memory_profiler` extension
- Briefly exploring `Dask` as a scalable alternative for large datasets

🔁 The optimized datasets (`superstore_optimized.csv` and `loan_optimized.csv`, written to `../exports/`) are now ready for fast downstream analysis or deployment.