# Q29: Optimize Memory Usage for Large Dataset

- Work with a large CSV/JSON dataset (>100MB).
- Optimize memory usage using appropriate data types, chunk loading, and indexing.
- Document the performance improvements achieved.

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Simulate a large dataset: 4 million rows, enough for the CSV written from it
# to exceed the 100 MB the task asks for.
n_rows = 4_000_000
# Fixed seed so that Restart Kernel -> Run All regenerates exactly the same data.
SEED = 42
rng = np.random.default_rng(SEED)
df_large = pd.DataFrame({
    'id': np.arange(n_rows),                              # unique, increasing row id
    'category': rng.choice(['A', 'B', 'C', 'D'], n_rows),  # low-cardinality strings
    'value': rng.random(n_rows),                          # uniform floats in [0, 1)
    'flag': rng.choice([0, 1], n_rows)                    # binary indicator stored as integers
})

In [3]:
# Write the generated frame to disk; index=False keeps the row index out of the file.
df_large.to_csv(path_or_buf='large_data.csv', index=False)

In [4]:
# Report the on-disk size of the CSV, converted from bytes to MB (1 MB = 1024 * 1024 bytes here).
csv_size_bytes = os.path.getsize('large_data.csv')
print('CSV file size (MB):', csv_size_bytes / (1024*1024))

CSV file size (MB): 118.22431755065918


In [5]:
# --- Chunk Loading ---
# Stream the CSV 200,000 rows at a time and narrow the dtypes of every chunk as
# it arrives, so the frame is never assembled at its default (object / int64)
# width. Feeding pd.concat a generator also means no notebook-level list keeps a
# second copy of all the chunks alive after the cell finishes.
with pd.read_csv(
    'large_data.csv',
    chunksize=200_000,
    dtype={'category': 'category'},  # parse the repeated strings straight into categorical codes
) as chunk_iter:
    df_loaded = pd.concat(
        (chunk.astype({'flag': 'bool'}) for chunk in chunk_iter),
        ignore_index=True,
    )
# NOTE: concat keeps the categorical dtype only when every chunk ends up with the
# same categories; otherwise the column falls back to object and needs to be
# re-cast with .astype('category') afterwards.
print('Loaded with chunking:', df_loaded.shape)

Loaded with chunking: (4000000, 4)


In [6]:
# --- Data Type Optimization ---
# Target dtype per column: 'category' for the repeated string labels, 'bool' for the 0/1 flag.
# Columns are re-assigned one at a time so the untouched columns are not copied.
dtype_plan = {'category': 'category', 'flag': 'bool'}
for column_name, target_dtype in dtype_plan.items():
    df_loaded[column_name] = df_loaded[column_name].astype(target_dtype)
print('Optimized dtypes:', df_loaded.dtypes)

Optimized dtypes: id             int64
category    category
value        float64
flag            bool
dtype: object


In [7]:
# --- Indexing ---
# Promote 'id' to the index so rows can be selected by label via df_loaded.loc[...].
# The guard makes the cell safe to re-run: once 'id' is the index it is no longer
# a column, and an unconditional set_index('id') would raise KeyError.
if df_loaded.index.name != 'id':
    df_loaded = df_loaded.set_index('id')
print('Indexed DataFrame head:')
print(df_loaded.head())

Indexed DataFrame head:
   category     value   flag
id                          
0         D  0.825876  False
1         D  0.813108   True
2         C  0.722714  False
3         C  0.160238  False
4         A  0.688820   True


In [8]:
# --- Memory Usage Comparison ---
BYTES_PER_MB = 1024 * 1024
# deep=True makes pandas introspect object columns instead of counting only their pointers.
# df_large (the frame the CSV was written from) is used as the unoptimized baseline;
# df_loaded is the frame after dtype conversion and indexing.
mem_before = df_large.memory_usage(deep=True).sum() / BYTES_PER_MB
mem_after = df_loaded.memory_usage(deep=True).sum() / BYTES_PER_MB
mem_saved = mem_before - mem_after
# Relative saving; guarded so an empty baseline cannot trigger a division by zero.
pct_saved = 100 * mem_saved / mem_before if mem_before else 0.0
print(f'Memory usage before optimization: {mem_before:.2f} MB')
print(f'Memory usage after optimization: {mem_after:.2f} MB')
print(f'Memory saved: {mem_saved:.2f} MB ({pct_saved:.1f}% reduction)')

Memory usage before optimization: 282.29 MB
Memory usage after optimization: 68.66 MB
Memory saved: 213.62281799316406 MB
