In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np
import time # For performance comparison examples

# --- 1. MultiIndex (Hierarchical Indexing) ---
# A MultiIndex puts several index levels on a single axis, so that a Series (1D)
# or DataFrame (2D) can hold data that is conceptually higher-dimensional.

print("--- MultiIndex (Hierarchical Indexing) ---")

# a) Creating a MultiIndex
# Parallel label lists: position i of each list forms the i-th (first, second) pair.
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
]
level_names = ['first', 'second']
multi_index = pd.MultiIndex.from_arrays(arrays, names=level_names)
print("MultiIndex created from arrays:\n", multi_index)

# Series carrying the MultiIndex: one random value per label pair
s_multi = pd.Series(data=np.random.randn(len(multi_index)), index=multi_index)
print("\nSeries with MultiIndex:\n", s_multi)

# DataFrame whose rows carry the MultiIndex
df_multi_row = pd.DataFrame(
    data=np.random.randn(len(multi_index), 3),
    index=multi_index,
    columns=list('ABC'),
)
print("\nDataFrame with MultiIndex on rows:\n", df_multi_row)

# DataFrame whose columns carry a MultiIndex (Metric -> Type)
arrays_cols = [
    ['Stat1', 'Stat1', 'Stat2', 'Stat2'],
    ['Mean', 'Std', 'Mean', 'Std'],
]
multi_cols = pd.MultiIndex.from_arrays(arrays_cols, names=['Metric', 'Type'])
row_labels = ['R1', 'R2', 'R3', 'R4']
df_multi_col = pd.DataFrame(
    data=np.random.randn(len(row_labels), len(multi_cols)),
    index=row_labels,
    columns=multi_cols,
)
print("\nDataFrame with MultiIndex on columns:\n", df_multi_col)

# b) Selecting and Slicing with MultiIndex
print("\n--- Selecting with MultiIndex ---")
print("Using df_multi_row:\n", df_multi_row)

# A single outer-level ('first') label returns the rows stored beneath it
bar_rows = df_multi_row.loc['bar']
print("\nSelect outer level 'bar':\n", bar_rows)

# A full (outer, inner) tuple pins down one row, which comes back as a Series
baz_two = df_multi_row.loc[('baz', 'two')]
print("\nSelect ('baz', 'two'):\n", baz_two)

# Label slice on the outer level
baz_to_foo = df_multi_row.loc['baz':'foo']
print("\nSelect slice 'baz' to 'foo':\n", baz_to_foo)

# slice(None) stands for "every label" of the outer level, so combining it
# with 'one' collects all 'one' entries of the inner level
every_outer = slice(None)
ones_only = df_multi_row.loc[(every_outer, 'one'), :]
print("\nSelect all 'one' from inner level:\n", ones_only)

# The same ideas apply to MultiIndex columns
print("\nUsing df_multi_col:\n", df_multi_col)
stat1_block = df_multi_col['Stat1']
print("\nSelect outer column 'Stat1':\n", stat1_block)
stat1_std = df_multi_col[('Stat1', 'Std')]
print("\nSelect inner column ('Stat1', 'Std'):\n", stat1_std)

# c) Stacking and Unstacking (revisited) - key operations for MultiIndex
# .unstack() moves an index level (default: innermost) into the columns
# .stack() moves the columns so they become the innermost index level
unstacked = df_multi_row.unstack('second')
print("\nUnstacking inner row level ('second') of df_multi_row:\n", unstacked)
stacked = df_multi_row.stack()
print("\nStacking columns of df_multi_row:\n", stacked)

# d) Setting and Resetting MultiIndex
# reset_index() turns the index levels back into ordinary columns;
# set_index() with a list of column names rebuilds the MultiIndex from them
df_flat = df_multi_row.reset_index()
print("\nResetting MultiIndex:\n", df_flat)
df_reindexed = df_flat.set_index(level_names)
print("\nSetting MultiIndex again:\n", df_reindexed)
print("-" * 30)


# --- 2. Performance Optimization ---

print("--- Performance Optimization ---")

# a) Use Efficient Data Types
# - Convert object columns with few unique string values to 'category'.
# - Use smaller integer types (int8, int16, int32) or float types (float32)
#   if the full range/precision of int64/float64 is not needed.
df_mem = pd.DataFrame({
    'ID': np.arange(1000000),
    'Category': np.random.choice(['A', 'B', 'C', 'D', 'E'], 1000000),
    'Value': np.random.rand(1000000)
})
print("Original memory usage:")
df_mem.info(memory_usage='deep')

# Only five distinct labels, so 'category' stores small integer codes plus one
# copy of each label instead of a Python string per row.
df_mem['Category'] = df_mem['Category'].astype('category')
# float32 trades precision for storage; acceptable here because the values are
# random demo data.
df_mem['Value'] = df_mem['Value'].astype(np.float32)
# Downcast rather than a blind astype(np.int32): to_numeric picks the smallest
# signed integer dtype that can hold every value, so an ID outside the int32
# range stays in a wider type instead of silently wrapping around.
df_mem['ID'] = pd.to_numeric(df_mem['ID'], downcast='integer')

print("\nMemory usage after dtype conversion:")
df_mem.info(memory_usage='deep') # Significant reduction expected
print("-" * 20)

# b) Vectorization (Avoid Loops)
# Use built-in Pandas/NumPy functions which operate on entire arrays/Series.
# Avoid iterating row by row using loops or df.iterrows(), df.itertuples() if possible.
# Avoid df.apply() with simple arithmetic/logical operations - direct vectorized ops are faster.

# Example: Calculate Value * 2
n = 100000
df_perf = pd.DataFrame({'Value': np.random.randn(n)})

# Timing uses time.perf_counter(): a monotonic, high-resolution clock intended
# for measuring short durations. time.time() is wall-clock time, which can be
# too coarse for sub-millisecond operations and can jump if the system clock
# is adjusted.
start_time = time.perf_counter()
result_vectorized = df_perf['Value'] * 2
vectorized_time = time.perf_counter() - start_time

start_time = time.perf_counter()
result_apply = df_perf['Value'].apply(lambda x: x * 2)
apply_time = time.perf_counter() - start_time

# Iterrows is usually the slowest
# start_time = time.perf_counter()
# result_iterrows = []
# for index, row in df_perf.iterrows():
#     result_iterrows.append(row['Value'] * 2)
# result_iterrows = pd.Series(result_iterrows)
# iterrows_time = time.perf_counter() - start_time

print(f"\nVectorized operation time: {vectorized_time:.6f} seconds")
print(f".apply() operation time: {apply_time:.6f} seconds")
# print(f".iterrows() operation time: {iterrows_time:.6f} seconds") # Expected to be much slower
print("Vectorization is significantly faster than .apply() or iteration.")
print("-" * 20)

# c) Using .eval() and .query() (for large DataFrames)
# Can sometimes speed up complex arithmetic or boolean expressions by using numexpr library.
df_eval = pd.DataFrame(np.random.randn(100000, 4), columns=['A', 'B', 'C', 'D'])

# All four measurements use time.perf_counter(), a monotonic high-resolution
# clock meant for timing short intervals (time.time() is wall-clock time and
# may be too coarse for operations this quick).

# Standard boolean indexing
start_time = time.perf_counter()
result_bool = df_eval[(df_eval['A'] > 0) & (df_eval['B'] < 0)]
bool_time = time.perf_counter() - start_time

# Using .query()
start_time = time.perf_counter()
result_query = df_eval.query('A > 0 and B < 0') # Uses string expression
query_time = time.perf_counter() - start_time

# Standard arithmetic
start_time = time.perf_counter()
result_arith = (df_eval['A'] + df_eval['B']) / (df_eval['C'] - df_eval['D'])
arith_time = time.perf_counter() - start_time

# Using pd.eval() (top-level function; df_eval is resolved from the calling scope)
start_time = time.perf_counter()
result_eval = pd.eval("(df_eval.A + df_eval.B) / (df_eval.C - df_eval.D)") # Uses string expression
eval_time = time.perf_counter() - start_time

print(f"\nBoolean Indexing time: {bool_time:.6f}")
print(f".query() time: {query_time:.6f}") # Often faster on large DFs
print(f"\nStandard Arithmetic time: {arith_time:.6f}")
print(f"pd.eval() time: {eval_time:.6f}") # Often faster on large DFs
# Note: Performance gains depend on expression complexity and DataFrame size.
# The speed-up comes from the optional 'numexpr' library; without it pandas
# falls back to its plain Python engine, so the code still runs but is not faster.
print("-" * 20)

# d) Reading Large Files in Chunks
# Use `chunksize` parameter in `pd.read_csv` (or other readers) to process large files piece by piece.
# With `chunksize` set, the reader yields one DataFrame per chunk instead of loading the whole file.
# filename = 'very_large_file.csv'
# chunk_list = []
# chunk_iter = pd.read_csv(filename, chunksize=10000) # Process 10000 rows at a time
# for chunk in chunk_iter:
#     # Process each chunk (e.g., filter, aggregate)
#     processed_chunk = chunk[chunk['Value'] > 0].groupby('Category').size()
#     chunk_list.append(processed_chunk)
# # Combine results: add up the per-chunk counts of each Category.
# # (Series.sum(level=0) was removed in pandas 2.0; groupby(level=0).sum() is the replacement.)
# final_result = pd.concat(chunk_list).groupby(level=0).sum()
print("Using chunksize in pd.read_csv helps process files too large for memory.")
print("-" * 20)

# e) Choosing Efficient File Formats
# For intermediate storage or faster I/O, consider binary formats:
# - Parquet (.parquet): Efficient columnar storage, good compression. Requires 'pyarrow' or 'fastparquet'.
# - Feather (.feather): Fast, lightweight binary format for DataFrame exchange (especially between Python and R). Requires 'pyarrow'.
# - HDF5 (.h5): Hierarchical format, good for storing multiple datasets and metadata. Requires 'tables'.
# Example round trip (left commented out so the tutorial writes no files):
# df_mem.to_parquet('efficient_data.parquet')
# df_read_parquet = pd.read_parquet('efficient_data.parquet')
print("Parquet, Feather, HDF5 offer more efficient storage/retrieval than CSV.")
print("-" * 30)


# --- 3. Options & Settings ---
# Pandas behaviour can be customised with pd.set_option() or through the
# pd.options attribute tree.

print("--- Options & Settings ---")
# Display options (e.g., max rows/columns shown)
display_opts = pd.options.display
current_max_rows = display_opts.max_rows
current_max_columns = display_opts.max_columns
print(f"Max rows displayed: {current_max_rows}")
print(f"Max columns displayed: {current_max_columns}")

# Set max rows to 10 for display
# pd.set_option('display.max_rows', 10)
# print("\nDataFrame display after setting max_rows=10:\n", df_mem.head(20)) # Only 10 rows would show (...)

# Reset to default
# pd.reset_option('display.max_rows')

# Control float precision display
current_precision = display_opts.precision
print(f"\nFloat precision display: {current_precision}")
# pd.set_option('display.precision', 3)
# print("\nDataFrame display after setting precision=3:\n", pd.DataFrame(np.random.randn(3,3)))
# pd.reset_option('display.precision')

# Further option groups cover plotting, computation, etc.
# pd.describe_option() # See all options
# pd.describe_option('display') # See display options
print("Use pd.set_option() to control display formats, computation behavior, etc.")
print("-" * 30)


# --- 4. Extending Pandas (Brief Overview) ---
# Advanced users can extend Pandas functionality:
# - Custom Accessors: Add custom namespaces to Series/DataFrame objects (like .str, .cat, .dt).
#   Requires defining a class decorated with @pd.api.extensions.register_dataframe_accessor
#   (or register_series_accessor / register_index_accessor for Series / Index).
# - Custom Extension Types/Arrays: Define custom data types beyond NumPy's defaults (more complex).
# This section is conceptual only: it prints a summary and defines no accessor or type.
print("--- Extending Pandas (Conceptual) ---")
print("Advanced users can add custom functionality via accessors or extension types.")
print("-" * 30)


# --- 5. Integration with Other Libraries ---
# Pandas DataFrames are the standard input/output for many data science libraries:
# - Scikit-learn: Pass DataFrames directly for model training (select features/target).
# - Statsmodels: Statistical modeling library that works well with DataFrames.
# - Matplotlib/Seaborn: Visualization libraries that accept DataFrames/Series.
# - NumPy: Pandas is built on NumPy; easily convert using .to_numpy() (or the older .values attribute)
#   or by passing a DataFrame/Series to NumPy functions.
# This section is informational only: it prints a summary and calls none of these libraries.
print("--- Integration with Other Libraries ---")
print("Pandas DataFrames integrate seamlessly with Scikit-learn, Statsmodels, Matplotlib, Seaborn, NumPy, etc.")
print("-" * 30)

--- MultiIndex (Hierarchical Indexing) ---
MultiIndex created from arrays:
 MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

Series with MultiIndex:
 first  second
bar    one      -0.459839
       two       0.422791
baz    one       0.951586
       two       1.733919
foo    one       1.415242
       two       0.202581
qux    one       0.394281
       two       1.025072
dtype: float64

DataFrame with MultiIndex on rows:
                      A         B         C
first second                              
bar   one     1.005384 -0.900951 -0.599336
      two    -0.820504  0.080305  0.257593
baz   one    -1.657937  0.599923 -0.934853
      two    -0.069383  0.655565  1.173372
foo   one    -1.230530  0.845230  0.500707
      two     1.177004  0.052342 -0.558967
qux   one     0.4