In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# --- 1. Prepare Sample Data ---
# A small sales table with text columns, integer columns and one missing
# value, so the later examples have varied data to work with.
data = dict(
    Region=['North', 'West', 'East', 'South', 'West', 'North', 'East'],
    Product=['A', 'B', 'A', 'C', 'B', 'C', 'A'],
    Sales=[250, 180, 310, 150, 210, 190, 280],
    Quantity=[10, 8, 12, 7, 9, 8, 11],
    Feedback=['Good', 'Okay', 'Excellent', 'Good', 'Good', np.nan, 'Okay'],
)

# Seven consecutive days starting 2024-01-01 serve as a DatetimeIndex.
index_labels = pd.date_range(start='2024-01-01', periods=7, freq='D')
df = pd.DataFrame(data=data, index=index_labels)
df.index.name = 'Date'  # Give the index a name so it prints with a header

# Banner, table and separator, each on its own line.
print("--- Sample DataFrame ---", df, "-" * 30, sep="\n")


# --- 2. Viewing/Inspecting Data ---
# Quick ways to look at a DataFrame's contents, structure and metadata.
print("--- Viewing/Inspecting Data ---")

# .head(n): First n rows (default 5)
print("First 3 rows (.head(3)):\n", df.head(3))

# .tail(n): Last n rows (default 5)
print("\nLast 2 rows (.tail(2)):\n", df.tail(2))

# .sample(n): Get a random sample of n rows
# The selected rows differ from run to run; pass random_state=<int> when a
# repeatable sample is needed.
print("\nRandom sample of 2 rows (.sample(2)):\n", df.sample(2))

# .info(): Concise summary (index, columns, dtypes, non-null counts, memory)
# It writes its report itself, so it is called on its own line rather than
# being wrapped in print().
print("\nDataFrame Info (.info()):")
df.info()

# .describe(): Summary statistics for numerical columns
print("\nNumerical Description (.describe()):\n", df.describe())

# .describe(include='object'): Summary statistics for object/string columns
print("\nObject Description (.describe(include='object')):\n", df.describe(include='object'))

# .describe(include='all'): Summary for all columns
print("\nAll Columns Description (.describe(include='all')):\n", df.describe(include='all'))

# Attributes for basic info
print(f"\nShape (rows, cols): {df.shape}")
print(f"Data Types (dtypes):\n{df.dtypes}")
print(f"Index: {df.index}")
print(f"Columns: {df.columns}")
# .to_numpy() is the recommended way to get the underlying NumPy array
# (preferred over the older .values attribute).
print(f"Values (as NumPy array):\n{df.to_numpy()}")
print("-" * 30)


# --- 3. Summarizing Data ---
# Counting distinct values per column and value frequencies within a column.
print("--- Summarizing Data ---")

# .nunique(): Count distinct values in each column
print("Number of unique values per column (.nunique()):\n", df.nunique())

# .value_counts(): Get counts of unique values in a Series (column)
print("\nValue counts for 'Region' column (.value_counts()):\n", df['Region'].value_counts())
# By default value_counts() leaves NaN out of the result (dropna=True);
# passing dropna=False adds a row with the NaN count.
print("\nValue counts for 'Feedback' (dropna=False also counts NaN):\n", df['Feedback'].value_counts(dropna=False))
print("-" * 30)


# --- 4. Selection: Columns ---
# Picking out one column or several columns by name.
print("--- Column Selection ---")

# Indexing with a single column name yields a Series.
sales_col = df['Sales']
print(f"Selecting 'Sales' column (returns Series):\n {sales_col}")
print("Type:", type(sales_col))

# Attribute-style access is shorthand for df['Region']. It only works when the
# column name is a valid Python identifier (no spaces, etc.), so use it with
# caution.
region_col_dot = df.Region
print(f"\nSelecting 'Region' using dot notation:\n {region_col_dot}")

# Indexing with a list of column names yields a DataFrame.
subset_cols = df[['Product', 'Sales', 'Quantity']]
print(f"\nSelecting ['Product', 'Sales', 'Quantity'] columns (returns DataFrame):\n {subset_cols}")
print("Type:", type(subset_cols))
print("-" * 30)


# --- 5. Selection: Rows and Columns with .loc[] (Label-based) ---
# .loc addresses data by index labels and column names.

print("--- Selection with .loc[] (Label-based) ---")

# One index label -> the matching row as a Series
row_jan3 = df.loc['2024-01-03']
print("Selecting row with label '2024-01-03' (returns Series):", row_jan3, sep="\n")

# A list of index labels -> a DataFrame containing those rows
rows_jan2_jan4 = df.loc[['2024-01-02', '2024-01-04']]
print("\nSelecting rows ['2024-01-02', '2024-01-04'] (returns DataFrame):", rows_jan2_jan4, sep="\n")

# A label slice keeps BOTH endpoints (start and end labels are inclusive)
rows_slice = df.loc['2024-01-02':'2024-01-05']
print("\nSelecting rows from '2024-01-02' to '2024-01-05' (slice):", rows_slice, sep="\n")

# Rows and columns together: df.loc[row_indexer, column_indexer]
# One row with a list of columns -> Series
sales_quant_jan5 = df.loc['2024-01-05', ['Sales', 'Quantity']]
print("\nSelecting 'Sales', 'Quantity' for row '2024-01-05':", sales_quant_jan5, sep="\n")

# A row slice with a list of columns -> DataFrame
subset_loc = df.loc['2024-01-02':'2024-01-04', ['Region', 'Sales']]
print("\nSelecting 'Region', 'Sales' for rows '2024-01-02' to '2024-01-04':", subset_loc, sep="\n")

# One row label plus one column name -> a single value
value_loc = df.loc['2024-01-06', 'Product']
print("\nSelecting single value at ('2024-01-06', 'Product'):", value_loc)

# .loc also accepts a boolean condition as the row indexer.
# All rows whose Region is 'East'
east_rows_loc = df.loc[df['Region'] == 'East']
print(f"\nSelecting rows where Region is 'East' using .loc:\n {east_rows_loc}")

# 'Sales' and 'Feedback' for the rows whose Quantity exceeds 8
high_quant_loc = df.loc[df['Quantity'] > 8, ['Sales', 'Feedback']]
print(f"\nSelecting 'Sales', 'Feedback' where Quantity > 8 using .loc:\n {high_quant_loc}")
print("-" * 30)


# --- 6. Selection: Rows and Columns with .iloc[] (Position-based) ---
# .iloc addresses data by 0-based integer position, ignoring labels.

print("--- Selection with .iloc[] (Position-based) ---")

# One integer position -> that row as a Series (position 1 is the second row)
row_pos1 = df.iloc[1]
print("Selecting row at position 1 (returns Series):", row_pos1, sep="\n")

# A list of positions -> a DataFrame with the rows at positions 0 and 3
rows_pos0_3 = df.iloc[[0, 3]]
print("\nSelecting rows at positions [0, 3] (returns DataFrame):", rows_pos0_3, sep="\n")

# An integer slice EXCLUDES its end position: 1:4 yields positions 1, 2, 3
rows_slice_pos = df.iloc[1:4]
print("\nSelecting rows from position 1 to 4 (slice):", rows_slice_pos, sep="\n")

# Rows and columns together: df.iloc[row_indexer, column_indexer]
# Row 0 with columns 1 ('Product') and 3 ('Quantity') -> Series
cols_pos1_3_row0 = df.iloc[0, [1, 3]]
print("\nSelecting columns 1, 3 for row 0:", cols_pos1_3_row0, sep="\n")

# Rows 2-4 and columns 0-2 -> DataFrame
subset_iloc = df.iloc[2:5, 0:3]
print("\nSelecting rows 2-4, columns 0-2:", subset_iloc, sep="\n")

# One row position plus one column position -> a single value
# (row 4, column 2 is 'Sales')
value_iloc = df.iloc[4, 2]
print("\nSelecting single value at position (4, 2):", value_iloc)
print("-" * 30)


# --- 7. Selection: Conditional (Boolean Indexing) ---
# A boolean condition placed directly inside [] filters rows.
# For plain row filtering this is often shorter than the .loc equivalent.

print("--- Conditional Selection (Boolean Indexing) ---")

# Keep only the rows whose Region is 'North'
north_rows = df[df['Region'] == 'North']
print(f"Selecting rows where Region is 'North':\n {north_rows}")

# Conditions combine with & (AND), | (OR) and ~ (NOT).
# Each comparison must be wrapped in parentheses because of operator precedence!
west_high_sales = df[(df['Region'] == 'West') & (df['Sales'] > 200)]
print(f"\nSelecting rows where Region is 'West' AND Sales > 200:\n {west_high_sales}")

# Negating an equality test with ~ ; df['Product'] != 'A' gives the same rows
not_A_product = df[~(df['Product'] == 'A')]
print(f"\nSelecting rows where Product is NOT 'A':\n {not_A_product}")

# .isin() tests membership in a collection of allowed values
region_filter = ['North', 'South']
north_south_rows = df[df['Region'].isin(region_filter)]
print(f"\nSelecting rows where Region is in {region_filter}:\n {north_south_rows}")

# .between() tests an inclusive range: Sales >= 180 AND Sales <= 250
sales_range = df[df['Sales'].between(180, 250)]
print(f"\nSelecting rows where Sales is between 180 and 250:\n {sales_range}")
print("-" * 30)


# --- 8. Setting Values ---
# Use .loc or .iloc on the left side of an assignment.

print("--- Setting Values ---")
# Create a copy to avoid modifying the original df used in later examples
df_copy = df.copy()

# Set a single value using .loc
df_copy.loc['2024-01-01', 'Sales'] = 260
print("DataFrame after setting single value with .loc:\n", df_copy.head(1))

# Set a single value using .iloc
df_copy.iloc[1, 3] = 9 # Row position 1 (second row), column position 3 ('Quantity')
print("\nDataFrame after setting single value with .iloc:\n", df_copy.head(2))

# Set an entire column
df_copy['NewColumn'] = df_copy['Sales'] / df_copy['Quantity']
print("\nDataFrame after adding a new column:\n", df_copy.head(3))

# Set values for a slice using .loc
df_copy.loc['2024-01-04':'2024-01-06', 'Feedback'] = 'Improved'
print("\nDataFrame after setting slice with .loc:\n", df_copy)

# Set values using boolean indexing
# 'Sales' holds integers, but a 10% increase produces fractional values.
# Convert the column to float first: writing floats into an integer column
# relies on silent upcasting, which pandas has deprecated (FutureWarning today,
# an error in a future release).
df_copy['Sales'] = df_copy['Sales'].astype('float64')
df_copy.loc[df_copy['Region'] == 'East', 'Sales'] = df_copy['Sales'] * 1.1 # Increase East sales by 10%
print("\nDataFrame after setting based on condition:\n", df_copy)
print("-" * 30)


# --- 9. Index Manipulation ---
# Promoting columns to the index and turning index levels back into columns.
print("--- Index Manipulation ---")

# .set_index(): make one or more columns the DataFrame index.
# A new DataFrame is returned by default (inplace=True modifies the original).
df_indexed_by_region = df.set_index('Region')
print(f"DataFrame with 'Region' as index (.set_index('Region')):\n {df_indexed_by_region.head()}")
# Note: set_index() replaces the existing 'Date' index, so the dates are lost.
# drop=False only keeps 'Region' as a regular column as well; to retain 'Date',
# call reset_index() first or pass append=True.

# A list of columns produces a MultiIndex (hierarchical index)
df_indexed_multi = df.set_index(['Region', 'Product'])
print(f"\nDataFrame with MultiIndex ('Region', 'Product'):\n {df_indexed_multi.head()}")

# .reset_index(): move the index levels back into ordinary columns.
# A new DataFrame is returned by default.
df_reset = df_indexed_multi.reset_index()
print(f"\nDataFrame after resetting MultiIndex (.reset_index()):\n {df_reset.head()}")

# Applied to the original DataFrame, the 'Date' DatetimeIndex becomes a column
df_reset_original = df.reset_index()
print(f"\nOriginal DataFrame after resetting index:\n {df_reset_original.head()}")
print("-" * 30)



--- Sample DataFrame ---
           Region Product  Sales  Quantity   Feedback
Date                                                 
2024-01-01  North       A    250        10       Good
2024-01-02   West       B    180         8       Okay
2024-01-03   East       A    310        12  Excellent
2024-01-04  South       C    150         7       Good
2024-01-05   West       B    210         9       Good
2024-01-06  North       C    190         8        NaN
2024-01-07   East       A    280        11       Okay
------------------------------
--- Viewing/Inspecting Data ---
First 3 rows (.head(3)):
            Region Product  Sales  Quantity   Feedback
Date                                                 
2024-01-01  North       A    250        10       Good
2024-01-02   West       B    180         8       Okay
2024-01-03   East       A    310        12  Excellent

Last 2 rows (.tail(2)):
            Region Product  Sales  Quantity Feedback
Date                                               
