In [1]:
import pandas as pd
import numpy as np

#### 1. Prepare Sample Data ---
- Create a DataFrame with missing values, duplicates, and mixed types

In [5]:
data = {
    'OrderID': [101, 102, 103, 104, 101, 105, 106, 107], # Duplicate OrderID 101
    'Product': ['A', 'B', 'A', 'C', 'A', 'B', 'C', 'D'],
    'Category': ['X', 'Y', 'X', 'X', 'X', 'Y', 'Z', 'Y'],
    'UnitPrice': [10.5, 25.0, 10.5, 5.25, 10.5, 25.0, np.nan, 8.0], # Missing UnitPrice
    'Quantity': [5, 2, np.nan, 10, 5, 3, 7, 4], # Missing Quantity
    'Status': ['Shipped', 'Pending', 'Shipped', 'Shipped', 'Shipped', 'Pending', 'Cancelled', 'Shipped'],
    'CustomerID': ['C100', 'C200', 'C100', 'C300', 'C100', 'C400', 'C200', 'C500']
}
df = pd.DataFrame(data)
print(df)

   OrderID Product Category  UnitPrice  Quantity     Status CustomerID
0      101       A        X      10.50       5.0    Shipped       C100
1      102       B        Y      25.00       2.0    Pending       C200
2      103       A        X      10.50       NaN    Shipped       C100
3      104       C        X       5.25      10.0    Shipped       C300
4      101       A        X      10.50       5.0    Shipped       C100
5      105       B        Y      25.00       3.0    Pending       C400
6      106       C        Z        NaN       7.0  Cancelled       C200
7      107       D        Y       8.00       4.0    Shipped       C500


In [3]:
# Add a fully duplicate row
duplicate_row = pd.DataFrame([df.iloc[0].to_dict()]) # Get first row as dict, make DataFrame
df = pd.concat([df, duplicate_row], ignore_index=True)

In [4]:
print("--- Original Messy DataFrame ---")
print(df)
print("-" * 30)
df.info() # Show initial dtypes and non-null counts

--- Original Messy DataFrame ---
   OrderID Product Category  UnitPrice  Quantity     Status CustomerID
0      101       A        X      10.50       5.0    Shipped       C100
1      102       B        Y      25.00       2.0    Pending       C200
2      103       A        X      10.50       NaN    Shipped       C100
3      104       C        X       5.25      10.0    Shipped       C300
4      101       A        X      10.50       5.0    Shipped       C100
5      105       B        Y      25.00       3.0    Pending       C400
6      106       C        Z        NaN       7.0  Cancelled       C200
7      107       D        Y       8.00       4.0    Shipped       C500
8      101       A        X      10.50       5.0    Shipped       C100
------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   OrderID     9 non-null      int64  
 1 

#### 2. Handling Missing Data (NaN)

In [6]:
# a) Identifying missing data
print("Check for missing values (.isnull()):\n", df.isnull())
print("\nCheck for non-missing values (.notnull()):\n", df.notnull())

Check for missing values (.isnull()):
    OrderID  Product  Category  UnitPrice  Quantity  Status  CustomerID
0    False    False     False      False     False   False       False
1    False    False     False      False     False   False       False
2    False    False     False      False      True   False       False
3    False    False     False      False     False   False       False
4    False    False     False      False     False   False       False
5    False    False     False      False     False   False       False
6    False    False     False       True     False   False       False
7    False    False     False      False     False   False       False

Check for non-missing values (.notnull()):
    OrderID  Product  Category  UnitPrice  Quantity  Status  CustomerID
0     True     True      True       True      True    True        True
1     True     True      True       True      True    True        True
2     True     True      True       True     False    True      

In [9]:
# Count missing values per column
print("\nCount of missing values per column (.isnull().sum()):\n", df.isnull().sum())


Count of missing values per column (.isnull().sum()):
 OrderID       0
Product       0
Category      0
UnitPrice     1
Quantity      1
Status        0
CustomerID    0
dtype: int64


In [10]:
# b) Dropping missing data (.dropna())
# Drop rows containing *any* missing values

df_dropped_rows = df.dropna() # Default axis=0 (rows), how='any'
print("DataFrame after dropping rows with any NaN (.dropna()):\n", df_dropped_rows)

DataFrame after dropping rows with any NaN (.dropna()):
    OrderID Product Category  UnitPrice  Quantity   Status CustomerID
0      101       A        X      10.50       5.0  Shipped       C100
1      102       B        Y      25.00       2.0  Pending       C200
3      104       C        X       5.25      10.0  Shipped       C300
4      101       A        X      10.50       5.0  Shipped       C100
5      105       B        Y      25.00       3.0  Pending       C400
7      107       D        Y       8.00       4.0  Shipped       C500


In [11]:
# Drop columns containing *any* missing values
df_dropped_cols = df.dropna(axis=1) # axis='columns' or 1
print("\nDataFrame after dropping columns with any NaN (.dropna(axis=1)):\n", df_dropped_cols)


DataFrame after dropping columns with any NaN (.dropna(axis=1)):
    OrderID Product Category     Status CustomerID
0      101       A        X    Shipped       C100
1      102       B        Y    Pending       C200
2      103       A        X    Shipped       C100
3      104       C        X    Shipped       C300
4      101       A        X    Shipped       C100
5      105       B        Y    Pending       C400
6      106       C        Z  Cancelled       C200
7      107       D        Y    Shipped       C500


In [12]:
# Drop rows only if *all* values are missing (less common)
# df_dropped_all_nan = df.dropna(how='all')

In [13]:
# Drop rows that do not have at least 'thresh' non-NaN values
# Keep rows with at least 6 non-NaN values (out of 7 columns)

df_dropped_thresh = df.dropna(thresh=6)
print("\nDataFrame after dropping rows with less than 6 non-NaN values (.dropna(thresh=6)):\n", df_dropped_thresh)


DataFrame after dropping rows with less than 6 non-NaN values (.dropna(thresh=6)):
    OrderID Product Category  UnitPrice  Quantity     Status CustomerID
0      101       A        X      10.50       5.0    Shipped       C100
1      102       B        Y      25.00       2.0    Pending       C200
2      103       A        X      10.50       NaN    Shipped       C100
3      104       C        X       5.25      10.0    Shipped       C300
4      101       A        X      10.50       5.0    Shipped       C100
5      105       B        Y      25.00       3.0    Pending       C400
6      106       C        Z        NaN       7.0  Cancelled       C200
7      107       D        Y       8.00       4.0    Shipped       C500


In [14]:
# Drop NaNs only in specific columns using 'subset'
df_dropped_subset = df.dropna(subset=['UnitPrice', 'Quantity'])
print("\nDataFrame after dropping rows with NaN in 'UnitPrice' or 'Quantity' (.dropna(subset=...)):\n", df_dropped_subset)


DataFrame after dropping rows with NaN in 'UnitPrice' or 'Quantity' (.dropna(subset=...)):
    OrderID Product Category  UnitPrice  Quantity   Status CustomerID
0      101       A        X      10.50       5.0  Shipped       C100
1      102       B        Y      25.00       2.0  Pending       C200
3      104       C        X       5.25      10.0  Shipped       C300
4      101       A        X      10.50       5.0  Shipped       C100
5      105       B        Y      25.00       3.0  Pending       C400
7      107       D        Y       8.00       4.0  Shipped       C500


In [15]:
# c) Filling missing data (.fillna())
print("Filling missing data:")

# Fill all NaN with a specific value (e.g., 0)
df_filled_zero = df.fillna(0)

print("DataFrame after filling all NaN with 0 (.fillna(0)):\n", df_filled_zero[['UnitPrice', 'Quantity']].head())

Filling missing data:
DataFrame after filling all NaN with 0 (.fillna(0)):
    UnitPrice  Quantity
0      10.50       5.0
1      25.00       2.0
2      10.50       0.0
3       5.25      10.0
4      10.50       5.0


In [19]:
# Fill NaN in specific columns with different values
fill_values = {'UnitPrice': df['UnitPrice'].mean(), 'Quantity': df['Quantity'].median()}

df_filled_specific = df.fillna(value=fill_values)
print("\nDataFrame after filling NaN with column mean/median (.fillna(value=dict)):\n", df_filled_specific[['UnitPrice', 'Quantity']].head())


DataFrame after filling NaN with column mean/median (.fillna(value=dict)):
    UnitPrice  Quantity
0      10.50       5.0
1      25.00       2.0
2      10.50       5.0
3       5.25      10.0
4      10.50       5.0


In [22]:
# Forward fill (propagate last valid observation forward)
df_ffill = df.fillna(method='ffill') # or .ffill()
print("\nDataFrame after forward fill (.fillna(method='ffill')):\n", df_ffill[['UnitPrice', 'Quantity']].head())


DataFrame after forward fill (.fillna(method='ffill')):
    UnitPrice  Quantity
0      10.50       5.0
1      25.00       2.0
2      10.50       2.0
3       5.25      10.0
4      10.50       5.0


  df_ffill = df.fillna(method='ffill') # or .ffill()


In [23]:
# Backward fill (propagate next valid observation backward)
df_bfill = df.fillna(method='bfill') # or .bfill()
print("\nDataFrame after backward fill (.fillna(method='bfill')):\n", df_bfill[['UnitPrice', 'Quantity']].head())


DataFrame after backward fill (.fillna(method='bfill')):
    UnitPrice  Quantity
0      10.50       5.0
1      25.00       2.0
2      10.50      10.0
3       5.25      10.0
4      10.50       5.0


  df_bfill = df.fillna(method='bfill') # or .bfill()


In [24]:
# Limit the number of consecutive fills
df_ffill_limit = df.fillna(method='ffill', limit=1)
print("\nDataFrame after forward fill with limit=1:\n", df_ffill_limit[['UnitPrice', 'Quantity']].head())


DataFrame after forward fill with limit=1:
    UnitPrice  Quantity
0      10.50       5.0
1      25.00       2.0
2      10.50       2.0
3       5.25      10.0
4      10.50       5.0


  df_ffill_limit = df.fillna(method='ffill', limit=1)


In [25]:
# For this example, let's fill UnitPrice with mean and Quantity with median for further steps
df['UnitPrice'].fillna(df['UnitPrice'].mean(), inplace=True) # inplace=True modifies df directly
df['Quantity'].fillna(df['Quantity'].median(), inplace=True)

print("\nDataFrame after filling NaNs for subsequent steps:\n", df.head())


DataFrame after filling NaNs for subsequent steps:
    OrderID Product Category  UnitPrice  Quantity   Status CustomerID
0      101       A        X      10.50       5.0  Shipped       C100
1      102       B        Y      25.00       2.0  Pending       C200
2      103       A        X      10.50       5.0  Shipped       C100
3      104       C        X       5.25      10.0  Shipped       C300
4      101       A        X      10.50       5.0  Shipped       C100


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['UnitPrice'].fillna(df['UnitPrice'].mean(), inplace=True) # inplace=True modifies df directly
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Quantity'].fillna(df['Quantity'].median(), inplace=True)


#### 3. Handling Duplicates

In [26]:
print("--- Handling Duplicates ---")

# a) Identifying duplicate rows (.duplicated())
# Returns a boolean Series indicating which rows are duplicates
# By default, keeps the first occurrence ('first') as non-duplicate
print("Check for duplicate rows (.duplicated()):\n", df.duplicated())

# Identify duplicates based on specific columns
print("\nCheck for duplicates based on 'OrderID' (.duplicated(subset=['OrderID'])):\n", df.duplicated(subset=['OrderID']))

# Keep the last occurrence as non-duplicate
print("\nCheck for duplicates, keeping last (.duplicated(keep='last')):\n", df.duplicated(keep='last'))

# Mark all duplicates as True
print("\nMark all duplicates as True (.duplicated(keep=False)):\n", df.duplicated(keep=False))
print("-" * 20)

# b) Dropping duplicate rows (.drop_duplicates())
# Returns DataFrame with duplicates removed
# Keeps 'first' occurrence by default
df_no_duplicates = df.drop_duplicates()
print("DataFrame after dropping duplicate rows (.drop_duplicates()):\n", df_no_duplicates)
print(f"Original shape: {df.shape}, After drop_duplicates: {df_no_duplicates.shape}")

# Drop duplicates based on specific columns, keeping the last occurrence
df_no_dup_subset = df.drop_duplicates(subset=['OrderID', 'CustomerID'], keep='last')
print("\nDataFrame after dropping duplicates based on 'OrderID' & 'CustomerID', keeping last:\n", df_no_dup_subset)

# For subsequent steps, let's work with the version where full row duplicates are dropped
df = df.drop_duplicates().reset_index(drop=True) # Reset index after dropping
print("\nDataFrame after dropping full duplicates for subsequent steps:\n", df)
print("-" * 30)


# --- 4. Data Type Conversion (.astype()) ---

print("--- Data Type Conversion ---")
print("Original dtypes:\n", df.dtypes)

# Convert Quantity (currently float due to NaN fill) to integer
df['Quantity'] = df['Quantity'].astype(int)

# Convert OrderID to string (object)
df['OrderID'] = df['OrderID'].astype(str)

# Convert Category to a memory-efficient 'category' type
df['Category'] = df['Category'].astype('category')

print("\nData types after conversion (.astype()):\n", df.dtypes)
# Note: 'category' type can save memory and speed up operations like groupbys
print("\nCategory codes:\n", df['Category'].cat.codes) # Internal integer representation
print("Category values:\n", df['Category'].cat.categories) # Unique category values
print("-" * 30)


# --- 5. Renaming Columns & Index (.rename()) ---

print("--- Renaming Columns & Index ---")
# Rename specific columns using a dictionary
df_renamed = df.rename(columns={'UnitPrice': 'Price_per_Unit', 'CustomerID': 'CustID'})
print("DataFrame after renaming columns:\n", df_renamed.columns)

# Rename index labels (if index wasn't reset) - using a function/lambda
# df.index = pd.RangeIndex(start=1000, stop=1000+len(df)) # Example: Set a new index first
# df_renamed_index = df.rename(index=lambda x: f"Row_{x}")
# print("\nDataFrame after renaming index:\n", df_renamed_index.head())

# Rename can also use inplace=True
# df.rename(columns={'Status': 'Order_Status'}, inplace=True)
# print("\nOriginal DataFrame columns after inplace rename:\n", df.columns)
print("-" * 30)


# --- 6. Applying Functions ---

print("--- Applying Functions ---")

# a) .map() (Series method - element-wise, often for substitution/mapping)
# Create a mapping for Status
status_map = {'Shipped': 1, 'Pending': 0, 'Cancelled': -1}
df['Status_Code'] = df['Status'].map(status_map)
print("DataFrame with Status mapped to codes (.map()):\n", df[['Status', 'Status_Code']].head())

# b) .apply() (DataFrame method - row-wise or column-wise)
# Apply a function along an axis (axis=0 for columns, axis=1 for rows)

# Example: Calculate Total Price (row-wise)
def calculate_total(row):
    # Ensure Price_per_Unit exists if using df_renamed, else use UnitPrice
    price_col = 'Price_per_Unit' if 'Price_per_Unit' in row.index else 'UnitPrice'
    return row[price_col] * row['Quantity']

# Use the original df for this example
df['Total_Price'] = df.apply(calculate_total, axis=1)
print("\nDataFrame with Total_Price calculated using .apply(axis=1):\n", df[['UnitPrice', 'Quantity', 'Total_Price']].head())

# Example: Calculate range (max-min) for numerical columns (column-wise)
numerical_cols = df.select_dtypes(include=np.number) # Select only numerical columns
column_ranges = numerical_cols.apply(lambda x: x.max() - x.min(), axis=0) # axis=0 is default
print("\nRange (max-min) for numerical columns using .apply(axis=0):\n", column_ranges)

# c) .applymap() (DataFrame method - element-wise) - Use less often, consider .map or vectorization
# Example: Convert all string columns to uppercase (if they are strings)
# def to_upper_if_str(x):
#     return x.upper() if isinstance(x, str) else x
# df_upper = df.applymap(to_upper_if_str) # Apply to every element
# print("\nDataFrame after applying uppercase function with applymap:\n", df_upper.head())
print("-" * 30)


# --- 7. Replacing Values (.replace()) ---

print("--- Replacing Values ---")
# Replace a single value across the whole DataFrame
df_replaced_A = df.replace('A', 'Product_A')
print("DataFrame after replacing 'A' with 'Product_A':\n", df_replaced_A[['Product']].head())

# Replace multiple values with a single value
df_replaced_status = df.replace(['Pending', 'Cancelled'], 'Not Shipped')
print("\nDataFrame after replacing 'Pending'/'Cancelled' with 'Not Shipped':\n", df_replaced_status[['Status']].head())

# Replace multiple values with different values using a dictionary
df_replaced_dict = df.replace({'Category': {'X': 'Cat_X', 'Y': 'Cat_Y'}, 'Status': {'Shipped': 'Complete'}})
print("\nDataFrame after replacing using dictionary:\n", df_replaced_dict[['Category', 'Status']].head())

# Replace using regular expressions (regex=True)
# df_replaced_regex = df.replace(r'^C(\d+)$', r'Customer_\1', regex=True) # Example: Replace C100 -> Customer_100
# print("\nDataFrame after replacing using regex:\n", df_replaced_regex[['CustomerID']].head())
print("-" * 30)


# --- 8. Sorting (.sort_values(), .sort_index()) ---

print("--- Sorting ---")
# Sort by a single column (.sort_values())
df_sorted_sales = df.sort_values(by='UnitPrice', ascending=False) # Sort by UnitPrice descending
print("DataFrame sorted by UnitPrice descending:\n", df_sorted_sales.head())

# Sort by multiple columns
df_sorted_multi = df.sort_values(by=['Category', 'Product']) # Sort by Category, then Product
print("\nDataFrame sorted by Category, then Product:\n", df_sorted_multi)

# Sort by index (.sort_index())
df_sorted_index = df.sort_index(ascending=False) # Sort by index descending
print("\nDataFrame sorted by index descending:\n", df_sorted_index.head())

# Sorting can also use inplace=True
# df.sort_values(by='Quantity', inplace=True)
print("-" * 30)

--- Handling Duplicates ---
Check for duplicate rows (.duplicated()):
 0    False
1    False
2    False
3    False
4     True
5    False
6    False
7    False
dtype: bool

Check for duplicates based on 'OrderID' (.duplicated(subset=['OrderID'])):
 0    False
1    False
2    False
3    False
4     True
5    False
6    False
7    False
dtype: bool

Check for duplicates, keeping last (.duplicated(keep='last')):
 0     True
1    False
2    False
3    False
4    False
5    False
6    False
7    False
dtype: bool

Mark all duplicates as True (.duplicated(keep=False)):
 0     True
1    False
2    False
3    False
4     True
5    False
6    False
7    False
dtype: bool
--------------------
DataFrame after dropping duplicate rows (.drop_duplicates()):
    OrderID Product Category  UnitPrice  Quantity     Status CustomerID
0      101       A        X  10.500000       5.0    Shipped       C100
1      102       B        Y  25.000000       2.0    Pending       C200
2      103       A        X  10.5

  df_replaced_dict = df.replace({'Category': {'X': 'Cat_X', 'Y': 'Cat_Y'}, 'Status': {'Shipped': 'Complete'}})
