In [4]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values.

# Steps to follow:
# 1. Load the data: Use the pandas library to read a CSV file.
# 2. Check for missing values: Use the isnull() method to find missing values.
# 3. Summarize missing data: Use the sum() function to count the number of missing values in each column.



import pandas as pd

# Step 1: Load the data
# Replace 'your_file.csv' with the path to your CSV file
df = pd.read_csv('your_file.csv')

# Step 2: Check for missing values
missing_values = df.isnull()

# Step 3: Summarize missing data
missing_summary = df.isnull().sum()

# Display results
print("Missing Values (True means missing):")
print(missing_values)

print("\nSummary of Missing Values in Each Column:")
print(missing_summary)


FileNotFoundError: [Errno 2] No such file or directory: 'your_file.csv'

In [None]:
# Question: Dropping Rows with Missing Values
# Description: Practice the deletion method by removing rows with any missing values from a dataset.

# Steps to follow:
# 1. Use dropna() method: Use the dropna() method to remove rows with missing values.



import pandas as pd

# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, None, 30, 22],
    "Email": ["alice@example.com", "bob@example.com", None, "david@example.com"]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Drop rows with any missing values
clean_df = df.dropna()

print("\nDataFrame after dropping rows with missing values:")
print(clean_df)


Original DataFrame:
      Name   Age              Email
0    Alice  25.0  alice@example.com
1      Bob   NaN    bob@example.com
2  Charlie  30.0               None
3    David  22.0  david@example.com

DataFrame after dropping rows with missing values:
    Name   Age              Email
0  Alice  25.0  alice@example.com
3  David  22.0  david@example.com


In [None]:
# Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.

# Steps to follow:
# 1. Use dropna() with axis parameter: Set axis=1 in dropna() to remove columns with missing values.

import pandas as pd

# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, None, 30],
    "Email": ["alice@example.com", "bob@example.com", None]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Drop columns with any missing values
clean_df = df.dropna(axis=1)

print("\nDataFrame after dropping columns with missing values:")
print(clean_df)

In [None]:
# Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.

# Steps to follow:
# 1. Calculate mean and fill NA: Use mean() to calculate and fillna() to fill the missing values.

import pandas as pd

# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, None, 30, 22]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Calculate the mean of the 'Age' column
mean_age = df["Age"].mean()

# Step 2: Fill missing values with the mean
df["Age"] = df["Age"].fillna(mean_age)

print("\nDataFrame after mean imputation:")
print(df)

In [None]:
# Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.

# Steps to follow:
# 1. Calculate mode and fill NA: Use mode() to find the most frequent value and fillna() to fill the missing values.

import pandas as pd

# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Department": ["HR", None, "HR", "IT"]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Calculate the mode of the 'Department' column
mode_department = df["Department"].mode()[0]

# Step 2: Fill missing values with the mode
df["Department"] = df["Department"].fillna(mode_department)

print("\nDataFrame after mode imputation:")
print(df)

In [None]:
# Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.

# Steps to follow:
# 1. Calculate median and fill NA: Use median() for skewed data and fillna() to handle missing values.

import pandas as pd

# Sample DataFrame with skewed data
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "Income": [30000, 45000, None, 120000, 35000]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Calculate the median of the 'Income' column
median_income = df["Income"].median()

# Step 2: Fill missing values with the median
df["Income"] = df["Income"].fillna(median_income)

print("\nDataFrame after median imputation:")
print(df)

In [None]:
# Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.

# Steps to follow:
# 1. Install and import required libraries: Use pip install sklearn if not already installed.
# 2. KNN Imputer: Use KNNImputer to fill in missing values.

import pandas as pd
from sklearn.impute import KNNImputer

# Sample DataFrame
data = {
    "Age": [25, 30, 28, None, 35],
        "Salary": [50000, 54000, None, 58000, 62000]
        }

        df = pd.DataFrame(data)

        print("Original DataFrame:")
        print(df)

        # Step 3: Apply KNN Imputer
        imputer = KNNImputer(n_neighbors=2)  # You can choose k value as needed

        # Transform the data (returns a NumPy array)
        imputed_array = imputer.fit_transform(df)

        # Convert back to DataFrame
        imputed_df = pd.DataFrame(imputed_array, columns=df.columns)

        print("\nDataFrame after KNN Imputation:")
        print(imputed_df)

In [None]:
# Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the next frequent category.

# Steps to follow:
# 1. Identify missing values in categorical data: Use the isnull() method on categorical columns.
# 2. Impute with next frequent category: Use the mode() method to choose the next frequent category.

import pandas as pd

# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva", "Frank"],
    "Department": ["HR", None, "IT", "HR", "IT", None]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Detect missing values in the 'Department' column
missing_values = df["Department"].isnull()
print("\nMissing values in 'Department' column:")
print(missing_values)

# Step 2: Impute with the next most frequent (second mode) category
# Get all modes
modes = df["Department"].mode()

# Check if there is more than one mode
if len(modes) > 1:
    second_mode = modes[1]
else:
    second_mode = modes[0]  # If only one mode, fall back to it

# Fill missing values
df["Department"] = df["Department"].fillna(second_mode)

print("\nDataFrame after filling missing categorical values with the next frequent category:")
print(df)

In [None]:
# Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.

# Steps to follow:
# 1. Partition the data: Split the dataset into train and test based on the presence of missing values.
# 2. Train a model: Use a regression model to predict missing values.
# 3. Impute missing values with predictions.

import pandas as pd
from sklearn.linear_model import LinearRegression

# Step 1: Sample dataset with missing values
data = {
    "Age": [25, 30, None, 40, None, 35],
    "Salary": [40000, 50000, 45000, 80000, 52000, 60000]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 2: Partition the data
# Train data (non-missing Age)
train_data = df[df["Age"].notnull()]
# Test data (missing Age)
test_data = df[df["Age"].isnull()]

# Step 3: Train a regression model to predict Age using Salary
model = LinearRegression()
model.fit(train_data[["Salary"]], train_data["Age"])

# Step 4: Predict missing Age values
predicted_ages = model.predict(test_data[["Salary"]])

# Step 5: Fill in the missing Age values
df.loc[df["Age"].isnull(), "Age"] = predicted_ages

print("\nDataFrame after predictive imputation:")
print(df)


In [None]:
# Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.

# Steps to follow:
# 1. Sort the data: Ensure the dataset is sorted by dates.
# 2. Use fillna() with method parameter: Apply ffill() and bfill() for forward and backward fill.

import pandas as pd

# Sample time series data with missing values
data = {
    "Date": ["2025-01-01", "2025-01-02", "2025-01-03", "2025-01-04", "2025-01-05"],
    "Temperature": [20.5, None, None, 22.0, None]
}

df = pd.DataFrame(data)

# Step 1: Convert 'Date' to datetime and sort
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date")

print("Original DataFrame:")
print(df)

# Step 2: Forward fill (ffill) missing values
df_ffill = df.copy()
df_ffill["Temperature"] = df_ffill["Temperature"].fillna(method="ffill")

print("\nDataFrame after Forward Fill (ffill):")
print(df_ffill)

# Step 3: Backward fill (bfill) missing values
df_bfill = df.copy()
df_bfill["Temperature"] = df_bfill["Temperature"].fillna(method="bfill")

print("\nDataFrame after Backward Fill (bfill):")
print(df_bfill)