In [None]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values.

# Steps to follow:
# 1. Load the data: Use the pandas library to read a CSV file.
# 2. Check for missing values: Use the isnull() method to find missing values.
# 3. Summarize missing data: Use the sum() function to count the number of missing values in each column.



In [None]:
# Question: Dropping Rows with Missing Values
# Description: Practice the deletion method by removing rows with any missing values from a dataset.

# Steps to follow:
# 1. Use dropna() method: Use the dropna() method to remove rows with missing values.

In [None]:
# Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.

# Steps to follow:
# 1. Use dropna() with axis parameter: Set axis=1 in dropna() to remove columns with missing values.



In [None]:
# Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.

# Steps to follow:
# 1. Calculate mean and fill NA: Use mean() to calculate and fillna() to fill the missing values.



In [None]:
# Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.

# Steps to follow:
# 1. Calculate mode and fill NA: Use mode() to find the most frequent value and fillna() to fill the missing values.



In [None]:
# Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.

# Steps to follow:
# 1. Calculate median and fill NA: Use median() for skewed data and fillna() to handle missing values.



In [None]:
# Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.

# Steps to follow:
# 1. Install and import required libraries: Use pip install sklearn if not already installed.
# 2. KNN Imputer: Use KNNImputer to fill in missing values.



In [None]:
# Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the next frequent category.

# Steps to follow:
# 1. Identify missing values in categorical data: Use the isnull() method on categorical columns.
# 2. Impute with next frequent category: Use the mode() method to choose the next frequent category.



In [None]:
# Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.

# Steps to follow:
# 1. Partition the data: Split the dataset into train and test based on the presence of missing values.
# 2. Train a model: Use a regression model to predict missing values.
# 3. Impute missing values with predictions.




In [None]:
# Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.

# Steps to follow:
# 1. Sort the data: Ensure the dataset is sorted by dates.
# 2. Use fillna() with method parameter: Apply ffill() and bfill() for forward and backward fill.



In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

# Sample data to simulate CSV loading
data = {
    'Age': [25, np.nan, 35, 40, np.nan, 28, 33],
    'Gender': ['Male', 'Female', np.nan, 'Female', 'Male', np.nan, 'Male'],
    'Income': [50000, 60000, np.nan, 80000, 70000, 65000, np.nan],
    'Date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', None, '2023-01-05', '2023-01-06', '2023-01-07'])
}

df = pd.DataFrame(data)
print("Original Data:\n", df)

# 1. Identify missing values
print("\nMissing Values (isnull):\n", df.isnull())
print("\nCount Missing Values per Column:\n", df.isnull().sum())

# 2. Drop rows with any missing values
df_drop_rows = df.dropna()
print("\nAfter Dropping Rows with Missing Values:\n", df_drop_rows)

# 3. Drop columns with any missing values
df_drop_cols = df.dropna(axis=1)
print("\nAfter Dropping Columns with Missing Values:\n", df_drop_cols)

# 4. Mean imputation for numerical column 'Age'
df['Age_mean_imputed'] = df['Age'].fillna(df['Age'].mean())
print("\nAge Column After Mean Imputation:\n", df['Age_mean_imputed'])

# 5. Mode imputation for categorical column 'Gender'
mode_gender = df['Gender'].mode()[0]
df['Gender_mode_imputed'] = df['Gender'].fillna(mode_gender)
print("\nGender Column After Mode Imputation:\n", df['Gender_mode_imputed'])

# 6. Median imputation for skewed numerical column 'Income'
df['Income_median_imputed'] = df['Income'].fillna(df['Income'].median())
print("\nIncome Column After Median Imputation:\n", df['Income_median_imputed'])

# 7. KNN Imputation for numerical columns (Age, Income)
knn_imputer = KNNImputer(n_neighbors=2)
knn_imputed = knn_imputer.fit_transform(df[['Age', 'Income']])
df[['Age_knn_imputed', 'Income_knn_imputed']] = knn_imputed
print("\nAfter KNN Imputation:\n", df[['Age_knn_imputed', 'Income_knn_imputed']])

# 8. Detect and handle missing categorical data with next frequent category (mode)
# Here, using mode again for demonstration (next frequent category)
df['Gender_next_freq'] = df['Gender'].fillna(df['Gender'].mode()[0])
print("\nGender after Imputation with Next Frequent Category:\n", df['Gender_next_freq'])

# 9. Predictive Modeling for Imputation (e.g., imputing missing 'Income' using 'Age')
# Prepare data
train_data = df[df['Income'].notna()]
test_data = df[df['Income'].isna()]

if not test_data.empty:
    model = LinearRegression()
    model.fit(train_data[['Age']].fillna(train_data['Age'].mean()), train_data['Income'])
    predicted_income = model.predict(test_data[['Age']].fillna(train_data['Age'].mean()))
    df.loc[df['Income'].isna(), 'Income_predicted'] = predicted_income
    print("\nIncome after Predictive Modeling Imputation:\n", df[['Income', 'Income_predicted']])
else:
    print("\nNo missing Income values for predictive modeling imputation.")

# 10. Handling Time Series Data with Forward and Backward Fill
df_sorted = df.sort_values('Date')
df_sorted['Income_ffill'] = df_sorted['Income'].fillna(method='ffill')
df_sorted['Income_bfill'] = df_sorted['Income'].fillna(method='bfill')
print("\nTime Series Data with Forward and Backward Fill:\n", df_sorted[['Date', 'Income', 'Income_ffill', 'Income_bfill']])


Original Data:
     Age  Gender   Income       Date
0  25.0    Male  50000.0 2023-01-01
1   NaN  Female  60000.0 2023-01-02
2  35.0     NaN      NaN 2023-01-03
3  40.0  Female  80000.0        NaT
4   NaN    Male  70000.0 2023-01-05
5  28.0     NaN  65000.0 2023-01-06
6  33.0    Male      NaN 2023-01-07

Missing Values (isnull):
      Age  Gender  Income   Date
0  False   False   False  False
1   True   False   False  False
2  False    True    True  False
3  False   False   False   True
4   True   False   False  False
5  False    True   False  False
6  False   False    True  False

Count Missing Values per Column:
 Age       2
Gender    2
Income    2
Date      1
dtype: int64

After Dropping Rows with Missing Values:
     Age Gender   Income       Date
0  25.0   Male  50000.0 2023-01-01

After Dropping Columns with Missing Values:
 Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6]

Age Column After Mean Imputation:
 0    25.0
1    32.2
2    35.0
3    40.0
4    32.2
5    28.0
6    

  df_sorted['Income_ffill'] = df_sorted['Income'].fillna(method='ffill')
  df_sorted['Income_bfill'] = df_sorted['Income'].fillna(method='bfill')


In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

# Function to load data (simulate loading here)
def load_data():
    data = {
        'Age': [25, np.nan, 35, 40, np.nan, 28, 33],
        'Gender': ['Male', 'Female', np.nan, 'Female', 'Male', np.nan, 'Male'],
        'Income': [50000, 60000, np.nan, 80000, 70000, 65000, np.nan],
        'Date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', None, '2023-01-05', '2023-01-06', '2023-01-07'])
    }
    df = pd.DataFrame(data)
    return df

# Function to identify missing values
def identify_missing(df):
    print("Missing Values:\n", df.isnull())
    print("\nCount Missing Values per Column:\n", df.isnull().sum())

# Function to drop rows or columns with missing values
def drop_missing(df, axis=0):
    # axis=0 drops rows, axis=1 drops columns
    return df.dropna(axis=axis)

# Function to fill missing numerical values with mean, median or mode
def impute_numerical(df, column, method='mean'):
    if method == 'mean':
        val = df[column].mean()
    elif method == 'median':
        val = df[column].median()
    elif method == 'mode':
        val = df[column].mode()[0]
    else:
        raise ValueError("Method must be one of ['mean','median','mode']")
    df[column + f'_{method}_imputed'] = df[column].fillna(val)
    return df

# Function to fill missing categorical data with mode or next frequent category
def impute_categorical(df, column, method='mode'):
    if method == 'mode':
        val = df[column].mode()[0]
    else:
        raise ValueError("Currently only 'mode' method is supported for categorical")
    df[column + f'_{method}_imputed'] = df[column].fillna(val)
    return df

# Function for KNN imputation for numerical columns
def knn_imputation(df, columns, n_neighbors=2):
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_knn = df.copy()
    df_knn[columns] = imputer.fit_transform(df_knn[columns])
    for col in columns:
        df[col + '_knn_imputed'] = df_knn[col]
    return df

# Function for predictive model imputation of missing values
def predictive_imputation(df, target_col, feature_cols):
    train = df[df[target_col].notna()]
    test = df[df[target_col].isna()]
    if test.empty:
        print(f"No missing values in {target_col} for predictive imputation.")
        return df
    model = LinearRegression()
    X_train = train[feature_cols].fillna(train[feature_cols].mean())
    y_train = train[target_col]
    model.fit(X_train, y_train)
    X_test = test[feature_cols].fillna(train[feature_cols].mean())
    preds = model.predict(X_test)
    df.loc[df[target_col].isna(), target_col + '_predicted'] = preds
    return df

# Function to fill missing time series data with forward/backward fill
def time_series_fill(df, date_col, fill_col):
    df_sorted = df.sort_values(date_col)
    df_sorted[fill_col + '_ffill'] = df_sorted[fill_col].fillna(method='ffill')
    df_sorted[fill_col + '_bfill'] = df_sorted[fill_col].fillna(method='bfill')
    return df_sorted

# Main driver function
def main():
    df = load_data()
    print("Original Data:\n", df)
    
    print("\n=== Missing Value Identification ===")
    identify_missing(df)
    
    print("\n=== Dropping Rows with Missing Values ===")
    print(drop_missing(df, axis=0))
    
    print("\n=== Dropping Columns with Missing Values ===")
    print(drop_missing(df, axis=1))
    
    print("\n=== Mean Imputation for 'Age' ===")
    df = impute_numerical(df, 'Age', 'mean')
    print(df[['Age', 'Age_mean_imputed']])
    
    print("\n=== Mode Imputation for 'Gender' ===")
    df = impute_categorical(df, 'Gender', 'mode')
    print(df[['Gender', 'Gender_mode_imputed']])
    
    print("\n=== Median Imputation for 'Income' ===")
    df = impute_numerical(df, 'Income', 'median')
    print(df[['Income', 'Income_median_imputed']])
    
    print("\n=== KNN Imputation for ['Age', 'Income'] ===")
    df = knn_imputation(df, ['Age', 'Income'])
    print(df[['Age_knn_imputed', 'Income_knn_imputed']])
    
    print("\n=== Predictive Imputation for 'Income' using 'Age' ===")
    df = predictive_imputation(df, 'Income', ['Age'])
    print(df[['Income', 'Income_predicted']])
    
    print("\n=== Time Series Forward and Backward Fill for 'Income' ===")
    df_ts = time_series_fill(df, 'Date', 'Income')
    print(df_ts[['Date', 'Income', 'Income_ffill', 'Income_bfill']])

if __name__ == "__main__":
    main()


Original Data:
     Age  Gender   Income       Date
0  25.0    Male  50000.0 2023-01-01
1   NaN  Female  60000.0 2023-01-02
2  35.0     NaN      NaN 2023-01-03
3  40.0  Female  80000.0        NaT
4   NaN    Male  70000.0 2023-01-05
5  28.0     NaN  65000.0 2023-01-06
6  33.0    Male      NaN 2023-01-07

=== Missing Value Identification ===
Missing Values:
      Age  Gender  Income   Date
0  False   False   False  False
1   True   False   False  False
2  False    True    True  False
3  False   False   False   True
4   True   False   False  False
5  False    True   False  False
6  False   False    True  False

Count Missing Values per Column:
 Age       2
Gender    2
Income    2
Date      1
dtype: int64

=== Dropping Rows with Missing Values ===
    Age Gender   Income       Date
0  25.0   Male  50000.0 2023-01-01

=== Dropping Columns with Missing Values ===
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6]

=== Mean Imputation for 'Age' ===
    Age  Age_mean_imputed
0  25.0     

  df_sorted[fill_col + '_ffill'] = df_sorted[fill_col].fillna(method='ffill')
  df_sorted[fill_col + '_bfill'] = df_sorted[fill_col].fillna(method='bfill')
