In [1]:

# Activity 1: Handling Missing Data

# Task A: Dropping vs Imputation
# 1. Dropping Missing Data:
# - Load a dataset (e.g., a CSV file with some missing values, such as employees.csv).
# - Inspect the dataset for missing values using a Python library (e.g., Pandas).
# - Drop rows with missing data and save the result.
# 2. Imputation using Mean:
# - Use the same dataset.
# - Fill missing numerical values with the column mean.
# - Save and display the modified data.
# 3. Imputation using Median and Mode:
# - For numerical columns, replace missing values with the median.
# - For categorical columns, use the mode.
# - Display the updated dataset.
import pandas as pd
import numpy as np
def drop_missing_data(file_path="employees.csv"):
    """
    Demonstrate listwise deletion on an employee dataset.

    Reads the CSV at ``file_path`` (substituting a small built-in sample frame
    when the file does not exist), prints the frame and its per-column
    missing-value counts, removes every row that contains at least one missing
    value, prints the reduced frame and its counts, and writes the reduced
    frame to ``employees_dropped.csv`` in the current working directory.

    Args:
        file_path (str, optional): The path to the CSV file. Defaults to "employees.csv".

    Returns:
        pd.DataFrame: The rows of the loaded data that have no missing values.
    """
    def show(title, content):
        # Every report section is a heading line followed by the object itself.
        print(title)
        print(content)

    try:
        employees = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Creating a sample DataFrame instead.")
        # Fallback demo data with one gap each in Age, Salary and Department.
        fallback_columns = {
            'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
            'Age': [25, 30, None, 22, 28],
            'Salary': [50000, None, 60000, 45000, 55000],
            'Department': ['HR', 'Engineering', 'Engineering', 'Sales', None]
        }
        employees = pd.DataFrame(fallback_columns)

    show("\nOriginal DataFrame:", employees)
    show("\nMissing values before dropping:", employees.isnull().sum())

    complete_rows = employees.dropna()
    show("\nDataFrame after dropping rows with missing values:", complete_rows)
    show("\nMissing values after dropping:", complete_rows.isnull().sum())

    # index=False keeps the original row labels out of the saved file.
    complete_rows.to_csv("employees_dropped.csv", index=False)
    print("\nDropped data saved to employees_dropped.csv")
    return complete_rows
def impute_mean(file_path="employees.csv"):
    """
    Demonstrate mean imputation on an employee dataset.

    Reads the CSV at ``file_path`` (substituting a small built-in sample frame
    when the file does not exist), prints it together with its per-column
    missing-value counts, replaces missing entries in every numeric column
    with that column's mean, prints the result, and writes it to
    ``employees_mean_imputed.csv`` in the current working directory.
    Non-numeric columns are left untouched.

    Args:
        file_path (str, optional): The path to the CSV file. Defaults to "employees.csv".

    Returns:
        pd.DataFrame: A copy of the loaded data with numeric gaps filled by column means.
    """
    try:
        source = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Creating a sample DataFrame instead.")
        # Fallback demo data with one gap each in Age, Salary and Department.
        source = pd.DataFrame(
            {
                'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
                'Age': [25, 30, None, 22, 28],
                'Salary': [50000, None, 60000, 45000, 55000],
                'Department': ['HR', 'Engineering', 'Engineering', 'Sales', None],
            }
        )

    print("\nOriginal DataFrame:")
    print(source)
    print("\nMissing values before imputation:")
    print(source.isnull().sum())

    # Work on a copy so the frame printed above is not modified.
    filled = source.copy()
    numeric_names = filled.select_dtypes(include=np.number).columns
    for name in numeric_names:
        column_mean = filled[name].mean()
        filled[name] = filled[name].fillna(column_mean)

    print("\nDataFrame after imputing numerical columns with the mean:")
    print(filled)
    print("\nMissing values after mean imputation:")
    print(filled.isnull().sum())

    filled.to_csv("employees_mean_imputed.csv", index=False)
    print("\nMean imputed data saved to employees_mean_imputed.csv")
    return filled
def impute_median_mode(file_path="employees.csv"):
    """
    Demonstrate median (numeric) and mode (categorical) imputation.

    Reads the CSV at ``file_path`` (substituting a small built-in sample frame
    when the file does not exist), prints it together with its per-column
    missing-value counts, fills gaps in numeric columns with the column median
    and gaps in object/category columns with the first value returned by
    ``Series.mode()``, prints the result, and writes it to
    ``employees_median_mode_imputed.csv`` in the current working directory.

    Args:
        file_path (str, optional): The path to the CSV file. Defaults to "employees.csv".

    Returns:
        pd.DataFrame: A copy of the loaded data with the gaps filled as described.
    """
    try:
        loaded = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Creating a sample DataFrame instead.")
        # Fallback demo data with one gap each in Age, Salary and Department.
        demo_records = {
            'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
            'Age': [25, 30, None, 22, 28],
            'Salary': [50000, None, 60000, 45000, 55000],
            'Department': ['HR', 'Engineering', 'Engineering', 'Sales', None]
        }
        loaded = pd.DataFrame(demo_records)

    print("\nOriginal DataFrame:")
    print(loaded)
    print("\nMissing values before imputation:")
    print(loaded.isnull().sum())

    # Work on a copy so the frame printed above is not modified.
    repaired = loaded.copy()

    numeric_names = repaired.select_dtypes(include=np.number).columns
    for name in numeric_names:
        middle_value = repaired[name].median()
        repaired[name] = repaired[name].fillna(middle_value)

    label_names = repaired.select_dtypes(include=['object', 'category']).columns
    for name in label_names:
        # mode() can return several tied values; the first one is used.
        most_common = repaired[name].mode()[0]
        repaired[name] = repaired[name].fillna(most_common)

    print("\nDataFrame after imputing numerical columns with the median and categorical with the mode:")
    print(repaired)
    print("\nMissing values after median/mode imputation:")
    print(repaired.isnull().sum())

    repaired.to_csv("employees_median_mode_imputed.csv", index=False)
    print("\nMedian/Mode imputed data saved to employees_median_mode_imputed.csv")
    return repaired
if __name__ == "__main__":
    # Driver for Task A: make sure an input CSV exists, then run the three
    # strategies against it and echo what each one returned.
    file_path = "employees.csv"
    try:
        # Opening the file is only an existence probe; nothing is read here.
        with open(file_path, 'r') as f:
            pass
    except FileNotFoundError:
        print(f"Creating sample CSV file: {file_path}")
        # Ten-row demo dataset with gaps in Age, Salary and Department.
        sample_df = pd.DataFrame(
            {
                'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Henry', 'Ivy', 'Jack'],
                'Age': [25, 30, None, 22, 28, 35, None, 40, 24, 29],
                'Salary': [50000, None, 60000, 45000, 55000, 70000, 65000, None, 48000, 52000],
                'Department': ['HR', 'Engineering', 'Engineering', 'Sales', None, 'Marketing', 'Marketing', 'Finance', 'HR', 'Sales'],
                'City': ['New York', 'Los Angeles', 'New York', 'Chicago', 'Los Angeles', 'Houston', 'Houston', 'New York', 'Chicago', 'Los Angeles'],
            }
        )
        sample_df.to_csv(file_path, index=False)

    # Each strategy reloads the CSV itself, so the runs are independent.
    drop_missing_data_df = drop_missing_data(file_path)
    impute_mean_df = impute_mean(file_path)
    impute_median_mode_df = impute_median_mode(file_path)

    for heading, frame in (
        ("\nReturned DataFrame from drop_missing_data():", drop_missing_data_df),
        ("\nReturned DataFrame from impute_mean():", impute_mean_df),
        ("\nReturned DataFrame from impute_median_mode():", impute_median_mode_df),
    ):
        print(heading)
        print(frame)
# Task B: Predictive Imputation

# 4. ML-based Imputation with Simple Imputer:
# - Use SimpleImputer from sklearn to fill missing values.
# - Choose a strategy (e.g., mean) and apply it to the dataset.

# 5. Imputation using a Regression Model:
# - Use a regression model to predict missing values.
# - Train the model on complete cases and fill the missing data.
# 6. K-Nearest Neighbors Imputation:
# - Use KNNImputer from sklearn.
# - Impute missing data based on neighbors' information.
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
def create_sample_dataframe():
    """
    Build the ten-row demonstration DataFrame used by the imputation examples.

    The frame has two numeric columns ('Age', 'Salary') and two text columns
    ('Department', 'City'); 'Age' and 'Salary' each contain two missing
    entries and 'Department' contains one.

    Returns:
        pd.DataFrame: The sample data, including its missing values.
    """
    ages = [25, 30, None, 22, 28, 35, None, 40, 24, 29]
    salaries = [50000, None, 60000, 45000, 55000, 70000, 65000, None, 48000, 52000]
    departments = ['HR', 'Engineering', 'Engineering', 'Sales', None, 'Marketing', 'Marketing', 'Finance', 'HR', 'Sales']
    cities = ['New York', 'Los Angeles', 'New York', 'Chicago', 'Los Angeles', 'Houston', 'Houston', 'New York', 'Chicago', 'Los Angeles']
    return pd.DataFrame({
        'Age': ages,
        'Salary': salaries,
        'Department': departments,
        'City': cities,
    })
def impute_with_simple_imputer(df):
    """
    Imputes missing values using scikit-learn's SimpleImputer with the mean strategy
    for numerical columns and the most frequent strategy for categorical columns.

    Columns that contain no observed value at all are left untouched:
    SimpleImputer has nothing to learn from them and, by default, drops them
    from its output, which would make the assignment back into the frame fail.
    A dtype group with no imputable columns is skipped for the same reason
    (fitting on zero features raises).

    Args:
        df (pd.DataFrame): The input DataFrame with missing values.

    Returns:
        pd.DataFrame: A new DataFrame with missing values imputed.
    """
    df_imputed = df.copy()

    # Only columns with at least one observed value can be imputed.
    numerical_cols = [
        col for col in df_imputed.select_dtypes(include=np.number).columns
        if df_imputed[col].notna().any()
    ]
    if numerical_cols:
        imputer_numerical = SimpleImputer(strategy='mean')
        df_imputed[numerical_cols] = imputer_numerical.fit_transform(df_imputed[numerical_cols])

    categorical_cols = [
        col for col in df_imputed.select_dtypes(include=['object', 'category']).columns
        if df_imputed[col].notna().any()
    ]
    if categorical_cols:
        imputer_categorical = SimpleImputer(strategy='most_frequent')
        df_imputed[categorical_cols] = imputer_categorical.fit_transform(df_imputed[categorical_cols])

    print("\nDataFrame after imputation with SimpleImputer (mean for numerical, mode for categorical):")
    print(df_imputed)
    print("\nMissing values after SimpleImputer imputation:")
    print(df_imputed.isnull().sum())
    return df_imputed
def impute_with_regression(df):
    """
    Imputes missing values in the 'Salary' column using a linear regression model.

    A LinearRegression of 'Salary' on 'Age' is fitted on the rows where both
    values are present, and its predictions fill the rows where 'Salary' is
    missing and 'Age' is known. Rows missing both values cannot be predicted
    and keep their missing 'Salary'. If there is nothing to predict, the data
    is returned unchanged; the same happens (with a printed notice) when there
    are no complete rows to train on. No other column is modified.

    Args:
        df (pd.DataFrame): The input DataFrame with missing values. Must
            contain 'Age' and 'Salary' columns.

    Returns:
        pd.DataFrame: A new DataFrame with missing values imputed in 'Salary'.
    """
    df_regression_imputed = df.copy()
    salary_df = df_regression_imputed[['Age', 'Salary']].copy()

    # Complete cases form the training set.
    train_data = salary_df.dropna()

    # Only rows with a known predictor can be passed to predict(); NaN inputs
    # would make scikit-learn raise.
    predictable = salary_df['Salary'].isnull() & salary_df['Age'].notnull()

    if predictable.any():
        if train_data.empty:
            print("\nSkipping regression imputation: no complete (Age, Salary) rows to train on.")
        else:
            model = LinearRegression()
            model.fit(train_data[['Age']], train_data['Salary'])
            predicted_salaries = model.predict(salary_df.loc[predictable, ['Age']])
            # A boolean mask assigns exactly one prediction per selected row,
            # even when the index has duplicate labels.
            df_regression_imputed.loc[predictable, 'Salary'] = predicted_salaries

    print("\nDataFrame after imputation with Regression Model (Salary):")
    print(df_regression_imputed)
    print("\nMissing values after Regression imputation:")
    print(df_regression_imputed.isnull().sum())
    return df_regression_imputed
def impute_with_knn(df, n_neighbors=2):
    """
    Imputes missing values using scikit-learn's KNNImputer.

    Numerical columns are imputed with KNNImputer; object/category columns are
    filled with their most frequent value, since KNNImputer only handles
    numeric data. Columns that contain no observed value at all are left
    untouched: KNNImputer drops such columns from its output by default (which
    would break the assignment back into the frame), and an all-missing
    categorical column has no mode.

    Args:
        df (pd.DataFrame): The input DataFrame with missing values.
        n_neighbors (int, optional): Number of neighbours passed to
            KNNImputer. Defaults to 2.

    Returns:
        pd.DataFrame: A new DataFrame with missing values imputed.
    """
    df_knn_imputed = df.copy()

    # Only columns with at least one observed value can be imputed.
    numerical_cols = [
        col for col in df_knn_imputed.select_dtypes(include=np.number).columns
        if df_knn_imputed[col].notna().any()
    ]
    if numerical_cols:
        knn_imputer = KNNImputer(n_neighbors=n_neighbors)
        df_knn_imputed[numerical_cols] = knn_imputer.fit_transform(df_knn_imputed[numerical_cols])

    categorical_cols = df_knn_imputed.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols:
        modes = df_knn_imputed[col].mode()
        # mode() is empty when every value is missing; nothing to fill with then.
        if not modes.empty:
            df_knn_imputed[col] = df_knn_imputed[col].fillna(modes.iloc[0])

    print("\nDataFrame after imputation with KNNImputer:")
    print(df_knn_imputed)
    print("\nMissing values after KNN Imputer imputation:")
    print(df_knn_imputed.isnull().sum())
    return df_knn_imputed
def main():
    """
    Run the three Task B imputation examples on the sample data.

    Builds the sample frame once, passes it to the SimpleImputer, regression
    and KNN imputation functions in that order, and then prints the frame
    each of them returned.
    """
    sample = create_sample_dataframe()

    # The list is built left to right, so the imputers run in the order shown.
    outcomes = [
        ("\n\nReturned DataFrame from impute_with_simple_imputer():", impute_with_simple_imputer(sample)),
        ("\n\nReturned DataFrame from impute_with_regression():", impute_with_regression(sample)),
        ("\n\nReturned DataFrame from impute_with_knn():", impute_with_knn(sample)),
    ]

    for heading, frame in outcomes:
        print(heading)
        print(frame)


if __name__ == "__main__":
    main()

Creating sample CSV file: employees.csv

Original DataFrame:
      Name   Age   Salary   Department         City
0    Alice  25.0  50000.0           HR     New York
1      Bob  30.0      NaN  Engineering  Los Angeles
2  Charlie   NaN  60000.0  Engineering     New York
3    David  22.0  45000.0        Sales      Chicago
4      Eve  28.0  55000.0          NaN  Los Angeles
5    Frank  35.0  70000.0    Marketing      Houston
6    Grace   NaN  65000.0    Marketing      Houston
7    Henry  40.0      NaN      Finance     New York
8      Ivy  24.0  48000.0           HR      Chicago
9     Jack  29.0  52000.0        Sales  Los Angeles

Missing values before dropping:
Name          0
Age           2
Salary        2
Department    1
City          0
dtype: int64

DataFrame after dropping rows with missing values:
    Name   Age   Salary Department         City
0  Alice  25.0  50000.0         HR     New York
3  David  22.0  45000.0      Sales      Chicago
5  Frank  35.0  70000.0  Marketing      Houst

In [2]:
# Task B: Predictive Imputation

# 4. ML-based Imputation with Simple Imputer:
# - Use SimpleImputer from sklearn to fill missing values.
# - Choose a strategy (e.g., mean) and apply it to the dataset.





# 5. Imputation using a Regression Model:
# - Use a regression model to predict missing values.
# - Train the model on complete cases and fill the missing data.




# 6. K-Nearest Neighbors Imputation:
# - Use KNNImputer from sklearn.
# - Impute missing data based on neighbors' information.




