In [None]:
# Activity 1: Handling Missing Data

# Task A: Dropping vs Imputation

# 1. Dropping Missing Data:
# - Load a dataset (e.g., a CSV file with some missing values, such as employees.csv).
# - Inspect the dataset for missing values using a Python library (e.g., Pandas).
# - Drop rows with missing data and save the result.






# 2. Imputation using Mean:
# - Use the same dataset.
# - Fill missing numerical values with the column mean.
# - Save and display the modified data.









# 3. Imputation using Median and Mode:
# - For numerical columns, replace missing values with the median.
# - For categorical columns, use the mode.
# - Display the updated dataset.







In [1]:
import pandas as pd
import numpy as np

# Build a small in-memory stand-in for employees.csv so the notebook runs
# without the file (replace with pd.read_csv('employees.csv') if you have it).
data = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', np.nan, 'Eve'],
    'Age': [25, np.nan, 30, 45, np.nan],
    'Department': ['HR', 'Finance', np.nan, 'IT', 'HR'],
    'Salary': [50000, 60000, 70000, np.nan, 80000]
}
df = pd.DataFrame(data)
print("Original Dataset:")
print(df)

# Column groups shared by the imputation steps below.
# Numeric columns get mean/median imputation; text-like columns get mode
# imputation. "string" and "category" are listed next to object so that
# columns stored with a dedicated string or categorical dtype are not skipped.
numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(include=["object", "string", "category"]).columns

# ------------------------------
# 1. Dropping Missing Data
# ------------------------------
# dropna() returns a new frame; `df` itself is left untouched for later steps.
df_dropped = df.dropna()
print("\nAfter Dropping Rows with Missing Values:")
print(df_dropped)
df_dropped.to_csv("employees_dropped.csv", index=False)

# ------------------------------
# 2. Imputation using Mean
# ------------------------------
# fillna() with a Series keyed by column name fills each listed column with
# its own value and returns a new frame. This replaces the former
# `df[col].fillna(..., inplace=True)` pattern, which operates on a temporary
# Series and stops updating the parent frame under pandas Copy-on-Write.
df_mean_imputed = df.fillna(df[numeric_cols].mean())

print("\nAfter Imputation using Mean (Numeric Columns):")
print(df_mean_imputed)
df_mean_imputed.to_csv("employees_mean_imputed.csv", index=False)

# ------------------------------
# 3. Imputation using Median and Mode
# ------------------------------
# Median for numerical columns
df_med_mode_imputed = df.fillna(df[numeric_cols].median())

# Mode for categorical columns
for col in categorical_cols:
    col_modes = df_med_mode_imputed[col].mode()
    # mode() is empty when the column has no non-null values; there is nothing
    # to impute with in that case, so the column is left as-is instead of
    # raising on the first-element lookup.
    if col_modes.empty:
        continue
    # When several values tie, mode() returns them sorted; the first is used.
    df_med_mode_imputed[col] = df_med_mode_imputed[col].fillna(col_modes.iloc[0])

print("\nAfter Imputation using Median (Numerical) and Mode (Categorical):")
print(df_med_mode_imputed)
df_med_mode_imputed.to_csv("employees_med_mode_imputed.csv", index=False)


Original Dataset:
   EmployeeID     Name   Age Department   Salary
0           1    Alice  25.0         HR  50000.0
1           2      Bob   NaN    Finance  60000.0
2           3  Charlie  30.0        NaN  70000.0
3           4      NaN  45.0         IT      NaN
4           5      Eve   NaN         HR  80000.0

After Dropping Rows with Missing Values:
   EmployeeID   Name   Age Department   Salary
0           1  Alice  25.0         HR  50000.0

After Imputation using Mean (Numeric Columns):
   EmployeeID     Name        Age Department   Salary
0           1    Alice  25.000000         HR  50000.0
1           2      Bob  33.333333    Finance  60000.0
2           3  Charlie  30.000000        NaN  70000.0
3           4      NaN  45.000000         IT  65000.0
4           5      Eve  33.333333         HR  80000.0

After Imputation using Median (Numerical) and Mode (Categorical):
   EmployeeID     Name   Age Department   Salary
0           1    Alice  25.0         HR  50000.0
1           2  

In [None]:
# Task B: Predictive Imputation

# 4. ML-based Imputation with Simple Imputer:
# - Use SimpleImputer from sklearn to fill missing values.
# - Choose a strategy (e.g., mean) and apply it to the dataset.





# 5. Imputation using a Regression Model:
# - Use a regression model to predict missing values.
# - Train the model on complete cases and fill the missing data.




# 6. K-Nearest Neighbors Imputation:
# - Use KNNImputer from sklearn (sklearn.impute.KNNImputer).
# - Impute missing data based on neighbors' information.






In [4]:
# ------------------------------
# 5. Imputation using Regression Model (for 'Age')
# ------------------------------
# Imported here so the cell still runs after Restart Kernel -> Run All; no
# other visible cell brings LinearRegression into scope.
from sklearn.linear_model import LinearRegression

# Column being imputed and the predictor columns used to estimate it.
REG_TARGET = 'Age'
REG_FEATURES = ['Salary', 'Experience']

# NOTE(review): the recorded output of this cell shows a frame with only Age,
# Salary and Experience columns, while the employee table built earlier in the
# notebook has no 'Experience' column. The cell therefore appears to have been
# run against a `df` defined in a cell that is no longer present -- confirm
# which dataset is intended. Until then, fail fast with an actionable message
# instead of an opaque KeyError from dropna().
missing_cols = [c for c in [REG_TARGET, *REG_FEATURES] if c not in df.columns]
if missing_cols:
    raise KeyError(
        f"Regression imputation requires column(s) {missing_cols} that are not "
        "present in df; load a dataset that includes them before running this cell."
    )

# Work on a copy so `df` keeps its original missing values.
df_reg = df.copy()

# 1. Train on rows with complete Age, Salary, and Experience
df_reg_train = df_reg.dropna(subset=[REG_TARGET, *REG_FEATURES])
if df_reg_train.empty:
    raise ValueError(
        "No rows have Age, Salary and Experience all present; "
        "cannot train the regression imputer."
    )

X_train = df_reg_train[REG_FEATURES]
y_train = df_reg_train[REG_TARGET]
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

# 2. Predict for rows where Age is missing, and predictors are not missing
# Rows that are also missing a predictor cannot be scored, so their Age
# stays NaN after this step.
df_reg_missing = df_reg[df_reg[REG_TARGET].isnull()].dropna(subset=REG_FEATURES)

if not df_reg_missing.empty:
    X_missing = df_reg_missing[REG_FEATURES]
    predicted_ages = reg_model.predict(X_missing)
    # .loc with the original index labels writes predictions back to exactly
    # the rows they were computed for.
    df_reg.loc[df_reg_missing.index, REG_TARGET] = predicted_ages

print("\nAfter Regression Imputation (Age):")
print(df_reg)




After Regression Imputation (Age):
    Age   Salary  Experience
0  25.0  50000.0         1.0
1  27.5  60000.0         3.0
2  30.0  70000.0         5.0
3  45.0      NaN         8.0
4   NaN  80000.0         NaN
