In [6]:
# Activity 1: Handling Missing Data

# Task A: Dropping vs Imputation

# 1. Dropping Missing Data:
# - Load a dataset (e.g., a CSV file with some missing values like employees.csv ).
# - Inspect the dataset for missing values using a Python library (e.g., Pandas).
# - Drop rows with missing data and save the result.






# 2. Imputation using Mean:
# - Use the same dataset.
# - Fill missing numerical values with the column mean.
# - Save and display the modified data.









# 3. Imputation using Median and Mode:
# - For numerical columns, replace missing values with the median.
# - For categorical columns, use the mode.
# - Display the updated dataset.







In [7]:
import pandas as pd
import numpy as np

# Step 1: Create a sample DataFrame with missing values
data = {
    'EmployeeID': [101, 102, 103, 104, 105],
    'Age': [25, np.nan, 30, 22, np.nan],
    'Department': ['HR', 'IT', None, 'Finance', 'IT'],
    'Salary': [50000, 60000, np.nan, 45000, 52000]
}

df = pd.DataFrame(data)
print("Original DataFrame with Missing Values:")
print(df)

# Step 2: Dropping rows with any missing data
df_dropped = df.dropna()
print("\nDataFrame after Dropping Rows with Missing Data:")
print(df_dropped)

# Step 3: Imputation using Mean for numerical columns
df_mean_imputed = df.copy()
df_mean_imputed['Age'].fillna(df_mean_imputed['Age'].mean(), inplace=True)
df_mean_imputed['Salary'].fillna(df_mean_imputed['Salary'].mean(), inplace=True)
print("\nDataFrame after Mean Imputation:")
print(df_mean_imputed)

# Step 4: Imputation using Median for numerical and Mode for categorical columns
df_median_mode_imputed = df.copy()

# Median for numerical columns
df_median_mode_imputed['Age'].fillna(df_median_mode_imputed['Age'].median(), inplace=True)
df_median_mode_imputed['Salary'].fillna(df_median_mode_imputed['Salary'].median(), inplace=True)

# Mode for categorical columns
mode_department = df_median_mode_imputed['Department'].mode()[0]
df_median_mode_imputed['Department'].fillna(mode_department, inplace=True)

print("\nDataFrame after Median and Mode Imputation:")
print(df_median_mode_imputed)


Original DataFrame with Missing Values:
   EmployeeID   Age Department   Salary
0         101  25.0         HR  50000.0
1         102   NaN         IT  60000.0
2         103  30.0       None      NaN
3         104  22.0    Finance  45000.0
4         105   NaN         IT  52000.0

DataFrame after Dropping Rows with Missing Data:
   EmployeeID   Age Department   Salary
0         101  25.0         HR  50000.0
3         104  22.0    Finance  45000.0

DataFrame after Mean Imputation:
   EmployeeID        Age Department   Salary
0         101  25.000000         HR  50000.0
1         102  25.666667         IT  60000.0
2         103  30.000000       None  51750.0
3         104  22.000000    Finance  45000.0
4         105  25.666667         IT  52000.0

DataFrame after Median and Mode Imputation:
   EmployeeID   Age Department   Salary
0         101  25.0         HR  50000.0
1         102  25.0         IT  60000.0
2         103  30.0         IT  51000.0
3         104  22.0    Finance  45000.0
4

In [8]:
# Task B: Predictive Imputation

# 4. ML-based Imputation with Simple Imputer:
# - Use SimpleImputer from sklearn to fill missing values.
# - Choose a strategy (e.g., mean) and apply it to the dataset.





# 5. Imputation using a Regression Model:
# - Use a regression model to predict missing values.
# - Train the model on complete cases and fill the missing data.




# 6. K-Nearest Neighbors Imputation:
# - Use KNNImputer from sklearn .
# - Impute missing data based on neighbors' information.






In [9]:
import pandas as pd
import numpy as np

# For ML imputations
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression

# Sample data with missing values
data = {
    'Age': [25, np.nan, 30, 22, np.nan, 28, 35],
    'Salary': [50000, 60000, np.nan, 45000, 52000, 58000, np.nan],
    'Experience': [2, 5, 7, 1, 3, 4, 6]
}

df = pd.DataFrame(data)
print("Original Data with Missing Values:")
print(df)

# --- 4. SimpleImputer (mean) ---
imputer = SimpleImputer(strategy='mean')
df_simple_imputed = df.copy()
df_simple_imputed[['Age', 'Salary']] = imputer.fit_transform(df_simple_imputed[['Age', 'Salary']])
print("\nAfter SimpleImputer Mean Strategy:")
print(df_simple_imputed)

# --- 5. Regression-based imputation for Salary based on Age and Experience ---
# Separate rows with and without missing Salary
train_df = df.dropna(subset=['Salary'])  # rows with Salary
test_df = df[df['Salary'].isnull()]      # rows missing Salary

# Train regression model
reg = LinearRegression()
X_train = train_df[['Age', 'Experience']]
y_train = train_df['Salary']
reg.fit(X_train, y_train)

# Predict missing Salary
X_test = test_df[['Age', 'Experience']]

# Note: If Age is missing in test_df, those rows would need separate handling.
# Here we drop missing Age rows for simplicity
X_test = X_test.dropna()
predicted_salary = reg.predict(X_test)

# Fill the predicted values back
df_reg_imputed = df.copy()
df_reg_imputed.loc[X_test.index, 'Salary'] = predicted_salary

print("\nAfter Regression-based Imputation for Salary:")
print(df_reg_imputed)

# --- 6. KNN Imputer ---
knn_imputer = KNNImputer(n_neighbors=2)
df_knn_imputed = df.copy()
df_knn_imputed[['Age', 'Salary', 'Experience']] = knn_imputer.fit_transform(df_knn_imputed[['Age', 'Salary', 'Experience']])
print("\nAfter KNN Imputer:")
print(df_knn_imputed)


Original Data with Missing Values:
    Age   Salary  Experience
0  25.0  50000.0           2
1   NaN  60000.0           5
2  30.0      NaN           7
3  22.0  45000.0           1
4   NaN  52000.0           3
5  28.0  58000.0           4
6  35.0      NaN           6

After SimpleImputer Mean Strategy:
    Age   Salary  Experience
0  25.0  50000.0           2
1  28.0  60000.0           5
2  30.0  53000.0           7
3  22.0  45000.0           1
4  28.0  52000.0           3
5  28.0  58000.0           4
6  35.0  53000.0           6


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values