In [4]:
# Activity 1: Handling Missing Data

# Task A: Dropping vs Imputation

# 1. Dropping Missing Data:
# - Load a dataset (e.g., a CSV file with some missing values like employees.csv ).
# - Inspect the dataset for missing values using a Python library (e.g., Pandas).
# - Drop rows with missing data and save the result.






# 2. Imputation using Mean:
# - Use the same dataset.
# - Fill missing numerical values with the column mean.
# - Save and display the modified data.









# 3. Imputation using Median and Mode:
# - For numerical columns, replace missing values with the median.
# - For categorical columns, use the mode.
# - Display the updated dataset.







In [5]:
# Task B: Predictive Imputation

# 4. ML-based Imputation with Simple Imputer:
# - Use SimpleImputer from sklearn to fill missing values.
# - Choose a strategy (e.g., mean) and apply it to the dataset.





# 5. Imputation using a Regression Model:
# - Use a regression model to predict missing values.
# - Train the model on complete cases and fill the missing data.




# 6. K-Nearest Neighbors Imputation:
# - Use KNNImputer from sklearn .
# - Impute missing data based on neighbors' information.






In [6]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression

# Sample dataset with missing values
data = {
    'Feature1': [1.0, 2.0, 3.0, None, 5.0],
    'Feature2': [2.5, None, 2.1, 3.3, None],
    'Target': [10, 15, 14, 13, 15]
}
df = pd.DataFrame(data)

print("Original Data:")
print(df)

# 4. ML-based Imputation with SimpleImputer (mean strategy)
simple_imputer = SimpleImputer(strategy='mean')
df_simple_imputed = pd.DataFrame(simple_imputer.fit_transform(df), columns=df.columns)
print("\nSimpleImputer Result:")
print(df_simple_imputed)


# 5. Imputation using a Regression Model
# Impute missing values in 'Feature1' using 'Feature2' and 'Target'

# Split data into rows with and without missing 'Feature1'
df_complete = df[df['Feature1'].notnull()]
df_missing = df[df['Feature1'].isnull()]

predictor_cols = ['Feature2', 'Target']

# Drop rows with missing predictors or target from training data
df_train = df_complete.dropna(subset=predictor_cols + ['Feature1'])

X_train = df_train[predictor_cols]
y_train = df_train['Feature1']

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# For rows missing 'Feature1', only predict if predictor data is complete (no NaNs)
X_missing = df_missing[predictor_cols].dropna()

predicted_values = regressor.predict(X_missing)

# Fill the missing 'Feature1' values where prediction was possible
df.loc[X_missing.index, 'Feature1'] = predicted_values

print("\nAfter Regression Imputation:")
print(df)


# 6. K-Nearest Neighbors Imputation
knn_imputer = KNNImputer(n_neighbors=2)
df_knn_imputed = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)

print("\nKNN Imputer Result:")
print(df_knn_imputed)


Original Data:
   Feature1  Feature2  Target
0       1.0       2.5      10
1       2.0       NaN      15
2       3.0       2.1      14
3       NaN       3.3      13
4       5.0       NaN      15

SimpleImputer Result:
   Feature1  Feature2  Target
0      1.00  2.500000    10.0
1      2.00  2.633333    15.0
2      3.00  2.100000    14.0
3      2.75  3.300000    13.0
4      5.00  2.633333    15.0

After Regression Imputation:
   Feature1  Feature2  Target
0  1.000000       2.5      10
1  2.000000       NaN      15
2  3.000000       2.1      14
3  2.445545       3.3      13
4  5.000000       NaN      15

KNN Imputer Result:
   Feature1  Feature2  Target
0  1.000000       2.5    10.0
1  2.000000       2.7    15.0
2  3.000000       2.1    14.0
3  2.445545       3.3    13.0
4  5.000000       2.7    15.0
