In [3]:


import pandas as pd
from sklearn.impute import KNNImputer
import numpy as np

# Toy frame in which every column has exactly one gap (replace with your actual data loading)
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[6, np.nan, 8, 9, 10],
    C=['X', 'Y', 'X', np.nan, 'Y'],
    D=[11, 12, 13, np.nan, 15],
)
df = pd.DataFrame(data)

# 1. Introduction to Missing Data
# Boolean mask that is True wherever a cell is missing; summing it counts gaps per column.
missing_mask = df.isna()
print("Initial DataFrame:", df, sep="\n")
print("\nMissing values:", missing_mask, sep="\n")
print("\nSum of missing values per column:", missing_mask.sum(), sep="\n")

# 2. Dropping Rows with Missing Values
# Keeps only the rows that are complete in every column.
df_dropped_rows = df.dropna(axis='index', how='any')
print("\nDataFrame after dropping rows with missing values:", df_dropped_rows, sep="\n")

# 3. Dropping Columns with Missing Values
# With this sample every column contains a gap, so no column survives.
df_dropped_cols = df.dropna(axis='columns', how='any')
print("\nDataFrame after dropping columns with missing values:", df_dropped_cols, sep="\n")

# 4. Mean Imputation for Numerical Data
# assign() returns a new frame, so df itself keeps its gaps for the later sections.
df_mean_imputed = df.assign(A=df['A'].fillna(df['A'].mean()))
print("\nDataFrame after mean imputation for column 'A':", df_mean_imputed, sep="\n")

# 5. Mode Imputation for Categorical Data
# mode() can return several values on a tie; the first one is used as the fill value.
df_mode_imputed = df.assign(C=df['C'].fillna(df['C'].mode()[0]))
print("\nDataFrame after mode imputation for column 'C':", df_mode_imputed, sep="\n")

# 6. Median Imputation for Skewed Data
df_median_imputed = df.assign(B=df['B'].fillna(df['B'].median()))
print("\nDataFrame after median imputation for column 'B':", df_median_imputed, sep="\n")

# 7. KNN Imputation
# Only the numeric columns are passed to the imputer; the categorical column 'C'
# is left out of the fit and re-attached afterwards.
numeric_cols = ['A', 'B', 'D']
imputer = KNNImputer(n_neighbors=2)  # You can adjust n_neighbors
# fit_transform returns a plain array by default, so the frame is rebuilt with
# the original column names AND the original index. Without index=df.index the
# new frame would get a fresh 0..n-1 index, and the label-aligned concat below
# would mismatch rows whenever df carries a non-default index (e.g. real data
# that was filtered or indexed by an id column).
df_knn_imputed = pd.DataFrame(
    imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols,
    index=df.index,
)
# Concatenate with non-numerical columns
df_knn_imputed = pd.concat([df_knn_imputed, df['C']], axis=1)
print("\nDataFrame after KNN imputation for numerical columns:")
# Bare last expression so the notebook renders the frame as the cell's output.
df_knn_imputed


Initial DataFrame:
     A     B    C     D
0  1.0   6.0    X  11.0
1  2.0   NaN    Y  12.0
2  NaN   8.0    X  13.0
3  4.0   9.0  NaN   NaN
4  5.0  10.0    Y  15.0

Missing values:
       A      B      C      D
0  False  False  False  False
1  False   True  False  False
2   True  False  False  False
3  False  False   True   True
4  False  False  False  False

Sum of missing values per column:
A    1
B    1
C    1
D    1
dtype: int64

DataFrame after dropping rows with missing values:
     A     B  C     D
0  1.0   6.0  X  11.0
4  5.0  10.0  Y  15.0

DataFrame after dropping columns with missing values:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]

DataFrame after mean imputation for column 'A':
     A     B    C     D
0  1.0   6.0    X  11.0
1  2.0   NaN    Y  12.0
2  3.0   8.0    X  13.0
3  4.0   9.0  NaN   NaN
4  5.0  10.0    Y  15.0

DataFrame after mode imputation for column 'C':
     A     B  C     D
0  1.0   6.0  X  11.0
1  2.0   NaN  Y  12.0
2  NaN   8.0  X  13.0
3  4.0   9

Unnamed: 0,A,B,D,C
0,1.0,6.0,11.0,X
1,2.0,7.0,12.0,Y
2,3.0,8.0,13.0,X
3,4.0,9.0,14.0,
4,5.0,10.0,15.0,Y
