In [13]:
import pandas as pd
import numpy as np

# Small customer table used by every cell below. Each column except
# 'Customer ID' contains exactly one np.nan, so every strategy has a gap to handle.
data = dict([
    ('Customer ID', ['C-001', 'C-002', 'C-003', 'C-004', 'C-005']),
    ('Name', ['John Doe', 'Jane Smith', np.nan, 'Mike Ross', 'Rachel Zane']),
    ('Gender', ['M', 'F', 'F', 'M', np.nan]),
    ('Age', [28, 34, 29, np.nan, 24]),
    ('Income (USD)', [50000, np.nan, 60000, 52000, 49000]),
    ('Profession', ['Engineer', 'Doctor', 'Artist', 'Lawyer', np.nan]),
])

# Keys become column names, in insertion order.
df = pd.DataFrame(data=data)
print("Original DataFrame:", df, sep="\n")


Original DataFrame:
  Customer ID         Name Gender   Age  Income (USD) Profession
0       C-001     John Doe      M  28.0       50000.0   Engineer
1       C-002   Jane Smith      F  34.0           NaN     Doctor
2       C-003          NaN      F  29.0       60000.0     Artist
3       C-004    Mike Ross      M   NaN       52000.0     Lawyer
4       C-005  Rachel Zane    NaN  24.0       49000.0        NaN


Remove Rows with Any Null Values

In [14]:
# Row-wise drop: a row is discarded as soon as any one of its cells is missing.
# dropna returns a new frame, so `df` itself is left unchanged.
df_dropna = df.dropna(axis=0, how='any')
print("\nDataFrame after dropping rows with any null values:", df_dropna, sep="\n")



DataFrame after dropping rows with any null values:
  Customer ID      Name Gender   Age  Income (USD) Profession
0       C-001  John Doe      M  28.0       50000.0   Engineer


Remove Columns with Any Null Values

In [15]:
# Column-wise drop: a column is discarded as soon as any one of its cells is missing.
# dropna returns a new frame, so `df` itself is left unchanged.
df_dropna_columns = df.dropna(axis='columns', how='any')
print("\nDataFrame after dropping columns with any null values:", df_dropna_columns, sep="\n")



DataFrame after dropping columns with any null values:
  Customer ID
0       C-001
1       C-002
2       C-003
3       C-004
4       C-005


Impute with Constant Value

In [16]:
# Only the three text columns receive the placeholder; columns that are not
# listed in the mapping (Age, Income) keep their missing values.
df_impute_constant = df.fillna(
    value=dict.fromkeys(['Name', 'Gender', 'Profession'], 'Unknown')
)
print("\nDataFrame after imputing with a constant value:", df_impute_constant, sep="\n")



DataFrame after imputing with a constant value:
  Customer ID         Name   Gender   Age  Income (USD) Profession
0       C-001     John Doe        M  28.0       50000.0   Engineer
1       C-002   Jane Smith        F  34.0           NaN     Doctor
2       C-003      Unknown        F  29.0       60000.0     Artist
3       C-004    Mike Ross        M   NaN       52000.0     Lawyer
4       C-005  Rachel Zane  Unknown  24.0       49000.0    Unknown


Impute with Mean Value

In [17]:
# Fill each numeric column with its own mean.
# A column -> value mapping passed to DataFrame.fillna returns a new frame, so
# `df` stays untouched. This replaces the chained
# `frame[col].fillna(..., inplace=True)` pattern, which is deprecated in
# pandas 2.x and does not update the parent frame under Copy-on-Write.
df_impute_mean = df.fillna({
    'Age': df['Age'].mean(),
    'Income (USD)': df['Income (USD)'].mean(),
})
print("\nDataFrame after imputing with mean value:")
print(df_impute_mean)



DataFrame after imputing with mean value:
  Customer ID         Name Gender    Age  Income (USD) Profession
0       C-001     John Doe      M  28.00       50000.0   Engineer
1       C-002   Jane Smith      F  34.00       52750.0     Doctor
2       C-003          NaN      F  29.00       60000.0     Artist
3       C-004    Mike Ross      M  28.75       52000.0     Lawyer
4       C-005  Rachel Zane    NaN  24.00       49000.0        NaN


Impute with Median Value

In [18]:
# Fill each numeric column with its own median.
# A column -> value mapping passed to DataFrame.fillna returns a new frame, so
# `df` stays untouched. This replaces the chained
# `frame[col].fillna(..., inplace=True)` pattern, which is deprecated in
# pandas 2.x and does not update the parent frame under Copy-on-Write.
df_impute_median = df.fillna({
    'Age': df['Age'].median(),
    'Income (USD)': df['Income (USD)'].median(),
})
print("\nDataFrame after imputing with median value:")
print(df_impute_median)



DataFrame after imputing with median value:
  Customer ID         Name Gender   Age  Income (USD) Profession
0       C-001     John Doe      M  28.0       50000.0   Engineer
1       C-002   Jane Smith      F  34.0       51000.0     Doctor
2       C-003          NaN      F  29.0       60000.0     Artist
3       C-004    Mike Ross      M  28.5       52000.0     Lawyer
4       C-005  Rachel Zane    NaN  24.0       49000.0        NaN


Impute with Mode Value

In [19]:
# Fill the categorical column with its most frequent value.
# mode() returns a Series (several values when there is a tie), so [0] takes
# the first entry. With the M/F/F/M data above that is a tie, and 'F' is the
# value that ends up in the gap.
# DataFrame.fillna with a mapping returns a new frame. This replaces the chained
# `frame[col].fillna(..., inplace=True)` pattern, which is deprecated in
# pandas 2.x and does not update the parent frame under Copy-on-Write.
df_impute_mode = df.fillna({'Gender': df['Gender'].mode()[0]})
print("\nDataFrame after imputing with mode value:")
print(df_impute_mode)



DataFrame after imputing with mode value:
  Customer ID         Name Gender   Age  Income (USD) Profession
0       C-001     John Doe      M  28.0       50000.0   Engineer
1       C-002   Jane Smith      F  34.0           NaN     Doctor
2       C-003          NaN      F  29.0       60000.0     Artist
3       C-004    Mike Ross      M   NaN       52000.0     Lawyer
4       C-005  Rachel Zane      F  24.0       49000.0        NaN


Forward Fill

In [20]:
# Forward fill: each gap takes the last valid value above it in the same column.
# DataFrame.ffill() is the dedicated method; fillna(method='ffill') is deprecated.
# A gap in the first row would stay missing, since nothing precedes it.
df_ffill = df.ffill()
print("\nDataFrame after forward fill:")
print(df_ffill)



DataFrame after forward fill:
  Customer ID         Name Gender   Age  Income (USD) Profession
0       C-001     John Doe      M  28.0       50000.0   Engineer
1       C-002   Jane Smith      F  34.0       50000.0     Doctor
2       C-003   Jane Smith      F  29.0       60000.0     Artist
3       C-004    Mike Ross      M  29.0       52000.0     Lawyer
4       C-005  Rachel Zane      M  24.0       49000.0     Lawyer


Backward Fill

In [21]:
# Backward fill: each gap takes the next valid value below it in the same column.
# DataFrame.bfill() is the dedicated method; fillna(method='bfill') is deprecated.
# Gaps in the last row (Gender and Profession here) stay missing, since nothing
# follows them.
df_bfill = df.bfill()
print("\nDataFrame after backward fill:")
print(df_bfill)



DataFrame after backward fill:
  Customer ID         Name Gender   Age  Income (USD) Profession
0       C-001     John Doe      M  28.0       50000.0   Engineer
1       C-002   Jane Smith      F  34.0       60000.0     Doctor
2       C-003    Mike Ross      F  29.0       60000.0     Artist
3       C-004    Mike Ross      M  24.0       52000.0     Lawyer
4       C-005  Rachel Zane    NaN  24.0       49000.0        NaN


Impute with KNN

In [22]:
from sklearn.impute import KNNImputer

# Each missing numeric cell is estimated from the 2 most similar rows.
imputer = KNNImputer(n_neighbors=2)

# KNN imputation is applied to the numeric columns only.
# NOTE(review): the columns are passed unscaled, so the much larger Income values
# probably dominate the neighbour distance - consider scaling first.
df_numeric = df[['Age', 'Income (USD)']]

# fit_transform returns a bare array. Re-attach both the column labels and the
# original index so the assignments below align row-for-row even when `df` does
# not have a default 0..n-1 index.
df_numeric_imputed = pd.DataFrame(
    imputer.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,
)

# Work on a copy so `df` keeps its missing values for the other cells.
df_knn = df.copy()
df_knn['Age'] = df_numeric_imputed['Age']
df_knn['Income (USD)'] = df_numeric_imputed['Income (USD)']

print("\nDataFrame after KNN imputation:")
print(df_knn)



DataFrame after KNN imputation:
  Customer ID         Name Gender   Age  Income (USD) Profession
0       C-001     John Doe      M  28.0       50000.0   Engineer
1       C-002   Jane Smith      F  34.0       55000.0     Doctor
2       C-003          NaN      F  29.0       60000.0     Artist
3       C-004    Mike Ross      M  26.0       52000.0     Lawyer
4       C-005  Rachel Zane    NaN  24.0       49000.0        NaN


Impute Categorical Data with Most Frequent Value and Numerical Data with KNN

In [23]:
from sklearn.preprocessing import LabelEncoder

# Combined strategy: KNN for the numeric columns, most-frequent value for Gender.
label_encoder = LabelEncoder()
df_encoded = df.copy()
# astype(str) turns the missing Gender into the literal string 'nan' so the
# encoder can fit; that placeholder gets an integer code like any real category.
df_encoded['Gender'] = df_encoded['Gender'].astype(str)
df_encoded['Gender'] = label_encoder.fit_transform(df_encoded['Gender'])

# Reuses the `imputer` (KNNImputer) created in the KNN cell above.
df_encoded_numeric = df_encoded[['Age', 'Income (USD)']]
# Keep the original index so the column assignments below align row-for-row
# even when `df` does not have a default 0..n-1 index.
df_encoded_numeric_imputed = pd.DataFrame(
    imputer.fit_transform(df_encoded_numeric),
    columns=df_encoded_numeric.columns,
    index=df_encoded_numeric.index,
)

df_combined = df_encoded.copy()
df_combined['Age'] = df_encoded_numeric_imputed['Age']
df_combined['Income (USD)'] = df_encoded_numeric_imputed['Income (USD)']

# Blank out the codes of rows whose Gender was missing in `df`. The mask comes
# from the original column rather than from label_encoder.classes_[-1]: the last
# class is only the 'nan' placeholder if it happens to sort last AND a missing
# value exists; otherwise a genuine category would be wiped.
df_combined['Gender'] = df_combined['Gender'].where(df['Gender'].notna())
# Plain assignment instead of a chained `fillna(..., inplace=True)`, which is
# deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
df_combined['Gender'] = df_combined['Gender'].fillna(df_combined['Gender'].mode()[0])

# Codes became floats when the gaps were introduced; cast back before decoding.
df_combined['Gender'] = label_encoder.inverse_transform(df_combined['Gender'].astype(int))

print("\nDataFrame after combined KNN imputation for numerical and mode for categorical columns:")
print(df_combined)



DataFrame after combined KNN imputation for numerical and mode for categorical columns:
  Customer ID         Name Gender   Age  Income (USD) Profession
0       C-001     John Doe      M  28.0       50000.0   Engineer
1       C-002   Jane Smith      F  34.0       55000.0     Doctor
2       C-003          NaN      F  29.0       60000.0     Artist
3       C-004    Mike Ross      M  26.0       52000.0     Lawyer
4       C-005  Rachel Zane      F  24.0       49000.0        NaN
