In [1]:
# Question: Advanced Data Cleaning with Multiple Issues
# Objective: Handle multiple issues in one dataset, including missing values, duplicates, and outliers.
# Description: Given a dataset with various data quality issues, employ multiple data cleaning techniques.




In [2]:
# Question: Data Transformation Techniques
# Objective: Transform skewed data using log transformation.
# Description: Perform a log transformation to handle skewness in a dataset, which is particularly useful for
# certain machine learning models.



In [3]:
# Question: Feature Engineering by Creating New Features
# Objective: Create a new feature based on existing features to add predictive power.
# Description: Generate additional features from existing data to potentially improve the performance of
# prediction models.




In [4]:
# Question: Handling Complex Outliers with Z-Scores
# Objective: Detect and handle outliers using Z-score method.
# Description: Use the Z-score method to identify outliers which significantly differ from the rest of the data points.




In [5]:
# Question: Data Imputation with K-Nearest Neighbors (KNN)
# Objective: Impute missing numerical values using the KNN method.
# Description: Use the K-nearest neighbors algorithm to fill in missing values, which considers the values of
# nearest neighbors for imputation.




In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from scipy.stats import zscore

# ------------------------------------------------
# Step 1: Create Synthetic Dataset with Issues
# ------------------------------------------------
np.random.seed(0)

data = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5, 5, 6, 7],
    'Income': [50000, 55000, np.nan, 1200000, 52000, 52000, 58000, 61000],  # Outlier and missing
    'Age': [25, np.nan, 30, 28, 27, 27, 1000, 26],  # Outlier and missing
    'Expenses': [2000, 2100, 1900, 2500, np.nan, np.nan, 2200, 2300],  # Missing
    'Gender': ['M', 'F', 'F', 'M', 'F', 'F', 'M', 'F']
})

print("=== Original Data ===")
print(data)

# ------------------------------------------------
# Step 2: Remove Duplicates
# ------------------------------------------------
data = data.drop_duplicates()
print("\n=== After Removing Duplicates ===")
print(data)

# ------------------------------------------------
# Step 3: Handle Outliers Using Z-Score
# ------------------------------------------------
# Only apply Z-score to numeric columns
numeric_cols = ['Income', 'Age', 'Expenses']
z_scores = np.abs(zscore(data[numeric_cols], nan_policy='omit'))

# Define threshold (e.g., Z > 3 considered outlier)
outlier_mask = (z_scores < 3).all(axis=1)
data = data[outlier_mask]

print("\n=== After Removing Outliers with Z-Score ===")
print(data)

# ------------------------------------------------
# Step 4: Impute Missing Values with KNN
# ------------------------------------------------
# KNN Imputation on numeric columns
knn_imputer = KNNImputer(n_neighbors=2)
data[numeric_cols] = knn_imputer.fit_transform(data[numeric_cols])

print("\n=== After KNN Imputation ===")
print(data)

# ------------------------------------------------
# Step 5: Apply Log Transformation (e.g., Income, Expenses)
# ------------------------------------------------
data['Log_Income'] = np.log1p(data['Income'])  # log1p = log(x+1)
data['Log_Expenses'] = np.log1p(data['Expenses'])

print("\n=== After Log Transformation ===")
print(data[['Income', 'Log_Income', 'Expenses', 'Log_Expenses']])

# ------------------------------------------------
# Step 6: Feature Engineering (e.g., Savings = Income - Expenses)
# ------------------------------------------------
data['Savings'] = data['Income'] - data['Expenses']

print("\n=== After Feature Engineering (Savings) ===")
print(data[['Income', 'Expenses', 'Savings']])

# ------------------------------------------------
# Final Cleaned Dataset
# ------------------------------------------------
print("\n=== Final Cleaned & Transformed Dataset ===")
print(data)


=== Original Data ===
   ID     Income     Age  Expenses Gender
0   1    50000.0    25.0    2000.0      M
1   2    55000.0     NaN    2100.0      F
2   3        NaN    30.0    1900.0      F
3   4  1200000.0    28.0    2500.0      M
4   5    52000.0    27.0       NaN      F
5   5    52000.0    27.0       NaN      F
6   6    58000.0  1000.0    2200.0      M
7   7    61000.0    26.0    2300.0      F

=== After Removing Duplicates ===
   ID     Income     Age  Expenses Gender
0   1    50000.0    25.0    2000.0      M
1   2    55000.0     NaN    2100.0      F
2   3        NaN    30.0    1900.0      F
3   4  1200000.0    28.0    2500.0      M
4   5    52000.0    27.0       NaN      F
6   6    58000.0  1000.0    2200.0      M
7   7    61000.0    26.0    2300.0      F

=== After Removing Outliers with Z-Score ===
   ID     Income     Age  Expenses Gender
0   1    50000.0    25.0    2000.0      M
3   4  1200000.0    28.0    2500.0      M
6   6    58000.0  1000.0    2200.0      M
7   7    61000.