# Ausrüster AG use-case

## KNN Imputation in real life

Author: Dr. Stephan Hausberg, Winter semester 2024

1. Step: Generate synthetic customer data, including missing values

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
np.random.seed(0)

# Generating synthetic customer data
num_customers = 100

# NOTE: np.random.randint excludes its upper bound, so each integer range
# below ends one below the `high` argument that is passed in.
data = {
    'Age': np.random.randint(18, 70, size=num_customers),                 # Age: 18 to 69
    'Income': np.random.randint(20000, 120000, size=num_customers),       # Income: $20,000 to $119,999
    'Gender': np.random.choice(['Male', 'Female'], size=num_customers),   # Gender: Male or Female
    'Purchase_Frequency': np.random.randint(1, 20, size=num_customers),   # Purchase Frequency: 1 to 19 times per year
    'Average_Purchase_Amount': np.random.uniform(50, 500, size=num_customers)  # Average Purchase Amount: $50 (inclusive) to $500 (exclusive)
}

# Create DataFrame. 'Age' and 'Income' are cast to float up front because
# NaN cannot be stored in an integer column; relying on pandas to upcast
# silently on assignment is deprecated (FutureWarning in recent releases).
df = pd.DataFrame(data).astype({'Age': 'float64', 'Income': 'float64'})

# Introduce missing values randomly in 'Age' and 'Income' columns
missing_rate = 0.1  # 10% missing rate
num_missing = int(num_customers * missing_rate)

# Randomly set some ages and incomes to NaN.
# The two index samples are drawn independently, so a row may end up with
# both values missing.
missing_age_indices = np.random.choice(df.index, num_missing, replace=False)
missing_income_indices = np.random.choice(df.index, num_missing, replace=False)
df.loc[missing_age_indices, 'Age'] = np.nan
df.loc[missing_income_indices, 'Income'] = np.nan

# The first few rows are displayed in the next cell.


In [None]:
# Show the first ten rows; a bare last expression uses the notebook's rich
# table rendering instead of the plain-text output of print().
df.head(10)

2. Step: Imputation via the KNN imputer

In [None]:
from sklearn.impute import KNNImputer

# Encode 'Gender' as a numeric value to use it in KNN imputation.
# The guard makes this cell safe to re-run: mapping an already-encoded
# column through the {'Male', 'Female'} dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

# Initialize the KNN imputer
# NOTE(review): the features are passed in unscaled, so with a Euclidean-type
# distance the large-valued 'Income' column will presumably dominate the
# neighbour search -- consider scaling the features before imputing.
imputer = KNNImputer(n_neighbors=5)

# Apply imputation. fit_transform returns a plain array, so the column names
# and the row index are re-attached explicitly.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

# Re-convert 'Gender' column back to categorical values for clarity
df_imputed['Gender'] = df_imputed['Gender'].round().map({0: 'Male', 1: 'Female'})

# Display the imputed DataFrame (rich notebook rendering of the first ten rows)
df_imputed.head(10)