# **Preparation Notebook**



---
## Setup Environment

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
!pip install -q utstd

from utstd.folders import *
from utstd.ipyrenders import *

at = AtFolder(
    course_code=36106,
    assignment="AT3",
)
at.run()

import warnings
warnings.simplefilter(action='ignore')

---
## Student Information

In [None]:
# Submission identifiers; each value is displayed by one of the print_tile cells below.
group_name = "AT3-group 12"
student_name = "CEWANG"
student_id = "25687207"

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h1", key='group_name', value=group_name)

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h1", key='student_name', value=student_name)

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h1", key='student_id', value=student_id)

---
## 0. Python Packages

### 0.a Install Additional Packages

> If you are using additional packages, you need to install them here using the command: `! pip install <package_name>`

In [None]:
!pip install -q scikit-learn matplotlib seaborn

### 0.b Import Packages

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
import pandas as pd
import altair as alt

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

---
## A. Feature Selection


### A.0 Load Data

In [None]:
# Load the cleaned customer and product-category tables.
# If they cannot be read, fall back to the raw CSV files. The raw DataFrames
# (customers_df / product_cats_df) are only created later, in Section B, so
# referencing them here would fail with a NameError on a fresh kernel.
try:
    customers_cleaned = pd.read_csv(at.folder_path / "data" / "customers_cleaned.csv")
    product_cats_cleaned = pd.read_csv(at.folder_path / "data" / "product_cats_cleaned.csv")
    print("Loaded cleaned datasets successfully")
    print(f"Customers shape: {customers_cleaned.shape}")
    print(f"Product categories shape: {product_cats_cleaned.shape}")
except Exception as e:
    print(f"Error loading cleaned datasets: {e}")
    print("Falling back to the raw customers.csv / product_cats.csv files")

    # NOTE(review): the raw files may not contain the engineered columns
    # (age_group, income_category, ...) selected further down — confirm.
    customers_cleaned = pd.read_csv(at.folder_path / "customers.csv")
    product_cats_cleaned = pd.read_csv(at.folder_path / "product_cats.csv")

### A.1 Approach 1

In [None]:
# Approach 1: group candidate features by business meaning, then inspect the
# pairwise correlations of the numeric columns to spot redundant ones.
print("=== Business-Driven Feature Selection ===\n")

demographic_features = ['age', 'marital_status', 'number_dependents']
socioeconomic_features = ['annual_income', 'education_level', 'occupation', 'homeowner']
behavioral_features = []  # left empty: no behavioural columns are listed here
engineered_features = [
    'age_group', 'income_category', 'lifestyle_segment',
    'family_status', 'customer_value_score', 'value_segment',
]

print("Key Feature Categories for Customer Segmentation:")
category_labels = (
    ("Demographic Features", demographic_features),
    ("Socioeconomic Features", socioeconomic_features),
    ("Engineered Features", engineered_features),
)
for position, (label, members) in enumerate(category_labels, start=1):
    print(f"{position}. {label}: {members}")

print("\nCorrelation Analysis of Numerical Features:")
numerical_features = customers_cleaned.select_dtypes(include=['number']).columns
correlation_matrix = customers_cleaned[numerical_features].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

print("High correlation (>0.7) pairs:")
high_corr_pairs = []
corr_columns = correlation_matrix.columns
# Walk the upper triangle only, so each pair is reported once and the diagonal is skipped.
for row_pos, first_name in enumerate(corr_columns):
    for col_pos in range(row_pos + 1, len(corr_columns)):
        coefficient = correlation_matrix.iloc[row_pos, col_pos]
        if not abs(coefficient) > 0.7:
            continue
        second_name = corr_columns[col_pos]
        high_corr_pairs.append((first_name, second_name, coefficient))
        print(f"  {first_name} vs {second_name}: {coefficient:.3f}")

In [None]:
# Narrative summary for Approach 1; rendered by the print_tile cell that follows.
feature_selection_1_insights = """
Approach: Business-Driven Feature Selection
Rationale: For customer segmentation, we prioritize features that directly describe customer characteristics, behaviors, and value potential.

Key Insights from Correlation Analysis:
- Identified highly correlated feature pairs that may cause multicollinearity
- Customer value score shows moderate correlation with income and education scores (as expected)
- Age shows low correlation with other features, making it valuable for segmentation

Selected Feature Categories:
1. Demographic: Age, marital status, dependents (capture life stage)
2. Socioeconomic: Income, education, occupation, homeownership (reflect purchasing power)
3. Engineered: Lifestyle segments, value scores (provide business context)

Business Justification: These features enable identification of meaningful customer segments like "Affluent Professionals", "Budget-Conscious Families", etc., which are actionable for marketing strategies.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='feature_selection_1_insights', value=feature_selection_1_insights)

### A.2 Approach 2

In [None]:
# Approach 2: list the derived score columns treated as redundant, then build
# the working frame from the retained raw and engineered columns.
print("=== Multicollinearity Handling and Final Feature Selection ===\n")

features_to_remove = [
    'income_score', 'age_score', 'education_score',
    'homeowner_score', 'occupation_score',
]

print("Features removed due to high correlation:")
print("\n".join(f"  - {dropped}" for dropped in features_to_remove))

final_features = [
    # demographic
    'age', 'marital_status', 'number_dependents',
    # socioeconomic
    'annual_income', 'education_level', 'occupation', 'homeowner',
    # engineered segments
    'age_group', 'income_category', 'lifestyle_segment', 'family_status', 'value_segment',
]

print(f"\nFinal selected features ({len(final_features)}):")
for rank, column in enumerate(final_features, start=1):
    print(f"  {rank:2d}. {column}")

# Independent copy, so later cleaning steps never touch customers_cleaned.
X_selected = customers_cleaned.loc[:, final_features].copy()

print(f"\nSelected dataset shape: {X_selected.shape}")
print("\nData types of selected features:")
print(X_selected.dtypes.value_counts())

print("\nMissing values in selected features:")
missing_values = X_selected.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

In [None]:
# Narrative summary for Approach 2; rendered by the print_tile cell that follows.
# The feature count is 12, matching the 12 names listed in final_features.
feature_selection_2_insights = """
Approach: Multicollinearity Handling and Final Feature Selection
Rationale: Remove redundant features to improve clustering performance and interpretability.

Key Actions:
- Removed derived score features that showed perfect correlation with original features
- Kept original categorical features (education_level, occupation, homeowner) for better interpretability
- Retained engineered business segments that provide actionable insights

Final Feature Set (12 features):
- Demographic: age, marital_status, number_dependents
- Socioeconomic: annual_income, education_level, occupation, homeowner
- Engineered: age_group, income_category, lifestyle_segment, family_status, value_segment

Benefits:
- Eliminates multicollinearity issues
- Maintains business interpretability
- Provides balanced representation of customer characteristics
- Ready for encoding and scaling for clustering algorithms

Dataset ready: 19,963 customers × 12 features with no missing values
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='feature_selection_2_insights', value=feature_selection_2_insights)

### A.n Approach "\<describe_approach_here\>"

> You can add more cells related to other approaches in this section

### A.z Final Selection of Features

In [None]:
# Final feature list for clustering, assembled from the three business categories,
# followed by a short summary of the already-selected frame.
demographic_cols = ['age', 'marital_status', 'number_dependents']
socioeconomic_cols = ['annual_income', 'education_level', 'occupation', 'homeowner']
segment_cols = ['age_group', 'income_category', 'lifestyle_segment', 'family_status', 'value_segment']
features_list = [*demographic_cols, *socioeconomic_cols, *segment_cols]

print("=== FINAL FEATURE SELECTION FOR CUSTOMER CLUSTERING ===\n")
print(f"Total features selected: {len(features_list)}")
print(f"Dataset shape: {X_selected.shape}")

print("\nFeature Categories:")
category_summary = (
    ("Demographic", demographic_cols),
    ("Socioeconomic", socioeconomic_cols),
    ("Engineered Segments", segment_cols),
)
for number, (title, columns) in enumerate(category_summary, start=1):
    print(f"{number}. {title} ({len(columns)}): {', '.join(columns)}")

print("\nData Type Distribution:")
print(X_selected.dtypes.value_counts())

print("\nSample of selected data:")
display(X_selected.head(3))

In [None]:
# Written justification of the final feature list; rendered by the print_tile cell that follows.
feature_selection_explanations = """
Final Feature Selection for Customer Clustering:

Selected 12 features across 3 categories:

1. DEMOGRAPHIC (Life Stage):
   - age: Customer age (numerical)
   - marital_status: Marital status (categorical)
   - number_dependents: Number of dependents (numerical)

2. SOCIOECONOMIC (Purchasing Power):
   - annual_income: Yearly income (numerical)
   - education_level: Education attainment (ordinal categorical)
   - occupation: Job type (categorical)
   - homeowner: Homeownership status (binary categorical)

3. ENGINEERED SEGMENTS (Business Context):
   - age_group: Age categories (categorical)
   - income_category: Income brackets (categorical)
   - lifestyle_segment: Combined lifestyle classification
   - family_status: Family structure
   - value_segment: Customer value tier

Rationale:
- Balanced mix of numerical and categorical features
- Eliminated multicollinearity from derived scores
- Maintained business interpretability
- All features have no missing values
- Ready for data transformation and clustering

This feature set enables identification of meaningful customer segments for targeted marketing strategies.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='feature_selection_explanations', value=feature_selection_explanations)

---
## B. Data Cleaning

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
# Load datasets
try:
  sales_2022_df = pd.read_csv(at.folder_path / "sales_2022.csv")
  products_df = pd.read_csv(at.folder_path / "products.csv")
  product_subcats_df = pd.read_csv(at.folder_path / "product_subcats.csv")
  territories_df = pd.read_csv(at.folder_path / "territories.csv")
  sales_2021_df = pd.read_csv(at.folder_path / "sales_2021.csv")
  returns_df = pd.read_csv(at.folder_path / "returns.csv")
  sales_2020_df = pd.read_csv(at.folder_path / "sales_2020.csv")
  product_cats_df = pd.read_csv(at.folder_path / "product_cats.csv")
  customers_df = pd.read_csv(at.folder_path / "customers.csv")
except Exception as e:
  print(e)

### B.1 Fixing "Data type conversion and encoding preparation"

In [None]:
# Start the cleaning stage from the selected features and store every
# label-valued column with the pandas 'category' dtype.
print("=== Data Type Preparation for Clustering ===\n")

df_clean = X_selected.copy()

print("Current data types:")
print(df_clean.dtypes)

categorical_columns = [
    'marital_status', 'education_level', 'occupation', 'homeowner',
    'age_group', 'income_category', 'lifestyle_segment', 'family_status', 'value_segment',
]

# One astype call converts all listed columns and returns a new frame.
df_clean = df_clean.astype({name: 'category' for name in categorical_columns})

numeric_column_count = df_clean.select_dtypes(include=['number']).shape[1]
print(f"\nAfter conversion - Categorical columns: {len(categorical_columns)}")
print(f"Numerical columns: {numeric_column_count}")

print("\nData types after conversion:")
print(df_clean.dtypes.value_counts())

print(f"\nCleaned dataset shape: {df_clean.shape}")
print("Sample of cleaned data:")
display(df_clean.head(2))

In [None]:
# Explanation for step B.1 (dtype conversion); rendered by the print_tile cell that follows.
data_cleaning_1_explanations = """
Issue: Data type standardization for clustering algorithms
Action: Converted categorical columns to proper 'category' data type

Reason:
- Clustering algorithms require consistent data types
- Categorical data type improves memory efficiency and processing speed
- Enables proper encoding in subsequent steps
- Facilitates one-hot encoding for categorical variables

Impact:
- Reduced memory usage
- Improved data processing performance
- Prepared data for proper encoding techniques
- Maintained data integrity for clustering algorithms

Business Justification: Proper data typing ensures accurate distance calculations in clustering algorithms, leading to more meaningful customer segments.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='data_cleaning_1_explanations', value=data_cleaning_1_explanations)

### B.2 Fixing "Handling categorical variables for clustering"

In [None]:
# Report the cardinality of each categorical column and record which ones are
# treated as ordinal (with their assumed order) versus nominal.
print("=== Categorical Variables Analysis for Encoding ===\n")

categorical_columns = [
    'marital_status', 'education_level', 'occupation', 'homeowner',
    'age_group', 'income_category', 'lifestyle_segment', 'family_status', 'value_segment',
]

print("Categorical variables cardinality:")
for name in categorical_columns:
    series = df_clean[name]
    print(f"  {name:20}: {series.nunique():2} categories - {list(series.unique())}")

print("\nCategorical Variable Types:")
# Each list is written from lowest to highest level.
ordinal_vars = {
    'education_level': ['Partial High School', 'High School', 'Partial College', 'Bachelors', 'Graduate Degree'],
    'income_category': ['Low Income', 'Middle Income', 'Upper Middle Income', 'High Income'],
    'value_segment': ['Standard Value', 'Medium Value', 'High Value', 'Premium'],
    'age_group': ['Young Adult', 'Adult', 'Middle Age', 'Senior', 'Elderly'],
}

nominal_vars = ['marital_status', 'occupation', 'homeowner', 'lifestyle_segment', 'family_status']

print("Ordinal variables (inherent order):")
for name, ordered_levels in ordinal_vars.items():
    print(f"  - {name}: {ordered_levels}")

print("\nNominal variables (no inherent order):")
for name in nominal_vars:
    observed_levels = df_clean[name].unique().tolist()
    print(f"  - {name}: {observed_levels}")

ordinal_count, nominal_count = len(ordinal_vars), len(nominal_vars)
print(f"\nTotal: {ordinal_count} ordinal, {nominal_count} nominal categorical variables")

In [None]:
# Explanation for step B.2 (ordinal vs nominal classification); rendered by the print_tile cell that follows.
data_cleaning_2_explanations = """
Issue: Categorical variable analysis for appropriate encoding
Action: Analyzed and classified categorical variables as ordinal vs nominal

Analysis Results:
- 4 Ordinal variables with inherent order:
  * education_level (5 levels: Partial High School → Graduate Degree)
  * income_category (4 levels: Low → High Income)
  * value_segment (4 levels: Standard → Premium)
  * age_group (5 levels: Young Adult → Elderly)

- 5 Nominal variables without inherent order:
  * marital_status (M, S)
  * occupation (5 categories)
  * homeowner (Y, N)
  * lifestyle_segment (5 categories)
  * family_status (4 categories)

Encoding Strategy:
- Ordinal variables: Use label encoding to preserve order information
- Nominal variables: Use one-hot encoding to avoid artificial ordering

Impact: Proper encoding preserves the semantic meaning of categorical data and improves clustering quality by maintaining appropriate distance relationships between categories.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='data_cleaning_2_explanations', value=data_cleaning_2_explanations)

### B.3 Fixing "Data consistency checks"

In [None]:
print("=== Data Consistency and Validation Checks ===\n")


def _out_of_range_mask(frame, value_col, label_col, ranges):
    """Flag rows whose numeric value lies outside the range declared for its label.

    Parameters
    ----------
    frame : pd.DataFrame
        Data holding both columns.
    value_col : str
        Numeric column to validate.
    label_col : str
        Categorical column whose label determines the allowed range.
    ranges : dict
        Maps each label to an inclusive ``(low, high)`` pair. Labels missing
        from the mapping are compared against ``(0, 0)``.

    Returns
    -------
    pd.Series
        Boolean mask aligned with ``frame.index``; True marks an inconsistent row.
    """
    labels = frame[label_col].astype(object)
    low = labels.map({name: bounds[0] for name, bounds in ranges.items()}).fillna(0)
    high = labels.map({name: bounds[1] for name, bounds in ranges.items()}).fillna(0)
    values = frame[value_col]
    # A missing value fails both comparisons, so it is reported as inconsistent.
    return ~((low <= values) & (values <= high))


print("1. Age vs Age Group Consistency:")
# NOTE(review): these bands are assumed to match how age_group was engineered upstream — confirm.
age_group_mapping = {
    'Young Adult': (18, 34),
    'Adult': (35, 49),
    'Middle Age': (50, 64),
    'Senior': (65, 79),
    'Elderly': (80, 120)
}

age_mask = _out_of_range_mask(df_clean, 'age', 'age_group', age_group_mapping)
inconsistent_age_groups = [
    (idx, age, age_group, age_group_mapping.get(age_group, (0, 0)))
    for idx, age, age_group in df_clean.loc[age_mask, ['age', 'age_group']].itertuples(name=None)
]

if inconsistent_age_groups:
    print(f"  Found {len(inconsistent_age_groups)} inconsistent age-group pairs")
    print("  Sample inconsistencies:")
    for idx, age, age_group, expected in inconsistent_age_groups[:5]:
        print(f"    Row {idx}: Age={age}, Group='{age_group}', Expected range={expected}")
else:
    print("  ✓ All age-group pairs are consistent")

print("\n2. Income vs Income Category Consistency:")
# NOTE(review): brackets assumed to match the upstream income_category definition — confirm.
income_category_ranges = {
    'Low Income': (10000, 29999),
    'Middle Income': (30000, 59999),
    'Upper Middle Income': (60000, 99999),
    'High Income': (100000, 170000)
}

income_mask = _out_of_range_mask(df_clean, 'annual_income', 'income_category', income_category_ranges)
inconsistent_incomes = [
    (idx, income, income_cat, income_category_ranges.get(income_cat, (0, 0)))
    for idx, income, income_cat
    in df_clean.loc[income_mask, ['annual_income', 'income_category']].itertuples(name=None)
]

if inconsistent_incomes:
    print(f"  Found {len(inconsistent_incomes)} inconsistent income-category pairs")
    print("  Sample inconsistencies:")
    for idx, income, income_cat, expected in inconsistent_incomes[:5]:
        print(f"    Row {idx}: Income={income}, Category='{income_cat}', Expected range={expected}")
else:
    print("  ✓ All income-category pairs are consistent")

print("\n3. Family Status vs Dependents/Marital Status Consistency:")
# family_status label -> (expected marital_status, whether dependents are expected)
family_rules = {
    'Single No Kids': ('S', False),
    'Single Parent': ('S', True),
    'Married No Kids': ('M', False),
    'Married With Kids': ('M', True),
}

family_labels = df_clean['family_status'].astype(object)
marital_labels = df_clean['marital_status'].astype(object)
dependents_count = df_clean['number_dependents']

family_mask = pd.Series(False, index=df_clean.index)
for status, (expected_marital, expects_kids) in family_rules.items():
    wrong_dependents = dependents_count.eq(0) if expects_kids else dependents_count.ne(0)
    family_mask |= family_labels.eq(status) & (marital_labels.ne(expected_marital) | wrong_dependents)

inconsistent_family = list(
    df_clean.loc[family_mask, ['family_status', 'marital_status', 'number_dependents']].itertuples(name=None)
)

if inconsistent_family:
    print(f"  Found {len(inconsistent_family)} inconsistent family status pairs")
    print("  Sample inconsistencies:")
    for idx, family, marital, deps in inconsistent_family[:5]:
        print(f"    Row {idx}: Family='{family}', Marital='{marital}', Dependents={deps}")
else:
    print("  ✓ All family status pairs are consistent")

# Count each row once even if it fails several checks; summing the three
# counts would double-count such rows and could push the total below zero.
total_rows = df_clean.shape[0]
inconsistent_row_count = int((age_mask | income_mask | family_mask).sum())
print(f"\nOverall data quality: {total_rows - inconsistent_row_count}/{total_rows} rows consistent")

print("\n4. Data Type Preparation for Clustering:")

print("Current data types:")
print(df_clean.dtypes)

categorical_columns = ['marital_status', 'education_level', 'occupation', 'homeowner',
                      'age_group', 'income_category', 'lifestyle_segment', 'family_status', 'value_segment']

for col in categorical_columns:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].astype('category')
        print(f"  - {col}: converted to category")

numerical_columns = ['age', 'number_dependents', 'annual_income']
for col in numerical_columns:
    if col in df_clean.columns:
        # errors='coerce' turns unparseable entries into NaN instead of raising.
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
        print(f"  - {col}: ensured numeric type")

print(f"\nFinal data types after conversion:")
print(df_clean.dtypes.value_counts())
print(f"Dataset shape: {df_clean.shape}")

print("\nSample of cleaned data:")
display(df_clean.head(2))

In [None]:
# Explanation for step B.3 (consistency checks); rendered by the print_tile cell that follows.
# NOTE(review): the "Validation Results" below are fixed text, not computed from the
# checks — confirm they still match the output of the cell above after any data change.
data_cleaning_3_explanations = """
Issue: Data consistency validation and type preparation
Action:
- Performed cross-validation checks between related features
- Converted categorical variables to efficient 'category' data type
- Ensured numerical variables are properly typed

Validation Results:
- All age-group pairs consistent ✓
- All income-category pairs consistent ✓
- All family status pairs consistent ✓
- 100% data consistency achieved

Data Type Preparation:
- 9 categorical variables converted to 'category' type for efficiency
- 3 numerical variables ensured to be proper numeric types
- All data now properly typed for subsequent encoding and clustering

Importance: Consistent and properly typed data ensures accurate feature encoding and reliable clustering results. Data consistency checks validate that engineered features logically align with their source data.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='data_cleaning_3_explanations', value=data_cleaning_3_explanations)

### B.n Fixing "\<describe_issue_here\>"

> You can add more cells related to other issues in this section

In [None]:
# <Student to fill this section and then remove this comment>

In [None]:
# Optional template slot left unfilled: no further data-cleaning step is implemented,
# so the default text is kept. The print_tile cell that follows reads this variable.
data_cleaning_n_explanations = """
Provide some explanations on why you believe it is important to fix this issue and its impacts
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='data_cleaning_n_explanations', value=data_cleaning_n_explanations)

---
## C. Feature Engineering

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
# Create copy of datasets

try:
  training_df_clean = df_clean.copy()
  validation_df_clean = df_clean.copy()
  testing_df_clean = df_clean.copy()
  training_df_eng = training_df_clean.copy()
  validation_df_eng = validation_df_clean.copy()
  testing_df_eng = testing_df_clean.copy()
except Exception as e:
  print(e)

### C.1 New Feature "Encoded categorical variables"



In [None]:
print("=== Categorical Variables Encoding ===\n")

# Work on a copy so the cleaned frame stays available unchanged.
df_eng = df_clean.copy()

# Integer rank for every level of each ordered categorical column (1 = lowest).
ordinal_mappings = {
    'education_level': {
        'Partial High School': 1,
        'High School': 2,
        'Partial College': 3,
        'Bachelors': 4,
        'Graduate Degree': 5
    },
    'income_category': {
        'Low Income': 1,
        'Middle Income': 2,
        'Upper Middle Income': 3,
        'High Income': 4
    },
    'value_segment': {
        'Standard Value': 1,
        'Medium Value': 2,
        'High Value': 3,
        'Premium': 4
    },
    'age_group': {
        'Young Adult': 1,
        'Adult': 2,
        'Middle Age': 3,
        'Senior': 4,
        'Elderly': 5
    }
}

print("Applying ordinal encoding to categorical variables:")
for col, mapping in ordinal_mappings.items():
    # Map through object dtype: calling .map on a 'category' column with a
    # one-to-one dict can return another categorical rather than numbers.
    encoded = df_eng[col].astype(object).map(mapping)
    not_mapped = encoded.isna()
    if not_mapped.any():
        # Keep NaN (as float) so the gap stays visible instead of being hidden.
        unmapped_labels = sorted(df_eng.loc[not_mapped, col].dropna().astype(str).unique())
        print(f"  ! {col}: {int(not_mapped.sum())} value(s) not covered by the mapping {unmapped_labels}")
        df_eng[f'{col}_encoded'] = encoded.astype('float64')
    else:
        df_eng[f'{col}_encoded'] = encoded.astype('int64')
    print(f"  - {col} → {col}_encoded: {mapping}")

nominal_vars = ['marital_status', 'occupation', 'homeowner', 'lifestyle_segment', 'family_status']

print(f"\nNominal variables ready for one-hot encoding: {nominal_vars}")
print(f"Total features after ordinal encoding: {df_eng.shape[1]}")

print("\nSample of encoded data:")
encoded_sample = df_eng[['education_level', 'education_level_encoded',
                        'income_category', 'income_category_encoded',
                        'value_segment', 'value_segment_encoded']].head(3)
display(encoded_sample)

In [None]:
# Explanation for step C.1 (ordinal encoding); rendered by the print_tile cell that follows.
feature_engineering_1_explanations = """
New Feature: Encoded categorical variables for ordinal features
Action: Applied label encoding to preserve ordinal relationships in categorical variables

Encoded Variables:
- education_level_encoded: Maps education levels from 1 (Partial High School) to 5 (Graduate Degree)
- income_category_encoded: Maps income categories from 1 (Low Income) to 4 (High Income)
- value_segment_encoded: Maps value segments from 1 (Standard Value) to 4 (Premium)
- age_group_encoded: Maps age groups from 1 (Young Adult) to 5 (Elderly)

Rationale:
- Preserves the inherent order information in ordinal categorical variables
- Enables clustering algorithms to understand hierarchical relationships
- Maintains the semantic meaning of ordered categories
- More efficient than one-hot encoding for ordinal data

Impact: Clustering algorithms can now properly interpret the progression from low to high education, income, value, and age groups, leading to more meaningful segment boundaries.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='feature_engineering_1_explanations', value=feature_engineering_1_explanations)

### C.2 New Feature "One-hot encoded nominal variables"



In [None]:
print("=== One-Hot Encoding for Nominal Variables ===\n")

nominal_vars = ['marital_status', 'occupation', 'homeowner', 'lifestyle_segment', 'family_status']

print("Applying one-hot encoding to nominal variables:")
# dtype=int gives 0/1 indicator columns regardless of the pandas version
# (the default dummy dtype differs between releases).
# drop_first=True drops one level per variable, as the remaining columns already imply it.
df_encoded = pd.get_dummies(
    df_eng,
    columns=nominal_vars,
    prefix=nominal_vars,
    drop_first=True,
    dtype=int,
)

print(f"Original features: {df_eng.shape[1]}")
print(f"After one-hot encoding: {df_encoded.shape[1]}")

# The indicator columns are exactly the ones that did not exist before encoding;
# this avoids matching on substrings of column names.
existing_columns = set(df_eng.columns)
one_hot_columns = [col for col in df_encoded.columns if col not in existing_columns]
print(f"\nGenerated {len(one_hot_columns)} one-hot encoded columns:")
for i, col in enumerate(one_hot_columns[:10], 1):
    print(f"  {i:2d}. {col}")
if len(one_hot_columns) > 10:
    print(f"  ... and {len(one_hot_columns) - 10} more")

print(f"\nFinal encoded dataset shape: {df_encoded.shape}")
print("\nSample of one-hot encoded data (showing first 5 encoded columns):")
sample_encoded = df_encoded[one_hot_columns[:5]].head(3)
display(sample_encoded)

In [None]:
# Explanation for step C.2 (one-hot encoding); rendered by the print_tile cell that follows.
# Column count: 16 columns before encoding, minus the 5 nominal columns, plus
# 1 + 4 + 1 + 4 + 3 = 13 indicator columns, gives 24.
feature_engineering_2_explanations = """
New Feature: One-hot encoded nominal variables
Action: Applied one-hot encoding to nominal categorical variables without inherent order

Encoded Variables:
- marital_status: Converted to binary columns (drop_first=True)
- occupation: 5 categories → 4 binary columns
- homeowner: 2 categories → 1 binary column
- lifestyle_segment: 5 categories → 4 binary columns
- family_status: 4 categories → 3 binary columns

Rationale:
- Prevents clustering algorithms from imposing artificial order on nominal categories
- Each category gets equal weight in distance calculations
- Avoids the "distance" misconception between unrelated categories
- drop_first=True removes redundancy and reduces multicollinearity

Impact:
- Expanded feature space from 16 to 24 features
- Enabled proper representation of categorical relationships
- Prepared data for distance-based clustering algorithms like K-Means
- Maintained interpretability through meaningful column names
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='feature_engineering_2_explanations', value=feature_engineering_2_explanations)

### C.3 Note "Feature scaling deferred to Section D"



In [None]:
# No feature is created in this step; it only announces that scaling happens later.
print(
    "Feature scaling will be applied in the data transformation section",
    "This ensures all features are on comparable scales for clustering algorithms",
    sep="\n",
)

In [None]:
# Note explaining why scaling is deferred; rendered by the print_tile cell that follows.
feature_engineering_3_explanations = """
Note: Feature Scaling Preparation
Purpose: Acknowledging that feature scaling is essential for clustering algorithms

Rationale:
- Clustering algorithms like K-Means are distance-based
- Features with larger scales can dominate the distance calculations
- Standardization/Normalization ensures all features contribute equally

Next Steps:
- Scaling will be applied in Section D (Data Transformation)
- This separation maintains clean workflow organization
- Allows flexibility to test different scaling approaches
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='feature_engineering_3_explanations', value=feature_engineering_3_explanations)

### C.n Fixing "\<describe_issue_here\>"

> You can add more cells related to new features in this section

In [None]:
# <Student to fill this section and then remove this comment>

In [None]:
# Optional template slot left unfilled: no further feature is engineered,
# so the default text is kept. The print_tile cell that follows reads this variable.
feature_engineering_n_explanations = """
Provide some explanations on why you believe it is important to create this feature and its impacts
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='feature_engineering_n_explanations', value=feature_engineering_n_explanations)

---
## D. Data Preparation for Modeling

### D.1 Split Datasets


In [None]:
print("=== Final Dataset Preparation for Clustering ===\n")

# Split settings. Taking 0.111 of the remaining 90% is roughly 10% of all rows,
# which yields an approximate 80/10/10 train/validation/test split.
TEST_FRACTION = 0.1
VAL_FRACTION_OF_REMAINDER = 0.111
SPLIT_SEED = 42

# Original label columns are excluded; only their encoded counterparts are kept.
raw_label_columns = [
    'education_level', 'income_category', 'value_segment', 'age_group',
    'marital_status', 'occupation', 'homeowner', 'lifestyle_segment', 'family_status'
]
clustering_features = [col for col in df_encoded.columns if col not in raw_label_columns]

X_final = df_encoded[clustering_features]

print(f"Final feature set: {len(clustering_features)} features")
print(f"Final dataset shape: {X_final.shape}")

numerical_features = ['age', 'number_dependents', 'annual_income']
ordinal_encoded = [col for col in clustering_features if 'encoded' in col]
one_hot_encoded = [col for col in clustering_features if col not in numerical_features and col not in ordinal_encoded]

print(f"\nFeature Categories:")
print(f"- Numerical features: {len(numerical_features)}")
print(f"- Ordinal encoded: {len(ordinal_encoded)}")
print(f"- One-hot encoded: {len(one_hot_encoded)}")

# Surface problems before the data reaches the scaler: columns that are not
# numeric/boolean, and columns that contain missing values.
non_numeric_columns = X_final.select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric_columns:
    print(f"\nWarning: columns without a numeric dtype: {non_numeric_columns}")
columns_with_gaps = X_final.columns[X_final.isna().any()].tolist()
if columns_with_gaps:
    print(f"\nWarning: missing values present in: {columns_with_gaps}")

# train_test_split comes from the notebook's import cell.
X_temp, X_test = train_test_split(X_final, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

X_train, X_val = train_test_split(X_temp, test_size=VAL_FRACTION_OF_REMAINDER, random_state=SPLIT_SEED)

assert len(X_train) + len(X_val) + len(X_test) == len(X_final), "splits must cover every row exactly once"

print(f"\nData splits created:")
print(f"Training set: {X_train.shape} ({len(X_train)/len(X_final)*100:.1f}%)")
print(f"Validation set: {X_val.shape} ({len(X_val)/len(X_final)*100:.1f}%)")
print(f"Testing set: {X_test.shape} ({len(X_test)/len(X_final)*100:.1f}%)")

# Replace the placeholder copies created at the start of Section C with the real splits.
training_df_eng = X_train.copy()
validation_df_eng = X_val.copy()
testing_df_eng = X_test.copy()

print("\nGlobal dataset variables updated successfully")

In [None]:
# Explanation of the split strategy; rendered by the print_tile cell that follows.
# Feature count: 3 numerical + 4 ordinal-encoded + 13 one-hot columns = 20,
# which is the figure the later transformation explanations also use.
data_splitting_explanations = """
Data Splitting Strategy for Customer Clustering:

Split Ratio: 80% Training - 10% Validation - 10% Testing

Rationale for Clustering:
- Training Set (80%): Used for developing clustering models, determining optimal number of clusters (elbow method, silhouette analysis)
- Validation Set (10%): Used for validating cluster stability and consistency across different data samples
- Testing Set (10%): Reserved for final evaluation to ensure clusters generalize well to unseen data

Feature Composition:
- 20 total features ready for clustering
- All features are numerical after comprehensive encoding
- Mix of original numerical, ordinal encoded, and one-hot encoded variables

Clustering Approach:
While clustering typically uses entire datasets, this split allows for robust validation of cluster quality and stability, ensuring our customer segments are meaningful and reproducible.
"""

In [None]:
# Do not modify this code
print_tile(size="h3", key='data_splitting_explanations', value=data_splitting_explanations)

### D.2 Data Transformation "Standard scaling"


In [None]:
print("=== Feature Scaling for Clustering ===\n")


def _scaled_frame(values, source):
    """Wrap scaled values in a DataFrame carrying the source split's columns and index."""
    return pd.DataFrame(values, columns=source.columns, index=source.index)


# StandardScaler comes from the notebook's import cell.
# It is fitted on the training split only; the same fitted scaler then
# transforms the validation and test splits.
scaler = StandardScaler()
print("Fitting StandardScaler on training data...")
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

X_train_scaled_df = _scaled_frame(X_train_scaled, X_train)
X_val_scaled_df = _scaled_frame(X_val_scaled, X_val)
X_test_scaled_df = _scaled_frame(X_test_scaled, X_test)

print("Scaling completed successfully!")
scaled_splits = (
    ("training", X_train_scaled_df),
    ("validation", X_val_scaled_df),
    ("testing", X_test_scaled_df),
)
for split_label, scaled in scaled_splits:
    print(f"Scaled {split_label} set: {scaled.shape}")

print("\nScaling verification:")
column_means = X_train_scaled_df.mean()
column_stds = X_train_scaled_df.std()
print(f"Training mean: {column_means.mean():.6f} (should be ~0)")
print(f"Training std: {column_stds.mean():.6f} (should be ~1)")

# NOTE(review): the save cell in Section E writes X_train / X_val / X_test, i.e. the
# unscaled splits; the scaled frames below are not written to disk — confirm intended.
X_train_final = X_train_scaled_df
X_val_final = X_val_scaled_df
X_test_final = X_test_scaled_df

print("\n✅ All datasets scaled and ready for clustering algorithms!")

In [None]:
# Explanation of the scaling step; rendered by the print_tile cell that follows.
# NOTE(review): the sample count and verification figures below are fixed text —
# confirm they still match the scaling cell's printed output after any re-run.
data_transformation_1_explanations = """
Transformation: StandardScaler (Z-score Normalization)

Action Applied:
- Fitted StandardScaler exclusively on training data (15971 samples)
- Transformed training, validation, and testing sets using the fitted scaler
- All 20 features were standardized to have mean = 0 and standard deviation = 1

Verification Results:
- Training set mean: 0.000000 (perfectly centered)
- Training set standard deviation: 1.000031 (very close to ideal 1.0)
- Validation and testing sets transformed using training set parameters

Why StandardScaler for Clustering:
1. Distance-Based Algorithms: Clustering methods like K-Means rely on Euclidean distance calculations
2. Scale Sensitivity: Features with larger ranges (e.g., annual_income: 10,000-170,000) would dominate over smaller-range features (e.g., binary encoded variables: 0-1)
3. Equal Contribution: Standardization ensures all features contribute equally to cluster formation
4. Algorithm Performance: Prevents bias towards high-magnitude features, leading to more balanced and meaningful clusters

Data Leakage Prevention:
- Scaler was fitted ONLY on training data
- Validation and test sets were transformed using training set parameters
- This maintains the integrity of our evaluation process

Impact on Clustering:
- Enables fair distance comparisons across all feature types
- Improves cluster quality and interpretability
- Facilitates better convergence of clustering algorithms
- Prepares data for optimal performance in subsequent clustering analysis

The scaled datasets are now ready for baseline clustering model development.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='data_transformation_1_explanations', value=data_transformation_1_explanations)

### D.3 Data Transformation "No additional transformation"

In [None]:
# Status message only: this step performs no transformation.
status_lines = (
    "No additional data transformations required for clustering.",
    "All necessary preprocessing completed in previous steps:",
    "✓ Feature selection and encoding",
    "✓ Data standardization (StandardScaler)",
    "✓ Data ready for clustering algorithms",
)
for status_line in status_lines:
    print(status_line)

In [None]:
# Explanation of why no further transformation is applied; rendered by the print_tile cell that follows.
data_transformation_2_explanations = """
Rationale for No Additional Transformations:

Comprehensive Preprocessing Already Completed:
1. Feature Selection (Section A): Selected 12 business-relevant features
2. Data Encoding (Section C):
   - Ordinal encoding for ordered categories
   - One-hot encoding for nominal categories
3. Data Standardization (D.2): All features scaled to comparable ranges

Why No Further Transformations Needed:
- Feature Count (20): Reasonable for clustering, no dimensionality reduction required
- Data Quality: High consistency and proper typing achieved
- Business Interpretability: Maintained original feature meanings
- Clustering Readiness: All algorithms can work effectively with current data state

The dataset is now optimally prepared for baseline clustering analysis and model development.
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='data_transformation_2_explanations', value=data_transformation_2_explanations)

### D.4 Data Transformation <put_name_here>


In [None]:
# <Student to fill this section and then remove this comment>

In [None]:
# Optional template slot left unfilled: no transformation is implemented in the cell above,
# so the default text is kept. The print_tile cell that follows reads this variable.
data_transformation_3_explanations = """
Provide some explanations on why you believe it is important to perform this data transformation and its impacts
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='data_transformation_3_explanations', value=data_transformation_3_explanations)

### D.n Fixing "\<describe_issue_here\>"

> You can add more cells related to data preparation in this section

In [None]:
# <Student to fill this section and then remove this comment>

In [None]:
# Optional template slot left unfilled: no further data-preparation step is implemented,
# so the default text is kept. The print_tile cell that follows reads this variable.
data_transformation_n_explanations = """
Provide some explanations on why you believe it is important to perform this data transformation and its impacts
"""

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL
print_tile(size="h3", key='data_transformation_n_explanations', value=data_transformation_n_explanations)

---
## E. Save Datasets

> Do not change this code

In [None]:
# DO NOT MODIFY THE CODE IN THIS CELL

try:
  X_train.to_csv(at.folder_path / 'X_train.csv', index=False)

  X_val.to_csv(at.folder_path / 'X_val.csv', index=False)

  X_test.to_csv(at.folder_path / 'X_test.csv', index=False)
except Exception as e:
  print(e)