In [1]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# 📂 Load Data
# Read the raw hair-fall CSV into `df`; the path is resolved relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer='../data/Predict_Hair_Fall.csv')

In [4]:
# 🧹 Data Cleaning
# Normalise the header FIRST: every lookup below uses the stripped column
# names, so a padded header (e.g. 'Age ') would otherwise raise KeyError
# before the strip ever ran.
df.columns = df.columns.str.strip()

# Remove fully identical rows (all columns, including any id column still present).
df.drop_duplicates(inplace=True)

# Numeric gap-fill: missing ages take the median age.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Columns treated as categorical; missing entries take the most frequent value.
cat_cols = ['Genetics', 'Hormonal Changes', 'Medical Conditions', 'Medications & Treatments',
            'Nutritional Deficiencies', 'Stress', 'Poor Hair Care Habits', 
            'Environmental Factors', 'Smoking', 'Weight Loss']

for col in cat_cols:
    # mode() returns a Series sorted by value; [0] picks the first mode on ties.
    df[col] = df[col].fillna(df[col].mode()[0])

In [5]:
# 🗑️ Drop ID column
# Remove the 'Id' column when it exists; errors='ignore' makes this a no-op
# when the column is absent.
df = df.drop(columns=['Id'], errors='ignore')

In [6]:
# 🧠 Feature Engineering
# Bucket Age into four labelled bands. pd.cut uses right-closed intervals by
# default, so the bands are (0, 20], (20, 35], (35, 50], (50, 100]; ages
# outside that range become NaN.
age_bin_edges = [0, 20, 35, 50, 100]
age_bin_labels = ['Teen', 'Young Adult', 'Middle Aged', 'Senior']
df['Age_Group'] = pd.cut(df['Age'], bins=age_bin_edges, labels=age_bin_labels)

In [7]:
# ➕ Interaction Feature
# Combine the Stress value and the Medical Conditions value into a single
# "<stress>_<condition>" string per row.
stress_labels = df['Stress'].astype(str)
condition_labels = df['Medical Conditions'].astype(str)
df['Stress_Medical'] = stress_labels + "_" + condition_labels

In [8]:
# 🔄 Encoding
# One-hot encode every non-numeric feature column.
# 'category' must be selected alongside 'object': Age_Group comes from pd.cut
# and has categorical dtype, so include='object' alone would skip it and leave
# its raw text labels in the encoded frame.
cat_features = df.select_dtypes(include=['object', 'category']).columns
# drop_first=True drops one dummy level per feature to avoid redundant columns.
df_encoded = pd.get_dummies(df, columns=cat_features, drop_first=True)

In [9]:
# 🔢 Feature Scaling
# Standardise Age with a StandardScaler fitted on this same column; the
# fitted `scaler` stays available for later cells.
scaler = StandardScaler()
age_frame = df_encoded[['Age']]
df_encoded['Age'] = scaler.fit(age_frame).transform(age_frame)

In [10]:
# ✅ Save processed data
# Write the encoded frame to CSV without the index column, then confirm.
output_path = '../data/processed_hair_loss.csv'
df_encoded.to_csv(output_path, index=False)
print("Preprocessing completed and saved to data/processed_hair_loss.csv")

Preprocessing completed and saved to data/processed_hair_loss.csv
