# EDA and Feature Engineering on Adult Dataset

In [ ]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt


In [ ]:
# Load the raw CSV and turn the '?' placeholder into NaN in one step so the
# pandas missing-value tooling recognises those cells.
# NOTE(review): only cells exactly equal to '?' are converted; if the file
# stores padded variants such as ' ?', they are left untouched — confirm.
df = pd.read_csv('/mnt/data/adult_with_headers.csv').replace('?', np.nan)
df.head()

In [ ]:
# Handle missing values
# Impute column by column: text columns receive their most frequent value,
# every other column receives its median.
for col in df.columns:
    series = df[col]
    is_text = (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
    )
    if is_text:
        modes = series.mode()
        if modes.empty:
            # The column is entirely NaN, so there is no mode to impute with;
            # leave it as-is instead of failing on an empty lookup.
            continue
        fill_value = modes.iloc[0]
    else:
        fill_value = series.median()
    # Assign the filled column back rather than calling
    # fillna(inplace=True) on the df[col] selection: the in-place form acts
    # on an intermediate object and does not update df under pandas
    # Copy-on-Write (it already emits a FutureWarning on pandas 2.x).
    df[col] = series.fillna(fill_value)

In [ ]:
# Scaling
# Pick out the int64/float64 columns and overwrite them with the output of a
# freshly fitted StandardScaler. `num_cols` stays defined at module level
# because the outlier-detection step reads it again.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [ ]:
# Encoding
# Split the object-dtype columns by cardinality: columns with fewer than 5
# distinct values are one-hot encoded, all others are mapped to integer
# labels with a per-column LabelEncoder.
text_cols = df.select_dtypes(include='object').columns
one_hot_cols = [name for name in text_cols if df[name].nunique() < 5]
label_cols = [name for name in text_cols if name not in one_hot_cols]

for name in label_cols:
    df[name] = LabelEncoder().fit_transform(df[name])

# A single get_dummies call appends the indicator columns in the same order
# as encoding the columns one at a time would.
if one_hot_cols:
    df = pd.get_dummies(df, columns=one_hot_cols)

In [ ]:
# Feature Engineering
# Derive two extra columns from existing ones:
#   education_hours_ratio = education_num / (hours_per_week + 1)
#   capital_total         = capital_gain + capital_loss
# NOTE(review): the scaling step earlier in this file standardises the
# int64/float64 columns in place. If these inputs are among them, they are
# z-scores here and `hours_per_week + 1` can be zero or close to it, so the
# +1 no longer protects the division — confirm the intended cell order.
shifted_hours = df['hours_per_week'].add(1)
df['education_hours_ratio'] = df['education_num'].div(shifted_hours)
df['capital_total'] = df['capital_gain'].add(df['capital_loss'])

In [ ]:
# Outlier Detection
# Fit an IsolationForest on the columns listed in `num_cols` (5% expected
# contamination, fixed seed) and keep only the rows it labels 1; the
# remaining rows are dropped from df.
iso = IsolationForest(contamination=0.05, random_state=42)
outliers = iso.fit_predict(df[num_cols])
inlier_mask = outliers == 1
df = df.loc[inlier_mask]
df.shape