# Adult Dataset Preprocessing

## Identify Categorical and Numerical Features

In [1]:
import pandas as pd
df = pd.read_csv('adult.csv')
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(exclude='object').columns
categorical_cols, numerical_cols

(Index(['workclass', 'education', 'marital.status', 'occupation',
        'relationship', 'race', 'sex', 'native.country', 'income'],
       dtype='object'),
 Index(['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss',
        'hours.per.week'],
       dtype='object'))

## Label Encoding (Ordinal Feature)

In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['education_encoded'] = le.fit_transform(df['education'])

df[['education', 'education_encoded']].head()

Unnamed: 0,education,education_encoded
0,HS-grad,11
1,HS-grad,11
2,Some-college,15
3,7th-8th,5
4,Some-college,15


## One-Hot Encoding (Nominal Features)

In [None]:
df_onehot = pd.get_dummies(
    df,
    columns=['workclass', 'marital-status', 'occupation', 'relationship',
             'race', 'sex', 'native-country'],
    drop_first=True
)

df_onehot.head()

## Feature Scaling using StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_cols = ['age', 'fnlwgt', 'education-num',
                  'capital-gain', 'capital-loss', 'hours-per-week']

df_onehot[numerical_cols] = scaler.fit_transform(df_onehot[numerical_cols])

df_onehot[numerical_cols].describe()

## Model Readiness Comparison

In [None]:
# Before Scaling
df[numerical_cols].describe()

# After Scaling
df_onehot[numerical_cols].describe()