In [None]:
import pandas as pd
import numpy as np
# Read the raw dataset from a CSV file (path is relative to the working
# directory) and preview its first five rows.
raw_csv_path = "adult.csv"
df = pd.read_csv(raw_csv_path)
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [None]:
# Columns holding numeric measurements (standardized later in the notebook).
numerical_cols = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]

# Columns holding category labels (one-hot encoded later in the notebook).
categorical_cols = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]

# Name of the label column.
target = 'income'


In [None]:
# The dataset marks missing entries with a literal '?'. Convert that
# placeholder to NaN and drop every row containing at least one NaN.
# Written as one chained expression instead of two `inplace=True` calls so
# the cell is a single, readable transformation; the original row index
# labels of the surviving rows are kept (no reset_index).
df = df.replace('?', np.nan).dropna()


In [None]:
# Sanity check: number of missing values remaining in each column.
df.isna().sum()


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education.num,0
marital.status,0
occupation,0
relationship,0
race,0
sex,0


In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the string labels in the target column to integer codes.
# The fitted encoder stays available as `le` for later cells.
le = LabelEncoder()
encoded_target = le.fit_transform(df[target])
df[target] = encoded_target


In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each feature, so k categories yield k-1 indicator columns.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)


In [None]:
# Dimensions (rows, columns) of the one-hot encoded frame.
df_encoded.shape


(30162, 97)

In [None]:
# Summary statistics of the numerical columns, taken before the scaling step.
df_encoded[numerical_cols].describe()


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.437902,189793.8,10.121312,1092.007858,88.372489,40.931238
std,13.134665,105653.0,2.549995,7406.346497,404.29837,11.979984
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117627.2,9.0,0.0,0.0,40.0
50%,37.0,178425.0,10.0,0.0,0.0,40.0
75%,47.0,237628.5,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the numerical columns and write the scaled values back into
# the encoded frame; the indicator columns and the target are left as-is.
# NOTE(review): the scaler is fitted on the entire dataset. If a train/test
# split happens downstream, consider fitting on the training rows only so
# that test-set statistics do not leak into preprocessing.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[numerical_cols])
df_encoded[numerical_cols] = scaled_values


In [None]:
# Summary statistics of the numerical columns after the scaling step,
# for comparison with the pre-scaling summary.
df_encoded[numerical_cols].describe()


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,1.413453e-16,1.6019130000000002e-17,-2.99652e-16,-1.8846040000000002e-17,-3.015366e-17,-2.779791e-16
std,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017
min,-1.632189,-1.666094,-3.577051,-0.1474446,-0.218586,-3.333218
25%,-0.7946967,-0.6830644,-0.4397382,-0.1474446,-0.218586,-0.07773411
50%,-0.1094756,-0.1076072,-0.04757405,-0.1474446,-0.218586,-0.07773411
75%,0.6518811,0.4527602,1.128918,-0.1474446,-0.218586,0.3396356
max,3.925715,12.25647,2.305411,13.35458,10.55581,4.847229


In [None]:
### Impact of Scaling

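Before scaling, the numerical features spanned very different ranges (for
example, `fnlwgt` reaches about 1.48 million while `education.num` never
exceeds 16), which can bias distance-based and gradient-based machine
learning algorithms toward the features with the largest magnitudes.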

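After applying StandardScaler, every numerical feature has a mean of
approximately 0 and a standard deviation of approximately 1, so the features
contribute on a comparable scale during model training. (`describe()` reports
a standard deviation of 1.000017 rather than exactly 1 because it uses the
sample standard deviation, whereas StandardScaler divides by the population
standard deviation.)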


In [None]:
# Persist the preprocessed frame; index=False keeps the row index out of
# the written file.
output_csv_path = "adult_preprocessed.csv"
df_encoded.to_csv(output_csv_path, index=False)
