STEP 1 — Load & Preview Data

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler


In [18]:
# Remote location of the raw UCI "Adult" data file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column names in file order; they are passed to read_csv via `names=`.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]

# The separator is a comma followed by a space, so values carry no leading
# blank; a multi-character separator needs the Python parsing engine.
df = pd.read_csv(
    url,
    names=columns,
    sep=", ",
    engine="python",
)
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


STEP 2 — Basic Data Inspection

In [19]:
# Structural overview of the freshly loaded frame.
# The shape is printed explicitly: a notebook only echoes a bare expression
# when it is the last statement of the cell, so a bare `df.shape` placed
# before other statements is evaluated and then discarded.
print(f"rows, columns: {df.shape}")

# Column dtypes and non-null counts (written to stdout by pandas).
df.info()

# Summary statistics of the numeric columns (rendered as the cell output).
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


STEP 3 — Missing Value Handling

3.1 Check Missing Values

(In this dataset, missing values are encoded as the literal string "?" rather than NaN, so `isnull()` does not detect them until they are replaced.)

In [20]:
# Per-column count of cells equal to the "?" placeholder.
df.eq('?').sum()


Unnamed: 0,0
age,0
workclass,1836
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,1843
relationship,0
race,0
sex,0


3.2 Replace "?" with NaN

In [21]:
# Turn the "?" placeholder into real NaN so pandas' missing-value tools
# (isnull / fillna) can see it. The result is rebound to `df` instead of
# using inplace=True, which would silently mutate the frame object that
# earlier cells have already displayed.
df = df.replace('?', np.nan)



3.3 Fill workclass with Mode

In [22]:
# Impute missing workclass entries with the column's most frequent value.
workclass_mode = df['workclass'].mode()[0]
df['workclass'] = df['workclass'].fillna(workclass_mode)


3.4 Fill occupation with Mode

In [23]:
# Impute missing occupation entries with the column's most frequent value.
occupation_mode = df['occupation'].mode()[0]
df['occupation'] = df['occupation'].fillna(occupation_mode)


3.5 Fill native_country with Mode

In [24]:
# Impute missing native_country entries with the column's most frequent value.
native_country_mode = df['native_country'].mode()[0]
df['native_country'] = df['native_country'].fillna(native_country_mode)


3.6 Final Missing Value Check

In [25]:
# Per-column count of remaining NaN cells after imputation.
df.isna().sum()


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


STEP 4 — FEATURE ENGINEERING (3 FEATURES)

FE 1 — Capital Difference

In [26]:
# Net capital movement: capital_gain minus capital_loss
# (negative when the loss exceeds the gain).
df['capital_diff'] = df['capital_gain'].sub(df['capital_loss'])
df['capital_diff']


Unnamed: 0,capital_diff
0,2174
1,0
2,0
3,0
4,0
...,...
32556,0
32557,0
32558,0
32559,0


FE 2 — Working Intensity

In [27]:
# Ratio of weekly working hours to age.
df['work_intensity'] = df['hours_per_week'].div(df['age'])
df['work_intensity']


Unnamed: 0,work_intensity
0,1.025641
1,0.260000
2,1.052632
3,0.754717
4,1.428571
...,...
32556,1.407407
32557,1.000000
32558,0.689655
32559,0.909091


FE 3 — Education Category

In [28]:
# Bucket education_num into three labels:
#   <= 8  -> 'Low',  9..12 -> 'Medium',  anything else -> 'High'
# Conditions are evaluated in order, so the second one only applies to
# rows that failed the first.
edu_num = df['education_num']
df['education_level'] = np.select(
    [edu_num <= 8, edu_num <= 12],
    ['Low', 'Medium'],
    default='High',
)
df['education_level']


Unnamed: 0,education_level
0,High
1,High
2,Medium
3,Low
4,High
...,...
32556,Medium
32557,Medium
32558,Medium
32559,Medium


STEP 5 — ENCODING (Categorical → Numeric)

5.1 One-Hot Encoding

In [29]:
# Columns to expand into indicator (dummy) columns.
categorical_cols = [
    'workclass', 'education', 'marital_status',
    'occupation', 'relationship', 'race',
    'sex', 'native_country', 'education_level'
]

# drop_first=True removes one indicator per feature, so each feature with
# k levels contributes k-1 columns.
# dtype=int pins the indicators to numeric 0/1. Without it, recent pandas
# versions emit boolean columns, which would be written to CSV as
# True/False instead of numbers.
df_encoded = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
    dtype=int,
)


5.2 Encode Target Variable

In [30]:
# Binary target: 0 for '<=50K', 1 for '>50K'.
# The labels are read from the original `df` rather than from df_encoded
# itself, so re-running this cell does not map already-encoded 0/1 values
# to NaN.
income_codes = {'<=50K': 0, '>50K': 1}
income_encoded = df['income'].map(income_codes)

# Series.map yields NaN for any label missing from the dict; fail loudly
# instead of letting NaN targets slip into the saved dataset.
unmapped_labels = df.loc[income_encoded.isna(), 'income'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected income labels: {unmapped_labels.tolist()}")

df_encoded['income'] = income_encoded


In [31]:
# Shapes before and after one-hot encoding, as ((rows, cols), (rows, cols)).
tuple(frame.shape for frame in (df, df_encoded))


((32561, 18), (32561, 102))

STEP 6 — FEATURE SCALING

In [32]:
# Continuous columns to standardise (original numerics plus the two
# engineered numeric features).
scale_cols = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'capital_diff',
    'work_intensity',
]

# Fit the scaler on every row of df_encoded and overwrite those columns
# with their standardised values.
# NOTE(review): the statistics are learned from the full dataset; if a
# train/test split happens later, confirm that this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[scale_cols])
df_encoded[scale_cols] = scaled_values


In [33]:
# Sanity check on the scaled columns: means should be ~0. The reported std
# is slightly above 1 because describe() uses the sample (n-1) estimator.
df_encoded.loc[:, scale_cols].describe()


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,capital_diff,work_intensity
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,-2.705915e-17,-1.001625e-16,1.471887e-16,1.309314e-17,1.0169e-16,-1.5493550000000002e-17,1.745752e-17,-3.58752e-16
std,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015
min,-1.582206,-1.681631,-3.529656,-0.1459205,-0.2166595,-3.19403,-0.7216138,-2.362655
25%,-0.7757679,-0.681691,-0.4200596,-0.1459205,-0.2166595,-0.03542945,-0.1336701,-0.6792775
50%,-0.1159546,-0.1082193,-0.03136003,-0.1459205,-0.2166595,-0.03542945,-0.1336701,-0.1097137
75%,0.6904838,0.4478765,0.7460392,-0.1459205,-0.2166595,0.3695194,-0.1336701,0.6022411
max,3.769612,12.26856,2.300838,13.39458,10.59351,4.742967,13.36352,8.295849


STEP 7 — Save Final Dataset

In [34]:
# Persist the fully preprocessed frame; index=False omits the row index column.
output_path = "adult_income_preprocessed.csv"
df_encoded.to_csv(output_path, index=False)
