In [1]:

import pandas as pd
import numpy as np
import kagglehub
from kagglehub import KaggleDatasetAdapter


In [4]:

# Load Adult Income dataset from Kaggle
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "wenruliu/adult-income-dataset",
    "adult.csv"
)

df.head()


Downloading from https://www.kaggle.com/api/v1/datasets/download/wenruliu/adult-income-dataset?dataset_version_number=2&file_name=adult.csv...


100%|██████████| 652k/652k [00:00<00:00, 83.2MB/s]

Extracting zip of adult.csv...





Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


## Dataset Overview

In [5]:

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [6]:

df.describe(include='all')


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
count,48842.0,48842,48842.0,48842,48842.0,48842,48842,48842,48842,48842,48842.0,48842.0,48842.0,48842,48842
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,33906,,15784,,22379,6172,19716,41762,32650,,,,43832,37155
mean,38.643585,,189664.1,,10.078089,,,,,,1079.067626,87.502314,40.422382,,
std,13.71051,,105604.0,,2.570973,,,,,,7452.019058,403.004552,12.391444,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117550.5,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178144.5,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237642.0,,12.0,,,,,,0.0,0.0,45.0,,


## Data Cleaning – Handling Missing Values

In [7]:

# Check missing values
df.isnull().sum()


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,0
relationship,0
race,0
gender,0


In [8]:

# Replace '?' with NaN (common in Adult Income dataset)
df.replace('?', np.nan, inplace=True)

# Fill numerical columns with mean
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Fill categorical columns with mode
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

df.isnull().sum()


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,0
relationship,0
race,0
gender,0


## Handling Categorical Data

In [9]:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

df.dtypes


Unnamed: 0,0
age,int64
workclass,int64
fnlwgt,int64
education,int64
educational-num,int64
marital-status,int64
occupation,int64
relationship,int64
race,int64
gender,int64


## Handling Outliers (IQR Method)

In [10]:

Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1

df = df[~((df[num_cols] < (Q1 - 1.5 * IQR)) | (df[num_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]


## Data Transformation – Min-Max Scaling

In [11]:

from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
df_minmax = df.copy()
df_minmax[df.columns] = minmax.fit_transform(df)

df_minmax.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.131148,0.428571,0.527441,0.066667,0.181818,0.666667,0.461538,0.6,0.5,1.0,0.0,0.0,0.368421,0.95,0.0
1,0.344262,0.428571,0.188277,0.733333,0.363636,0.333333,0.307692,0.0,1.0,1.0,0.0,0.0,0.894737,0.95,0.0
2,0.180328,0.142857,0.800155,0.466667,0.636364,0.333333,0.769231,0.0,1.0,1.0,0.0,0.0,0.368421,0.95,1.0
6,0.196721,0.428571,0.527996,0.733333,0.363636,0.666667,0.692308,0.8,0.5,1.0,0.0,0.0,0.368421,0.95,0.0
8,0.114754,0.428571,0.881156,1.0,0.454545,0.666667,0.538462,0.8,1.0,0.0,0.0,0.0,0.368421,0.95,0.0


## Data Transformation – Standardization

In [12]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_standard = df.copy()
df_standard[df.columns] = scaler.fit_transform(df)

df_standard.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,-1.080704,-0.029348,0.532459,-2.62739,-1.511255,0.954066,0.039293,0.986908,-1.896217,0.701747,0.0,0.0,-0.371281,0.242009,-0.516767
1,-0.003389,-0.029348,-1.041839,0.1328,-0.580344,-0.359669,-0.45504,-0.895254,0.405535,0.701747,0.0,0.0,2.154155,0.242009,-0.516767
2,-0.832093,-1.81449,1.798318,-0.971276,0.816022,-0.359669,1.027959,-0.895254,0.405535,0.701747,0.0,0.0,-0.371281,0.242009,1.935108
6,-0.749222,-0.029348,0.535034,0.1328,-0.580344,0.954066,0.780793,1.614296,-1.896217,0.701747,0.0,0.0,-0.371281,0.242009,-0.516767
8,-1.163574,-0.029348,2.174298,1.236876,-0.114889,0.954066,0.28646,1.614296,0.405535,-1.425016,0.0,0.0,-0.371281,0.242009,-0.516767
