In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder

In [2]:
# Load the adult dataset from a CSV located in the notebook's working
# directory, then preview the first five rows.
df=pd.read_csv("adult_with_headers (1).csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(32561, 15)

In [4]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
### Finding null values
# Counts NaN entries per column. Placeholder strings are ordinary values to
# pandas and are therefore not counted here.

df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [7]:
# List the distinct workclass labels to look for placeholder values.
df["workclass"].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [8]:
### Convert the " ?" placeholder into NaN across the whole frame so that
### pandas treats those entries as missing values.

df.replace(to_replace=" ?", value=np.nan, inplace=True)

In [9]:
# Re-check workclass after the replacement: the " ?" entries should now be NaN.
df["workclass"].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', nan, ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [10]:
# Distinct occupation labels after the placeholder replacement.
df["occupation"].unique()

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', nan, ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv'], dtype=object)

In [11]:
# Distinct native_country labels after the placeholder replacement.
df["native_country"].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', nan, ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [12]:
### Impute missing values in these categorical columns with each column's mode
### (its most frequent value).

columns_with_missing = ['workclass', 'occupation', 'native_country']

# Work out every fill value first, then apply them column by column.
mode_by_column = {name: df[name].mode()[0] for name in columns_with_missing}
for name, most_frequent in mode_by_column.items():
    df[name] = df[name].fillna(most_frequent)

In [13]:
# Scaling numerical features: the columns to scale and one scaler object per
# scaling strategy (standardization and min-max).
numerical_features = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]
scaler_std, scaler_minmax = StandardScaler(), MinMaxScaler()

In [14]:
def _scaled_copy(frame, scaler, columns):
    """Return a copy of `frame` whose `columns` are replaced by `scaler`'s output.

    The scaler is fitted on `frame[columns]`; `frame` itself is left untouched.
    """
    result = frame.copy()
    result[columns] = scaler.fit_transform(frame[columns])
    return result


# One scaled copy per strategy; the original df stays unscaled.
df_standard_scaled = _scaled_copy(df, scaler_std, numerical_features)
df_minmax_scaled = _scaled_copy(df, scaler_minmax, numerical_features)

In [15]:
# Show the standardized numeric columns (pandas truncates the middle rows).
df_standard_scaled[numerical_features]

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...
32556,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [16]:
# Show the min-max scaled numeric columns (pandas truncates the middle rows).
df_minmax_scaled[numerical_features]

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.301370,0.044302,0.800000,0.021740,0.0,0.397959
1,0.452055,0.048238,0.800000,0.000000,0.0,0.122449
2,0.287671,0.138113,0.533333,0.000000,0.0,0.397959
3,0.493151,0.151068,0.400000,0.000000,0.0,0.397959
4,0.150685,0.221488,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...
32556,0.136986,0.166404,0.733333,0.000000,0.0,0.377551
32557,0.315068,0.096500,0.533333,0.000000,0.0,0.397959
32558,0.561644,0.094827,0.533333,0.000000,0.0,0.397959
32559,0.068493,0.128499,0.533333,0.000000,0.0,0.193878


### Encoding Techniques

In [17]:
# One-hot encode the variables with < 5 categories (e.g., sex, income).
# drop_first=True drops the first level of every encoded variable, so each of
# these is reduced to a single indicator column.

low_cardinality_cols = ['sex', "income"]
df = pd.get_dummies(data=df, columns=low_cardinality_cols, drop_first=True)

In [18]:
# Label encoding for the remaining, higher-cardinality categorical variables.
#
# A separate LabelEncoder is fitted and kept for every column. Re-fitting a
# single shared encoder overwrites its learned classes on each column, which
# makes it impossible to map the integer codes of all but the last column
# back to their labels with inverse_transform().
#
# NOTE: run this cell once per freshly prepared df. Re-running it on columns
# that are already integer codes would treat the codes as strings and assign
# them new labels in string sort order.

categorical_vars = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']

label_encoders = {}  # column name -> LabelEncoder fitted on that column

for col in categorical_vars:
    label_encoder = LabelEncoder()
    # astype(str) gives the encoder a uniform string input for every column.
    df[col] = label_encoder.fit_transform(df[col].astype(str))
    label_encoders[col] = label_encoder

In [19]:
# Display the frame after encoding (pandas truncates the middle rows).
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K
0,39,6,77516,9,13,4,0,1,4,2174,0,40,38,True,False
1,50,5,83311,9,13,2,3,0,4,0,0,13,38,True,False
2,38,3,215646,11,9,0,5,1,4,0,0,40,38,True,False
3,53,3,234721,1,7,2,5,0,2,0,0,40,38,True,False
4,28,3,338409,9,13,2,9,5,2,0,0,40,4,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,3,257302,7,12,2,12,5,4,0,0,38,38,False,False
32557,40,3,154374,11,9,2,6,0,4,0,0,40,38,True,True
32558,58,3,151910,11,9,6,0,4,4,0,0,40,38,False,False
32559,22,3,201490,11,9,4,0,3,4,0,0,20,38,True,False


One-Hot Encoding: Prevents the model from assuming an incorrect order between categories but can lead to high dimensionality if the category count is high.


Label Encoding: Memory-efficient and keeps the feature space small, but it can introduce a false mathematical hierarchy (an implied ordering between categories) that may confuse certain models.

### Feature Engineering

In [20]:
### First engineered feature: capital gain net of capital loss.

df["net_capital"] = df['capital_gain'].sub(df['capital_loss'])

In [21]:
### Second engineered feature: education_num multiplied by hours_per_week.

df['work_intensity'] = df['education_num'].mul(df['hours_per_week'])

In [22]:
# Display the frame with the two engineered columns appended.
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K,net_capital,work_intensity
0,39,6,77516,9,13,4,0,1,4,2174,0,40,38,True,False,2174,520
1,50,5,83311,9,13,2,3,0,4,0,0,13,38,True,False,0,169
2,38,3,215646,11,9,0,5,1,4,0,0,40,38,True,False,0,360
3,53,3,234721,1,7,2,5,0,2,0,0,40,38,True,False,0,280
4,28,3,338409,9,13,2,9,5,2,0,0,40,4,False,False,0,520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,3,257302,7,12,2,12,5,4,0,0,38,38,False,False,0,456
32557,40,3,154374,11,9,2,6,0,4,0,0,40,38,True,True,0,360
32558,58,3,151910,11,9,6,0,4,4,0,0,40,38,False,False,0,360
32559,22,3,201490,11,9,4,0,3,4,0,0,20,38,True,False,0,180


In [27]:
# Skewed-feature transformation (log transformation for capital_gain).

# Measure the skewness of every numerical column and list the largest first.
per_column_skew = df[numerical_features].skew()
skewness = per_column_skew.sort_values(ascending=False)
print("Skewness of numerical features:", skewness, sep="\n")

Skewness of numerical features:
capital_gain      11.953848
capital_loss       4.594629
fnlwgt             1.446980
age                0.558743
hours_per_week     0.227643
education_num     -0.311676
dtype: float64


In [None]:
# log1p computes log(1 + x), so the zero entries in capital_gain map to 0
# instead of -inf.
capital_gain = df['capital_gain']
df['capital_gain_log'] = np.log1p(capital_gain)

In [29]:
# Compare capital_gain skewness before and after the log transformation.
skew_after = df['capital_gain_log'].skew()

print(
    f"\nSkewness of capital_gain before: {skewness['capital_gain']}",
    f"Skewness of capital_gain after log transformation: {skew_after}",
    sep="\n",
)


Skewness of capital_gain before: 11.953847687699799
Skewness of capital_gain after log transformation: 3.096143524467517
