In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:

df = pd.read_csv("adult_with_headers (1).csv")
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,32561.0,,,,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
workclass,32561.0,9.0,Private,22696.0,,,,,,,
fnlwgt,32561.0,,,,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0
education,32561.0,16.0,HS-grad,10501.0,,,,,,,
education_num,32561.0,,,,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0
marital_status,32561.0,7.0,Married-civ-spouse,14976.0,,,,,,,
occupation,32561.0,15.0,Prof-specialty,4140.0,,,,,,,
relationship,32561.0,6.0,Husband,13193.0,,,,,,,
race,32561.0,5.0,White,27816.0,,,,,,,
sex,32561.0,2.0,Male,21790.0,,,,,,,


##### Missing Values Handling
Steps:
- Replace "?" with NaN  
- Fill categorical values with mode  
- Fill numerical values with median  
This follows best practices for structured datasets.


In [8]:
df = df.replace("?", np.nan)

cat_cols = df.select_dtypes(include='object').columns
num_cols = df.select_dtypes(include=np.number).columns

for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

#### Scaling Techniques

##### StandardScaler
- Converts data to mean = 0 and standard deviation = 1  
- Used when features follow approximately a normal distribution  
- Works well with **Logistic Regression, SVM, Linear Regression**

##### MinMaxScaler
- Converts values to range [0, 1]  
- Used when distribution is not normal  
- Useful for algorithms sensitive to feature magnitude (KNN, Neural Nets)


In [10]:
# Applying both scaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler_std = StandardScaler()
scaler_mm = MinMaxScaler()

df_std = df.copy()
df_mm = df.copy()

df_std[num_cols] = scaler_std.fit_transform(df[num_cols])
df_mm[num_cols] = scaler_mm.fit_transform(df[num_cols])

df_std.head()


  from scipy.sparse import csr_matrix, issparse


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.030671,State-gov,-1.063611,Bachelors,1.134739,Never-married,Adm-clerical,Not-in-family,White,Male,0.148453,-0.21666,-0.035429,United-States,<=50K
1,0.837109,Self-emp-not-inc,-1.008707,Bachelors,1.134739,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.14592,-0.21666,-2.222153,United-States,<=50K
2,-0.042642,Private,0.245079,HS-grad,-0.42006,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
3,1.057047,Private,0.425801,11th,-1.197459,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
4,-0.775768,Private,1.408176,Bachelors,1.134739,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.14592,-0.21666,-0.035429,Cuba,<=50K


#### Encoding Categorical Variables
##### One-Hot Encoding  
Used when categories < 5  
Pros: No order imposed  
Cons: Increases number of columns
##### Label Encoding  
Used when categories ≥ 5  
Pros: Compact  
Cons: Introduces order meaning unintentionally


In [11]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

X = df.drop(columns=['income'])
y = df['income']

X_enc = X.copy()

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

for col in X.columns:
    if X[col].dtype == 'object':
        if X[col].nunique() < 5:
            ohe_df = pd.DataFrame(ohe.fit_transform(X[[col]]))
            ohe_df.columns = [f"{col}_{x}" for x in ohe.categories_[0]]
            X_enc = pd.concat([X_enc.drop(columns=[col]), ohe_df], axis=1)
        else:
            X_enc[col] = LabelEncoder().fit_transform(X[col])


##### Feature Engineering

We create 2 new features:

1. **capital_gain_flag** — identifies if capital-gain > 0  
2. **capital_loss_flag** — identifies if capital-loss > 0  
3. Apply **log transform** on skewed 'hours-per-week'


In [14]:
# Feature Engineering

# 1. Capital Gain Flag
X_enc['capital_gain_flag'] = (X['capital_gain'] > 0).astype(int)

# 2. Capital Loss Flag
X_enc['capital_loss_flag'] = (X['capital_loss'] > 0).astype(int)

# 3. Log transformation for skewed feature
X_enc['hours_log'] = np.log1p(X['hours_per_week'])

X_enc.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,capital_gain_flag,capital_loss_flag,hours_log
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,0.0,1.0,1,0,3.713572
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,0.0,1.0,0,0,2.639057
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,0.0,1.0,0,0,3.713572
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,0.0,1.0,0,0,3.713572
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,1.0,0.0,0,0,3.713572


##### Outlier Detection Using Isolation Forest
Isolation Forest identifies unusual data points by isolating them.


In [15]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(random_state=42)
outliers = iso.fit_predict(X_enc.select_dtypes(include=np.number))

mask = outliers == 1  # 1 = inlier, -1 = outlier

X_clean = X_enc[mask]
y_clean = y[mask]

X_clean.shape, y_clean.shape


((27632, 18), (27632,))

##### PPS Score (Predictive Power Score)
Shows how strong each feature predicts the target.
If PPS library is not installed, fallback to Mutual Information.


In [16]:
try:
    import ppscore as pps
    pps_available = True
except:
    pps_available = False

if pps_available:
    pps.matrix(pd.concat([X_clean, y_clean], axis=1))[['x','y','ppscore']]
else:
    from sklearn.feature_selection import mutual_info_classif
    mi = mutual_info_classif(
        X_clean.select_dtypes(include=np.number),
        LabelEncoder().fit_transform(y_clean)
    )
    pd.Series(mi, index=X_clean.select_dtypes(include=np.number).columns).sort_values(ascending=False)
