In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import mutual_info_classif

In [4]:
# Load and Explore Dataset
# Load dataset

df = pd.read_csv('/content/adult_with_headers.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Strip spaces from string columns
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Summary statistics
print("\nNumeric Summary:\n", df.describe())


Shape: (32561, 15)
Columns: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']

Numeric Summary:
                 age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count 

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [6]:
# Missing values check
print("\nMissing values:\n", df.isnull().sum())

# Replace '?' with NaN for categorical cleanup
df.replace('?', np.nan, inplace=True)
print("\nMissing after '?' replacement:\n", df.isnull().sum())

# Imputation: categorical = mode, numeric = median
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

print("\nMissing after imputation:\n", df.isnull().sum())



Missing values:
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

Missing after '?' replacement:
 age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64

Missing after imputation:
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [7]:
# Scaling Techniques
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_cols.remove("education_num") if "education_num" in num_cols else None  # optional removal

# Standard Scaling
scaler_std = StandardScaler()
df_std = df.copy()
df_std[num_cols] = scaler_std.fit_transform(df[num_cols])
df_std.to_csv("adult_standard_scaled.csv", index=False)

# Min-Max Scaling
scaler_mm = MinMaxScaler()
df_mm = df.copy()
df_mm[num_cols] = scaler_mm.fit_transform(df[num_cols])
df_mm.to_csv("adult_minmax_scaled.csv", index=False)

print("\nScaling applied. Files saved: adult_standard_scaled.csv, adult_minmax_scaled.csv")




Scaling applied. Files saved: adult_standard_scaled.csv, adult_minmax_scaled.csv


**StandardScaler**
- Centers data around mean=0, std=1.
- Use when data is normally distributed and when using models like Logistic Regression,SVM,PCA.

**Min-MaxScaler**
- Transforms values into the 0-1 range.
- Use when algorithms are distance based(KNN,Clustering)/Neural Network.
- Sensitive to outliers because one extreme values can compress all others.



In [8]:
from sklearn.preprocessing import LabelEncoder

cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
cardinality = {c: df[c].nunique() for c in cat_cols}
card_df = pd.DataFrame.from_dict(cardinality, orient='index', columns=['n_unique']).sort_values('n_unique')
display(card_df)

# Choose columns
onehot_cols = [c for c,n in cardinality.items() if n < 5]
label_cols = [c for c,n in cardinality.items() if n >= 5]

print("One-hot columns (<5):", onehot_cols)
print("Label-encode columns (>=5):", label_cols)

# Apply One-hot (get_dummies)
df_enc = pd.get_dummies(df, columns=onehot_cols, drop_first=False)

# Label encode the others
le = LabelEncoder()
for c in label_cols:
    df_enc[c] = le.fit_transform(df_enc[c].astype(str))

Unnamed: 0,n_unique
sex,2
income,2
race,5
relationship,6
marital_status,7
workclass,8
occupation,14
education,16
native_country,41


One-hot columns (<5): ['sex', 'income']
Label-encode columns (>=5): ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']


**Encoding Techniques**
- One-Hot: no ordinal assumption; but explodes dimension if many categories.

- Label: compact, but introduces ordering — ok for tree models, problematic for linear models unless transformed (or use target encoding with caution).

- Alternative: target encoding or embeddings (for high-cardinality categorical vars) using cross-validated strategies.

****

In [10]:
df_fe = df.copy()

# Feature 1: Net capital gain
df_fe['capital_net'] = df_fe['capital_gain'] - df_fe['capital_loss']
print("Capital_net:",df_fe['capital_net'])
print("\n")

# Feature 2: Age buckets
#df_fe['age_bucket'] = pd.cut(df_fe['age'], bins=[0, 25, 40, 60, 100], labels=[0, 1, 2, 3])
df_fe['age_hours_ratio'] = df_fe['age'] / (df_fe['hours_per_week'] + 1)
print("Age_hours_ratio:",df_fe['age_hours_ratio'])
print("\n")

# Transformation: log1p for skewed capital_gain
df_fe['capital_gain_log1p'] = np.log1p(df_fe['capital_gain'])
print("Capital_gain_log1p:",df_fe['capital_gain_log1p'])
print("\n")

df_fe.to_csv("adult_feature_engineered.csv", index=False)
print("\nFeature engineering applied. File saved: adult_feature_engineered.csv")

Capital_net: 0         2174
1            0
2            0
3            0
4            0
         ...  
32556        0
32557        0
32558        0
32559        0
32560    15024
Name: capital_net, Length: 32561, dtype: int64


Age_hours_ratio: 0        0.951220
1        3.571429
2        0.926829
3        1.292683
4        0.682927
           ...   
32556    0.692308
32557    0.975610
32558    1.414634
32559    1.047619
32560    1.268293
Name: age_hours_ratio, Length: 32561, dtype: float64


Capital_gain_log1p: 0        7.684784
1        0.000000
2        0.000000
3        0.000000
4        0.000000
           ...   
32556    0.000000
32557    0.000000
32558    0.000000
32559    0.000000
32560    9.617471
Name: capital_gain_log1p, Length: 32561, dtype: float64



Feature engineering applied. File saved: adult_feature_engineered.csv


# **Feature Engineering/ Creating new feature**
Feature 1 : *[capital_net] = [capital_gain] - [ capital_loss]*
- In the original dataset, *capital_gain* and *capital_loss* are two separate variables.
However, what really matters for income prediction is the **net effect of capital transactions**.

Feature 2 : *age_hours_ratio = age/(hours_per_week+1)*
- It's tells us **how many years of age per working hour a person has**.
This ratio helps the model understand work effort compared to age.(+1 is just to avoid dividing by zero).

**Transformation of Skewed Feature**
 Log Tranform on *Capital_gain*
- It's use because most people have *(Capital_gain=0)*,but a few have very large values.this makes the data very uneven skewed.
By using* log(1+value)*,we reduce big numbers and make the distribution more balanced,so the model can learn better.




In [13]:
df_fe = df.copy()

# Feature 1: Net capital gain
df_fe['capital_net'] = df_fe['capital_gain'] - df_fe['capital_loss']
print("Capital_net:",df_fe['capital_net'])
print("\n")

# Feature 2: Age buckets
df_fe['age_bucket'] = pd.cut(df_fe['age'], bins=[0, 25, 40, 60, 100], labels=[0, 1, 2, 3])
print("Age_bucket:",df_fe['age_bucket'])
print("\n")

# Transformation: log1p for skewed capital_gain
df_fe['capital_gain_log1p'] = np.log1p(df_fe['capital_gain'])
print("Capital_gain_log1p:",df_fe['capital_gain_log1p'])
print("\n")

df_fe.to_csv("adult_feature_engineered.csv", index=False)
print("\nFeature engineering applied. File saved: adult_feature_engineered.csv")

# Encoding categorical features using one-hot encoding
categorical_cols = df_fe.select_dtypes(include=['object', 'category']).columns.tolist()
df_enc = pd.get_dummies(df_fe, columns=categorical_cols, drop_first=True)

print("\nCategorical features encoded. Shape of df_enc:", df_enc.shape)

# Isolation Forest for outlier removal
iso = IsolationForest(contamination=0.01, random_state=42)
outlier_pred = iso.fit_predict(df_enc.drop("income_>50K", axis=1))

print("\nOutliers detected:", (outlier_pred == -1).sum())

df_no_out = df_enc[outlier_pred == 1]
df_no_out.to_csv("adult_cleaned_no_outliers.csv", index=False)
print("Outliers removed. File saved: adult_cleaned_no_outliers.csv")

Capital_net: 0         2174
1            0
2            0
3            0
4            0
         ...  
32556        0
32557        0
32558        0
32559        0
32560    15024
Name: capital_net, Length: 32561, dtype: int64


Age_bucket: 0        1
1        2
2        1
3        2
4        1
        ..
32556    1
32557    1
32558    2
32559    0
32560    2
Name: age_bucket, Length: 32561, dtype: category
Categories (4, int64): [0 < 1 < 2 < 3]


Capital_gain_log1p: 0        7.684784
1        0.000000
2        0.000000
3        0.000000
4        0.000000
           ...   
32556    0.000000
32557    0.000000
32558    0.000000
32559    0.000000
32560    9.617471
Name: capital_gain_log1p, Length: 32561, dtype: float64



Feature engineering applied. File saved: adult_feature_engineered.csv

Categorical features encoded. Shape of df_enc: (32561, 104)

Outliers detected: 326
Outliers removed. File saved: adult_cleaned_no_outliers.csv


**Outliers detected = 652 rows (~2%)**
-
They distort scaling (MinMax shrinks normal data).

- They mislead distance-based algorithms (KNN, clustering).

- They pull regression lines away from true patterns.

In [None]:
df_fe = df.copy()

# Feature 1: Net capital gain
df_fe['capital_net'] = df_fe['capital_gain'] - df_fe['capital_loss']

# Feature 2: Age buckets
df_fe['age_bucket'] = pd.cut(df_fe['age'], bins=[0, 25, 40, 60, 100], labels=[0, 1, 2, 3])

# Transformation: log1p for skewed capital_gain
df_fe['capital_gain_log1p'] = np.log1p(df_fe['capital_gain'])

df_fe.to_csv("adult_feature_engineered.csv", index=False)
print("\nFeature engineering applied. File saved: adult_feature_engineered.csv")

# Encoding categorical features using one-hot encoding
categorical_cols = df_fe.select_dtypes(include=['object', 'category']).columns.tolist()
df_enc = pd.get_dummies(df_fe, columns=categorical_cols, drop_first=True)

print("\nCategorical features encoded. Shape of df_enc:", df_enc.shape)


Feature engineering applied. File saved: adult_feature_engineered.csv

Categorical features encoded. Shape of df_enc: (32561, 104)


In [14]:
!pip install ppscore



In [22]:
import ppscore as pps

# PPS matrix
pps_matrix = pps.matrix(df_fe[['education_num', 'hours_per_week', 'capital_net', 'income']])
print(pps_matrix[['x', 'y', 'ppscore']])

# Correlation matrix
corr = df_enc.corr()['income_>50K'].sort_values(ascending=False)
print(corr)

                 x               y   ppscore
0    education_num   education_num  1.000000
1    education_num  hours_per_week  0.000000
2    education_num     capital_net  0.000000
3    education_num          income  0.243135
4   hours_per_week   education_num  0.000000
5   hours_per_week  hours_per_week  1.000000
6   hours_per_week     capital_net  0.000000
7   hours_per_week          income  0.047278
8      capital_net   education_num  0.009724
9      capital_net  hours_per_week  0.000000
10     capital_net     capital_net  1.000000
11     capital_net          income  0.400876
12          income   education_num  0.028055
13          income  hours_per_week  0.000000
14          income     capital_net  0.000000
15          income          income  1.000000
income_>50K                          1.000000
marital_status_Married-civ-spouse    0.444696
education_num                        0.335154
capital_gain_log1p                   0.289462
age                                  0.234037
     

**Correlation:**

Shows only linear relationships.

Example: education_num vs income might show weak correlation, but actually has predictive power.

**PPS:**

Works for categorical + numeric features.

Detects non-linear effects (e.g., higher education leads to much higher income after a threshold).

Always between 0–1.

In [18]:
'''X = df_enc.drop("income_>50K", axis=1)
y = df_enc["income_>50K"]

mi_scores = mutual_info_classif(X, y, discrete_features='auto', random_state=42)
mi_df = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

print("\nMutual Information scores (approx PPS):\n", mi_df.head(10))

# Compare with Pearson correlation
corr = df_enc.corr()['income_>50K'].sort_values(ascending=False)
print("\nPearson correlation with income:\n", corr.head(10))

print("\nAll preprocessing, feature engineering, and selection steps completed.")


Mutual Information scores (approx PPS):
 capital_net                          0.119807
marital_status_Married-civ-spouse    0.106731
capital_gain_log1p                   0.082778
capital_gain                         0.077332
age_hours_ratio                      0.074881
age                                  0.067368
education_num                        0.063734
marital_status_Never-married         0.063590
hours_per_week                       0.045069
relationship_Own-child               0.040999
dtype: float64

Pearson correlation with income:
 income_>50K                          1.000000
marital_status_Married-civ-spouse    0.444696
education_num                        0.335154
capital_gain_log1p                   0.289462
age                                  0.234037
age_bucket_2                         0.231236
hours_per_week                       0.229689
capital_gain                         0.223329
sex_Male                             0.215980
occupation_Exec-managerial        