In [3]:
##Data Exploration and Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

In [3]:
df = pd.read_csv(url, names=columns, na_values=' ?', skipinitialspace=True)

In [4]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


In [8]:
print(df.describe())

                age        fnlwgt  education-num  capital-gain  capital-loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours-per-week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [9]:
print(df.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [10]:
##Missing Value Handling:

In [63]:
df_cleaned=df.dropna()

In [65]:
print("\nShape before cleaning:", df.shape)


Shape before cleaning: (32235, 19)


In [67]:
print("Shape after cleaning:", df_cleaned.shape)

Shape after cleaning: (32235, 19)


In [12]:
##Scaling Techniques:

In [13]:
numerical_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

In [14]:
standard_scaled = pd.DataFrame(StandardScaler().fit_transform(df[numerical_cols]), columns=numerical_cols)

In [15]:
standard_scaled

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...
32556,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [16]:
minmax_scaled = pd.DataFrame(MinMaxScaler().fit_transform(df[numerical_cols]), columns=numerical_cols)

In [17]:
minmax_scaled

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,0.301370,0.044302,0.800000,0.021740,0.0,0.397959
1,0.452055,0.048238,0.800000,0.000000,0.0,0.122449
2,0.287671,0.138113,0.533333,0.000000,0.0,0.397959
3,0.493151,0.151068,0.400000,0.000000,0.0,0.397959
4,0.150685,0.221488,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...
32556,0.136986,0.166404,0.733333,0.000000,0.0,0.377551
32557,0.315068,0.096500,0.533333,0.000000,0.0,0.397959
32558,0.561644,0.094827,0.533333,0.000000,0.0,0.397959
32559,0.068493,0.128499,0.533333,0.000000,0.0,0.193878


In [18]:
##Encoding Techniques

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
ohe_cols = [col for col in df.select_dtypes('object').columns if df[col].nunique() < 5 and col != 'income']

In [21]:
df = pd.get_dummies(df, columns=ohe_cols)

In [22]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,native-country,income,sex_Female,sex_Male
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,2174,0,40,United-States,<=50K,False,True
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,13,United-States,<=50K,False,True
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,40,United-States,<=50K,False,True
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,40,United-States,<=50K,False,True
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,40,Cuba,<=50K,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,0,0,38,United-States,<=50K,True,False
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,0,0,40,United-States,>50K,False,True
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,0,0,40,United-States,<=50K,True,False
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,0,0,20,United-States,<=50K,False,True


In [23]:
le = LabelEncoder()
for col in df.select_dtypes('object').columns:
    if col != 'income':
        df[col] = le.fit_transform(df[col])

In [24]:
##Feature Engineering

In [45]:
df['working_hours_group'] = pd.cut(df['hours-per-week'], bins=[0, 20, 40, 60, 100], labels=['Low', 'Medium', 'High', 'Very High'])


In [47]:
df['working_hours_group'] 

0        Medium
1           Low
2        Medium
3        Medium
4        Medium
          ...  
32556    Medium
32557    Medium
32558    Medium
32559       Low
32560    Medium
Name: working_hours_group, Length: 32561, dtype: category
Categories (4, object): ['Low' < 'Medium' < 'High' < 'Very High']

In [49]:
df['age_bucket'] = pd.cut(df['age'], bins=[0, 25, 45, 65, 100], labels=['Young', 'Mid-age', 'Senior', 'Old'])


In [51]:
df['age_bucket']

0        Mid-age
1         Senior
2        Mid-age
3         Senior
4        Mid-age
          ...   
32556    Mid-age
32557    Mid-age
32558     Senior
32559      Young
32560     Senior
Name: age_bucket, Length: 32561, dtype: category
Categories (4, object): ['Young' < 'Mid-age' < 'Senior' < 'Old']

In [53]:
df['log_fnlwgt'] = np.log1p(df['fnlwgt'])

In [55]:
df['log_fnlwgt']

0        11.258253
1        11.330348
2        12.281398
3        12.366157
4        12.732013
           ...    
32556    12.458010
32557    11.947140
32558    11.931050
32559    12.213500
32560    12.570466
Name: log_fnlwgt, Length: 32561, dtype: float64

In [30]:
 ##Feature Selection

In [31]:
##Isolation Forest for Outlier Detection:

In [57]:
from sklearn.ensemble import IsolationForest

In [59]:
iso = IsolationForest(contamination=0.01, random_state=42)
outliers = iso.fit_predict(df[numerical_cols])
df = df[outliers == 1]

In [61]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,native-country,income,sex_Female,sex_Male,working_hours_group,age_bucket,log_fnlwgt
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,<=50K,False,True,Medium,Mid-age,11.258253
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,<=50K,False,True,Low,Senior,11.330348
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,<=50K,False,True,Medium,Mid-age,12.281398
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,<=50K,False,True,Medium,Senior,12.366157
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,<=50K,True,False,Medium,Mid-age,12.732013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,0,38,39,<=50K,True,False,Medium,Mid-age,12.458010
32557,40,4,154374,11,9,2,7,0,4,0,0,40,39,>50K,False,True,Medium,Mid-age,11.947140
32558,58,4,151910,11,9,6,1,4,4,0,0,40,39,<=50K,True,False,Medium,Senior,11.931050
32559,22,4,201490,11,9,4,1,3,4,0,0,20,39,<=50K,False,True,Low,Young,12.213500


In [34]:
##PPS Score vs Correlation:

In [35]:
!pip install ppscore

Collecting pandas<2.0.0,>=1.0.0 (from ppscore)
  Using cached pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Using cached pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.0 MB)
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.3
    Uninstalling pandas-2.2.3:
      Successfully uninstalled pandas-2.2.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 1.5.3 which is incompatible.
xarray 2025.3.1 requires pandas>=2.1, but you have pandas 1.5.3 which is incompatible.
dask-expr 1.1.21 requires pandas>=2, but you have pandas 1.5.3 which is incompatible.
cudf-cu12 25.2.1 requires pandas<2.2.4dev0,>=2.0, but you have pandas 1.5.3 which is incompatible.
mizani 0.13.5 requir

In [1]:
!pip install --upgrade --force-reinstall numpy pandas

Collecting numpy
  Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
Using cached pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Using cached pytz-2025.2-py2.py3-n

In [1]:
import ppscore as pps

In [4]:
import pandas as pd

In [6]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

In [7]:
df = pd.read_csv(url, names=columns, na_values=' ?', skipinitialspace=True)

In [8]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [9]:
pps_matrix = df.apply(lambda col: pps.score(df, col.name, 'income')['ppscore'])

In [11]:
df.dropna(inplace=True)

In [12]:
ohe_cols = [col for col in df.select_dtypes('object').columns if df[col].nunique() < 5 and col != 'income']
le_cols = [col for col in df.select_dtypes('object').columns if col not in ohe_cols and col != 'income']


In [13]:
df = pd.get_dummies(df, columns=ohe_cols)

In [16]:
from sklearn.preprocessing import LabelEncoder # Ensure LabelEncoder is imported

In [17]:
le = LabelEncoder()
for col in le_cols:
    df[col] = le.fit_transform(df[col])

In [19]:
df['income'] = le.fit_transform(df['income'])

In [21]:
import numpy as np

In [22]:
df['working_hours_group'] = pd.cut(df['hours-per-week'], bins=[0, 20, 40, 60, 100], labels=['Low', 'Medium', 'High', 'Very High'], include_lowest=True)
df['age_bucket'] = pd.cut(df['age'], bins=[0, 25, 45, 65, 100], labels=['Young', 'Mid-age', 'Senior', 'Old'], include_lowest=True)
df['log_fnlwgt'] = np.log1p(df['fnlwgt'])

In [23]:
if 'working_hours_group' in df.columns and df['working_hours_group'].dtype == 'object':
    df['working_hours_group'] = le.fit_transform(df['working_hours_group'])
if 'age_bucket' in df.columns and df['age_bucket'].dtype == 'object':
    df['age_bucket'] = le.fit_transform(df['age_bucket'])

In [26]:
non_numeric_cols = df.select_dtypes(include=['object', 'category']).columns

In [27]:
for col in non_numeric_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [28]:
corr_matrix = df.corr()['income'].sort_values(ascending=False)

In [29]:
print("PPS Scores:\n", pps_matrix.sort_values(ascending=False))

PPS Scores:
 income            1.000000
capital-gain      0.297123
education         0.243135
education-num     0.243135
capital-loss      0.141755
workclass         0.094056
occupation        0.092410
hours-per-week    0.047278
native-country    0.009409
age               0.005415
fnlwgt            0.000000
marital-status    0.000000
sex               0.000000
relationship      0.000000
race              0.000000
dtype: float64


In [30]:
print("Correlation:\n", corr_matrix)

Correlation:
 income                 1.000000
education-num          0.335154
age                    0.234037
hours-per-week         0.229689
capital-gain           0.223329
sex_Male               0.215980
capital-loss           0.150526
education              0.079317
occupation             0.075468
race                   0.071846
workclass              0.051604
native-country         0.015840
log_fnlwgt            -0.001374
fnlwgt                -0.009463
age_bucket            -0.125057
working_hours_group   -0.171391
marital-status        -0.199307
sex_Female            -0.215980
relationship          -0.250918
Name: income, dtype: float64


In [31]:
##Discuss the scenarios where each scaling technique is preferred and why

Standard Scaling (Standardization)
Formula:

𝑧
=
𝑥
−
𝜇
𝜎
z=
σ
x−μ
​

(Mean = 0, Std. Dev = 1)
Use StandardScaler when:
Scenario	Reason
🔍 Data is normally distributed or approximately Gaussian	Preserves distribution’s shape
📉 Models assume normality or rely on distances (e.g., SVM, Logistic Regression, PCA, K-Means, Linear Regression)	Ensures features contribute equally
📊 You have outliers, but still want to retain their effect	StandardScaler doesn’t squash outliers like MinMaxScaler does

❗ Be cautious if:
Your data is heavily skewed or has extreme outliers — standardization doesn’t eliminate their influence.

📏 2. Min-Max Scaling (Normalization)
Formula:

𝑥
𝑠
𝑐
𝑎
𝑙
𝑒
𝑑
=
𝑥
−
𝑥
𝑚
𝑖
𝑛
𝑥
𝑚
𝑎
𝑥
−
𝑥
𝑚
𝑖
𝑛
x
scaled
​
 =
x
max
​
 −x
min
​

x−x
min
​

​

(Scales to [0, 1])

✅ Use MinMaxScaler when:
Scenario	Reason
🧠 You are using neural networks / deep learning	Helps with gradient descent convergence by bounding values between 0 and 1
📉 You want to compress the scale and eliminate outlier impact	It squashes all values into the same range
🎯 All features have known and bounded min/max values	Ensures no data drift beyond expected bounds

❗ Be cautious if:
Your data has outliers — they will compress the rest of the data, reducing its resolution.

🆚 Summary Comparison
Criterion	StandardScaler	MinMaxScaler
Output range	Mean = 0, Std Dev = 1	[0, 1]
Sensitive to outliers?	✅ Yes	✅ Yes (even more sensitive)
Use with	SVM, PCA, KNN, Linear Models	Neural Networks, Image data
Keeps outlier scale	✅ Yes	❌ No (squashes)
Handles Gaussian data	✅ Yes	❌ No



In [32]:
##Discuss the pros and cons of One-Hot Encoding and Label Encoding

One-Hot Encoding
Pros:
Treats categories as separate (good for nominal data).
Avoids introducing false order.
Suitable for many algorithms sensitive to magnitude.
Cons:
Increases dimensionality significantly.
Can create sparse data.
Potential for multicollinearity (dummy variable trap).
Label Encoding
Pros:
Reduces dimensionality.
Saves memory.
Cons:
Introduces artificial order (bad for nominal data).
Misleads algorithms into treating labels as continuous.
Performance can be sensitive to value assignment.
Summary:
One-Hot Encoding: Use for nominal categories (no order).
Label Encoding: Use for ordinal categories (clear order), or sometimes with tree-based models for nominal data, but generally less preferred for nominal data.


In [33]:
##discuss the relationship between features. Compare its findings with the correlation matrix.

Predictive Power Score (PPS):

Measures how well one feature can predict another feature.
The score ranges from 0 to 1, where 0 means no predictive power and 1 means perfect predictive power.
It can capture non-linear relationships and asymmetric relationships (e.g., feature A predicts B well, but B doesn't predict A well).
It can be applied to both numerical and categorical features (though categorical features need to be handled appropriately, which we've done through encoding).
Correlation Matrix (Pearson Correlation):

Measures the linear relationship between two numerical features.
The score ranges from -1 to 1, where 1 indicates a perfect positive linear correlation, -1 indicates a perfect negative linear correlation, and 0 indicates no linear correlation.
It only captures linear relationships.
It assumes the variables are numerical and typically requires them to be approximately normally distributed for the p-value interpretation to be valid.
Discussion and Comparison:

Linear vs. Non-Linear/Asymmetric Relationships: The correlation matrix will only show the strength and direction of linear relationships. The PPS matrix can reveal relationships that are non-linear or where the prediction is much stronger in one direction than the other. For example, age might predict income to some extent (perhaps older people tend to earn more), but income is unlikely to predict age strongly. PPS can capture this asymmetry, while correlation measures a symmetric relationship.

Categorical Features: Correlation is designed for numerical features. While we have encoded our categorical features into numerical ones (using One-Hot and Label Encoding), interpreting the Pearson correlation between these encoded features can be misleading, especially for Label Encoded variables where the numbers imply an artificial order. PPS, however, is designed to handle different data types (or their encoded forms) and assess their predictive power, making it more suitable for understanding relationships involving categorical features.