In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv('adult_with_headers.csv')
print(df.head())  # View the first few rows
print(df.info())  # Data types and null values


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [3]:
# Check for missing values
print(df.isnull().sum())

# Fill or drop based on context, e.g., impute mode for categorical features, mean for numerical
df['workclass'].fillna(df['workclass'].mode()[0], inplace=True)
df['occupation'].fillna(df['occupation'].mode()[0], inplace=True)
df.dropna(inplace=True)  # Remove any remaining rows with missing values


age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standard Scaling
scaler_std = StandardScaler()
df['age_std'] = scaler_std.fit_transform(df[['age']])

# Min-Max Scaling
scaler_mm = MinMaxScaler()
df[['capital_gain_mm', 'capital_loss_mm']] = scaler_mm.fit_transform(df[['capital_gain', 'capital_loss']])


In [5]:
df = pd.get_dummies(df, columns=['sex', 'income'], drop_first=True)


In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['occupation_enc'] = label_encoder.fit_transform(df['occupation'])


In [7]:
df['age_hours'] = df['age'] * df['hours_per_week']
df['capital_net'] = df['capital_gain'] - df['capital_loss']


In [8]:
import numpy as np

# Log transform for skewed data
df['capital_gain_log'] = np.log1p(df['capital_gain'])


In [9]:
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(df.select_dtypes(include=[float, int]))
df = df[outliers == 1]  # Keep only non-outliers


In [10]:
!pip install ppscore
import ppscore as pps

# Calculate PPS matrix
pps_matrix = pps.matrix(df)[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')
print(pps_matrix)

# Compare with correlation matrix
correlation_matrix = df.corr()
print(correlation_matrix)






x                          age  age_hours       age_std  capital_gain  \
y                                                                       
age               1.000000e+00   0.607119  9.998144e-01      0.000000   
age_hours         4.766769e-01   1.000000  4.765544e-01      0.000000   
age_std           9.998144e-01   0.607119  1.000000e+00      0.000000   
capital_gain      0.000000e+00   0.000000  0.000000e+00      1.000000   
capital_gain_log  0.000000e+00   0.000000  0.000000e+00      0.998889   
capital_gain_mm   0.000000e+00   0.000000  0.000000e+00      0.995957   
capital_loss      0.000000e+00   0.000000  0.000000e+00      0.000000   
capital_loss_mm   0.000000e+00   0.000000  0.000000e+00      0.000000   
capital_net       0.000000e+00   0.000000  0.000000e+00      0.706632   
education         7.477583e-02   0.063273  7.488399e-02      0.000000   
education_num     0.000000e+00   0.000000  0.000000e+00      0.000000   
fnlwgt            0.000000e+00   0.000000  0.000000

  correlation_matrix = df.corr()
