In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest

In [2]:
# Load the dataset from adult_with_headers.csv (path is relative to the working directory).
df = pd.read_csv("adult_with_headers.csv")

In [5]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [8]:
# Tally null entries per column; the bare last expression renders the resulting Series.
null_flags = df.isnull()
missing_values = null_flags.sum()
missing_values

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [10]:
# Inspect the dtype pandas inferred for each column.
data_types=df.dtypes
data_types

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [12]:
# (rows, columns) of the loaded frame.
df.shape

(32561, 15)

In [14]:
# List the column labels.
print(df.columns)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')


In [15]:
# Numeric columns that the scaling, outlier-detection and correlation cells operate on.
num_feat = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

In [17]:
# Standard Scaling (StandardScaler with default settings)
# The standardized values are stored in their own frame instead of being
# written into df[num_feat]. Written in place they are overwritten by the
# MinMax scaling cell that follows, so this step would leave nothing usable
# behind. Min-max scaling is unaffected by a prior per-column shift/rescale,
# so df ends up with the same [0, 1] values either way (up to float rounding).
Scale_st = StandardScaler()
df_standard = pd.DataFrame(
    Scale_st.fit_transform(df[num_feat]),
    columns=num_feat,
    index=df.index,  # keep row alignment with df
)

In [18]:
# MinMax Scaling: fit a MinMaxScaler on the numeric columns and write the result back.
# NOTE(review): after this cell df[num_feat] no longer holds raw values; the
# later feature-engineering and log-transform cells read the scaled columns.
Scale_minmax = MinMaxScaler()
minmax_values = Scale_minmax.fit_transform(df[num_feat])
df[num_feat] = minmax_values

In [19]:
# One-Hot Encoding for the nominal categorical variables.
# NOTE(review): these columns were described as having fewer than 5 categories
# each -- confirm with df[col].nunique() before relying on that.
# The encoded matrix is kept in `encoded_onehot`; df itself is not modified here.
categorical_features_onehot = [
    'workclass',
    'education',
    'marital_status',
    'relationship',
    'race',
    'sex',
]
encoder_onehot = OneHotEncoder()
onehot_input = df[categorical_features_onehot]
encoded_onehot = encoder_onehot.fit_transform(onehot_input)


In [20]:
# Label Encoding for categorical variables with more than 5 categories.
# Each column gets its OWN fitted encoder, stored in `label_encoders`.
# Refitting a single shared LabelEncoder discards the previous column's
# classes_, so the integer codes for 'occupation' could no longer be mapped
# back with inverse_transform.
categorical_features_label = ['occupation', 'native_country']
label_encoders = {}
for feature in categorical_features_label:
    encoder_label = LabelEncoder()
    df[feature] = encoder_label.fit_transform(df[feature])
    label_encoders[feature] = encoder_label
# `encoder_label` stays bound to the encoder of the last feature, matching the
# state the shared-encoder version left behind.

In [21]:
# Feature Engineering
# NOTE(review): both features are derived from the min-max scaled columns
# (the MinMax scaling cell overwrites df[num_feat]), not from raw units --
# confirm this is intended.

# Difference between capital gain and capital loss.
df['capital diff'] = df['capital_gain'] - df['capital_loss']

# Age relative to weekly working hours. Min-max scaling maps the smallest
# hours_per_week value to exactly 0, so a plain division produces +/-inf there;
# those rows are marked as missing (NaN) instead of carrying infinities.
df['age_hours_ratio'] = df['age'] / df['hours_per_week']
df.loc[df['hours_per_week'] == 0, 'age_hours_ratio'] = np.nan

In [22]:
# Apply a log(1 + x) transformation to 'capital_gain'.
# np.log1p is the dedicated routine for log(1 + x) and stays accurate for
# values close to zero. numpy is already imported in the notebook's first
# cell, so no cell-local re-import is needed.
df['capital_gain_log'] = np.log1p(df['capital_gain'])

In [24]:
# Isolation Forest with contamination=0.01 (the expected proportion of outliers)
# and a fixed random_state so repeated runs give the same result.
clf = IsolationForest(contamination=0.01, random_state=42)

# Train on the numerical columns, then label every row with the fitted model.
numeric_view = df[num_feat]
clf.fit(numeric_view)
outliers = clf.predict(numeric_view)

# Rows labelled -1 are the flagged outliers; keep everything else.
keep_rows = outliers != -1
df_cleaned = df[keep_rows]

# Report how many rows survived the filter.
print("Original dataset shape:", df.shape)
print("Cleaned dataset shape:", df_cleaned.shape)

Original dataset shape: (32561, 18)
Cleaned dataset shape: (32235, 18)


In [26]:
# Pairwise Pearson correlation between the numerical features of the cleaned
# data, computed with pandas' built-in DataFrame.corr.
numeric_cleaned = df_cleaned[num_feat]
correlation_matrix = numeric_cleaned.corr(method='pearson')


In [27]:
# Square the correlation coefficients element-wise (r ** 2). Squaring already
# removes the sign, so no abs() is needed, and the vectorized operator replaces
# the per-element DataFrame.applymap call (deprecated in recent pandas).
# NOTE(review): the variable keeps the name `pps_matrix`, but r ** 2 is a
# symmetric measure of linear association, not the model-based, asymmetric
# Predictive Power Score -- rename or recompute if a true PPS is required.
pps_matrix = correlation_matrix ** 2

print(pps_matrix)

                     age    fnlwgt  education_num  capital_gain  capital_loss  \
age             1.000000  0.005976       0.000988      0.011704      0.001659   
fnlwgt          0.005976  1.000000       0.001922      0.000067      0.000176   
education_num   0.000988  0.001922       1.000000      0.020163      0.006228   
capital_gain    0.011704  0.000067       0.020163      1.000000      0.002273   
capital_loss    0.001659  0.000176       0.006228      0.002273      1.000000   
hours_per_week  0.005061  0.000467       0.020770      0.006858      0.002087   

                hours_per_week  
age                   0.005061  
fnlwgt                0.000467  
education_num         0.020770  
capital_gain          0.006858  
capital_loss          0.002087  
hours_per_week        1.000000  
