In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
from sklearn import preprocessing
from sklearn.ensemble import IsolationForest
import ppscore as pps

### 1. Data exploration and preprocessing

In [9]:
adults= pd.read_csv("/content/adult_with_headers.cs")
adults

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [10]:
adults.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [11]:
adults.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [12]:
adults.workclass.value_counts()

Unnamed: 0,workclass
Private,22696
Self-emp-not-inc,2541
Local-gov,2093
?,1836
State-gov,1298
Self-emp-inc,1116
Federal-gov,960
Without-pay,14
Never-worked,7


In [13]:
adults.occupation.value_counts()

Unnamed: 0,occupation
Prof-specialty,4140
Craft-repair,4099
Exec-managerial,4066
Adm-clerical,3770
Sales,3650
Other-service,3295
Machine-op-inspct,2002
?,1843
Transport-moving,1597
Handlers-cleaners,1370


##### Here we can see that no missing values are there in any columns of data but workclass, occupation, and other columns are having ? group in them which belongs to those of missing value category. So, I decided to drop those rows in which ? are present.

In [14]:
warnings.filterwarnings('ignore')
adult1 = adults[~adults.apply(lambda x: x.str.contains('\?', na=False).any(), axis=1)]
adult1.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [15]:
adult1.reset_index(drop=True, inplace=True)

In [16]:
adult1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30162 entries, 0 to 30161
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30162 non-null  int64 
 1   workclass       30162 non-null  object
 2   fnlwgt          30162 non-null  int64 
 3   education       30162 non-null  object
 4   education_num   30162 non-null  int64 
 5   marital_status  30162 non-null  object
 6   occupation      30162 non-null  object
 7   relationship    30162 non-null  object
 8   race            30162 non-null  object
 9   sex             30162 non-null  object
 10  capital_gain    30162 non-null  int64 
 11  capital_loss    30162 non-null  int64 
 12  hours_per_week  30162 non-null  int64 
 13  native_country  30162 non-null  object
 14  income          30162 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.5+ MB


In [17]:
adult1.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.437902,189793.8,10.121312,1092.007858,88.372489,40.931238
std,13.134665,105653.0,2.549995,7406.346497,404.29837,11.979984
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117627.2,9.0,0.0,0.0,40.0
50%,37.0,178425.0,10.0,0.0,0.0,40.0
75%,47.0,237628.5,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


#### Apply scaling technique for numerical column

##### Filter numerical columns

In [18]:
adult_numerical= adult1.iloc[:,[0,2,4,10,11,12]]
adult_numerical

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40
...,...,...,...,...,...,...
30157,27,257302,12,0,0,38
30158,40,154374,9,0,0,40
30159,58,151910,9,0,0,40
30160,22,201490,9,0,0,20


##### Standard scaling

In [19]:
def get_standardized(x):
    return (x-x.mean())/x.std()

In [20]:
adult_stand= get_standardized(adult_numerical)
adult_stand.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.042795,-1.062704,1.1289,0.14609,-0.218582,-0.077733
1,0.880274,-1.007855,1.1289,-0.147442,-0.218582,-2.331492
2,-0.033339,0.244689,-0.439731,-0.147442,-0.218582,-0.077733
3,1.108677,0.425233,-1.224046,-0.147442,-0.218582,-0.077733
4,-0.794684,1.406635,1.1289,-0.147442,-0.218582,-0.077733


##### Min-Max scaling

In [21]:
def get_normalize(x):
    return (x-x.min())/(x.max()-x.min())

In [22]:
adult_normal= get_normalize(adult_numerical)
adult_normal.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.30137,0.043338,0.8,0.02174,0.0,0.397959
1,0.452055,0.047277,0.8,0.0,0.0,0.122449
2,0.287671,0.137244,0.533333,0.0,0.0,0.397959
3,0.493151,0.150212,0.4,0.0,0.0,0.397959
4,0.150685,0.220703,0.8,0.0,0.0,0.397959


Standard Scaling:¶
Transforms features to have a mean of 0 and a standard deviation of 1. Ideal for algorithms sensitive to scale, like linear regression and SVMs. Handles normal distributions well but can be affected by outliers.








Min-Max Scaling:
Rescales features to a fixed range, usually [0, 1]. Useful for algorithms needing bounded inputs, such as neural networks. It can be sensitive to outliers but ensures features are within a specific range.

### 2. Encoding technique

In [23]:
unique_counts = adult1.nunique()
print(unique_counts)

age                  72
workclass             7
fnlwgt            20263
education            16
education_num        16
marital_status        7
occupation           14
relationship          6
race                  5
sex                   2
capital_gain        118
capital_loss         90
hours_per_week       94
native_country       41
income                2
dtype: int64


##### Label encoding of variable having more than 5 categories

In [24]:
label_encoder = preprocessing.LabelEncoder()
adult1['workclass']= label_encoder.fit_transform(adult1['workclass'])
adult1['education']= label_encoder.fit_transform(adult1['education'])
adult1['marital_status']= label_encoder.fit_transform(adult1['marital_status'])
adult1['occupation']= label_encoder.fit_transform(adult1['occupation'])
adult1['relationship']= label_encoder.fit_transform(adult1['relationship'])
adult1['native_country']= label_encoder.fit_transform(adult1['native_country'])

##### One-hot encoding for variable having les than 5 categories

In [25]:
adult1= pd.get_dummies(adult1, columns=['race','sex','income'], drop_first=True, dtype=int)

One-Hot Encoding is ideal for categorical variables with fewer categories, avoiding ordinal assumptions and improving model performance but can lead to high dimensionality and sparsity issues. Label Encoding is compact and efficient for high-cardinality variables but may introduce misleading ordinal relationships and misinterpretation by models. Choose One-Hot Encoding for nominal variables with few categories and Label Encoding for high-cardinality or ordinal variables, keeping in mind the potential implications for model performance.

### 3. Feature engineering

Create 2 new features age_group which divide continuous age column into category and second one is capital_gain_loss_ratio which gives us idead abouts both the gain and loss column i.e; if someone is net gainer or loser.

In [26]:
bins = [0, 24, 34, 44, 54, 64, 100]
labels = ['17-24', '25-34', '35-44', '45-54', '55-64', '65+']
adult1['age_group'] = pd.cut(adult1['age'], bins=bins, labels=labels, right=False)

In [27]:
adult1['capital_gain_loss_ratio'] = adult1['capital_gain'] / (adult1['capital_loss'] + 1)

In [28]:
adult1['age_group']= label_encoder.fit_transform(adult1['age_group'])

In [29]:
adult1.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Male,income_ >50K,age_group,capital_gain_loss_ratio
0,39,5,77516,9,13,4,0,1,2174,0,40,38,0,0,0,1,1,0,2,2174.0
1,50,4,83311,9,13,2,3,0,0,0,13,38,0,0,0,1,1,0,3,0.0
2,38,2,215646,11,9,0,5,1,0,0,40,38,0,0,0,1,1,0,2,0.0
3,53,2,234721,1,7,2,5,0,0,0,40,38,0,1,0,0,1,0,3,0.0
4,28,2,338409,9,13,2,9,5,0,0,40,4,0,1,0,0,0,0,1,0.0
5,37,2,284582,12,14,2,3,5,0,0,40,38,0,0,0,1,0,0,2,0.0
6,49,2,160187,6,5,3,7,1,0,0,16,22,0,1,0,0,0,0,3,0.0
7,52,4,209642,11,9,2,3,0,0,0,45,38,0,0,0,1,1,1,3,0.0
8,31,2,45781,12,14,4,9,1,14084,0,50,38,0,0,0,1,0,1,1,14084.0
9,42,2,159449,9,13,2,3,0,5178,0,40,38,0,0,0,1,1,1,2,5178.0


Age Group: Categorizing age into distinct groups helps capture life stage-related patterns, improving model performance by simplifying complex relationships.

Capital Gain/Loss Ratio: This feature normalizes financial gain relative to loss, providing insights into an individual’s financial situation and its potential impact on income.

These features enhance predictive accuracy by highlighting meaningful patterns and relationships in the data.

##### Apply log transformation to skewed numerical variable

In [30]:
adult1['capital_gain_log'] = np.log1p(adult1['capital_gain'])
adult1['capital_loss_log'] = np.log1p(adult1['capital_loss'])
adult1['capital_gain_loss_ratio_log'] = np.log1p(adult1['capital_gain_loss_ratio'])

Applying a log transformation to the capital_gain, capital_loss, capital_gain_loss_ratio features normalizes its skewed distribution, reduces the impact of outliers, and improves model performance by making the data more normally distributed.

In [31]:
adult1.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,...,race_ Black,race_ Other,race_ White,sex_ Male,income_ >50K,age_group,capital_gain_loss_ratio,capital_gain_log,capital_loss_log,capital_gain_loss_ratio_log
0,39,5,77516,9,13,4,0,1,2174,0,...,0,0,1,1,0,2,2174.0,7.684784,0.0,7.684784
1,50,4,83311,9,13,2,3,0,0,0,...,0,0,1,1,0,3,0.0,0.0,0.0,0.0
2,38,2,215646,11,9,0,5,1,0,0,...,0,0,1,1,0,2,0.0,0.0,0.0,0.0
3,53,2,234721,1,7,2,5,0,0,0,...,1,0,0,1,0,3,0.0,0.0,0.0,0.0
4,28,2,338409,9,13,2,9,5,0,0,...,1,0,0,0,0,1,0.0,0.0,0.0,0.0
5,37,2,284582,12,14,2,3,5,0,0,...,0,0,1,0,0,2,0.0,0.0,0.0,0.0
6,49,2,160187,6,5,3,7,1,0,0,...,1,0,0,0,0,3,0.0,0.0,0.0,0.0
7,52,4,209642,11,9,2,3,0,0,0,...,0,0,1,1,1,3,0.0,0.0,0.0,0.0
8,31,2,45781,12,14,4,9,1,14084,0,...,0,0,1,0,1,1,14084.0,9.552866,0.0,9.552866
9,42,2,159449,9,13,2,3,0,5178,0,...,0,0,1,1,1,2,5178.0,8.552367,0.0,8.552367


In [32]:
adult1.describe()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,...,race_ Black,race_ Other,race_ White,sex_ Male,income_ >50K,age_group,capital_gain_loss_ratio,capital_gain_log,capital_loss_log,capital_gain_loss_ratio_log
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,...,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.437902,2.199324,189793.8,10.333764,10.121312,2.580134,5.95985,1.418341,1092.007858,88.372489,...,0.093396,0.007659,0.85979,0.675685,0.248922,1.977588,1092.007858,0.743647,0.355226,0.743647
std,13.134665,0.953925,105653.0,3.812292,2.549995,1.498016,4.029566,1.601338,7406.346497,404.29837,...,0.290991,0.087179,0.34721,0.468126,0.432396,1.325018,7406.346497,2.470518,1.59501,2.470518
min,17.0,0.0,13769.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.0,2.0,117627.2,9.0,9.0,2.0,2.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,37.0,2.0,178425.0,11.0,10.0,2.0,6.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0
75%,47.0,2.0,237628.5,12.0,13.0,4.0,9.0,3.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0
max,90.0,6.0,1484705.0,15.0,16.0,6.0,13.0,5.0,99999.0,4356.0,...,1.0,1.0,1.0,1.0,1.0,5.0,99999.0,11.512925,8.379539,11.512925


### 4. Feature selection

#### Isolation Forest

In [34]:
model = IsolationForest(random_state=10,contamination=0.1)
model.fit(adult1)

In [35]:
adult1['outlier'] = model.fit_predict(adult1)

##### Remove outliers

In [36]:
adult_cleaned = adult1[adult1['outlier'] == 1].drop(columns=['outlier'])
adult_cleaned.reset_index(drop=True, inplace=True)
adult_cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,...,race_ Black,race_ Other,race_ White,sex_ Male,income_ >50K,age_group,capital_gain_loss_ratio,capital_gain_log,capital_loss_log,capital_gain_loss_ratio_log
0,50,4,83311,9,13,2,3,0,0,0,...,0,0,1,1,0,3,0.0,0.0,0.0,0.0
1,38,2,215646,11,9,0,5,1,0,0,...,0,0,1,1,0,2,0.0,0.0,0.0,0.0
2,53,2,234721,1,7,2,5,0,0,0,...,1,0,0,1,0,3,0.0,0.0,0.0,0.0
3,37,2,284582,12,14,2,3,5,0,0,...,0,0,1,0,0,2,0.0,0.0,0.0,0.0
4,52,4,209642,11,9,2,3,0,0,0,...,0,0,1,1,1,3,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27140,22,2,310152,15,10,4,10,1,0,0,...,0,0,1,1,0,0,0.0,0.0,0.0,0.0
27141,27,2,257302,7,12,2,12,5,0,0,...,0,0,1,0,0,1,0.0,0.0,0.0,0.0
27142,40,2,154374,11,9,2,6,0,0,0,...,0,0,1,1,1,2,0.0,0.0,0.0,0.0
27143,58,2,151910,11,9,6,0,4,0,0,...,0,0,1,0,0,4,0.0,0.0,0.0,0.0


#### Apply PPS (Predictive power score)

In [42]:
pps_matrix = pps.matrix(adult_cleaned)
pps_matrix

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,age,age,1.000000,predict_itself,True,,0.000000,1.000000,
1,age,workclass,0.000000,regression,True,mean absolute error,0.414600,0.531676,DecisionTreeRegressor()
2,age,fnlwgt,0.000000,regression,True,mean absolute error,75019.788000,76343.628423,DecisionTreeRegressor()
3,age,education,0.000000,regression,True,mean absolute error,2.611600,2.684886,DecisionTreeRegressor()
4,age,education_num,0.000000,regression,True,mean absolute error,1.758000,1.795076,DecisionTreeRegressor()
...,...,...,...,...,...,...,...,...,...
524,capital_gain_loss_ratio_log,age_group,0.000000,regression,True,mean absolute error,1.038000,1.051437,DecisionTreeRegressor()
525,capital_gain_loss_ratio_log,capital_gain_loss_ratio,0.989886,regression,True,mean absolute error,164.900800,1.667800,DecisionTreeRegressor()
526,capital_gain_loss_ratio_log,capital_gain_log,0.999376,regression,True,mean absolute error,0.292947,0.000183,DecisionTreeRegressor()
527,capital_gain_loss_ratio_log,capital_loss_log,0.000000,regression,True,mean absolute error,0.225293,0.436499,DecisionTreeRegressor()


#### Correlation matrix

In [39]:
corr_matrix= adult_cleaned.corr()
corr_matrix

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,...,race_ Black,race_ Other,race_ White,sex_ Male,income_ >50K,age_group,capital_gain_loss_ratio,capital_gain_log,capital_loss_log,capital_gain_loss_ratio_log
age,1.0,0.075397,-0.07889,0.004708,0.04179,-0.301956,-0.008306,-0.262369,0.05028,0.013466,...,-0.03238,-0.032645,0.048115,0.088522,0.227882,0.972936,0.05028,0.044176,0.013081,0.044176
workclass,0.075397,1.0,-0.036378,0.015401,0.023271,-0.03132,0.007911,-0.06675,-0.023757,-0.028582,...,-0.05773,-0.016341,0.058529,0.070515,-0.005568,0.072427,-0.023757,-0.019863,-0.029916,-0.019863
fnlwgt,-0.07889,-0.036378,1.0,-0.027741,-0.048194,0.034366,-0.000415,0.006906,-0.002771,-0.017344,...,0.103049,0.003754,-0.050843,0.027363,-0.010218,-0.07568,-0.002771,-0.004783,-0.017211,-0.004783
education,0.004708,0.015401,-0.027741,1.0,0.330427,-0.028613,-0.036324,-0.002668,0.014428,0.018132,...,0.014091,-0.002923,-0.016844,-0.041512,0.066218,-0.002396,0.014428,0.015194,0.018507,0.015194
education_num,0.04179,0.023271,-0.048194,0.330427,1.0,-0.044305,0.085077,-0.083597,0.058537,0.074195,...,-0.049972,-0.017857,0.033132,-0.010201,0.301557,0.047485,0.058537,0.04225,0.071616,0.04225
marital_status,-0.301956,-0.03132,0.034366,-0.028613,-0.044305,1.0,0.02474,0.192579,-0.060698,-0.031807,...,0.081519,0.015331,-0.081641,-0.119387,-0.190658,-0.304917,-0.060698,-0.060492,-0.032064,-0.060492
occupation,-0.008306,0.007911,-0.000415,-0.036324,0.085077,0.02474,1.0,-0.052955,-0.000206,0.002477,...,-0.003477,0.001915,0.003046,0.061568,0.045489,-0.007214,-0.000206,0.000602,0.00231,0.000602
relationship,-0.262369,-0.06675,0.006906,-0.002668,-0.083597,0.192579,-0.052955,1.0,-0.134148,-0.096809,...,0.133933,0.00473,-0.125756,-0.574602,-0.271916,-0.261652,-0.134148,-0.131978,-0.090213,-0.131978
capital_gain,0.05028,-0.023757,-0.002771,0.014428,0.058537,-0.060698,-0.000206,-0.134148,1.0,-0.027916,...,-0.050586,-0.011914,0.061227,0.110932,0.198,0.052347,1.0,0.931882,-0.028341,0.931882
capital_loss,0.013466,-0.028582,-0.017344,0.018132,0.074195,-0.031807,0.002477,-0.096809,-0.027916,1.0,...,-0.049999,-0.011798,0.060549,0.080521,0.137749,0.015751,-0.027916,-0.030696,0.98971,-0.030696


Complementary Insights: The PPS matrix provides a broader view of predictive power that includes non-linear relationships, while the correlation matrix focuses on linear relationships. Comparing both helps in understanding the full spectrum of feature interactions and can guide model selection and feature engineering strategies.