In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('/content/adult_with_headers.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [None]:
# step 1

In [5]:
summary_stats = df.describe(include='all')


summary_stats



Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


In [6]:
missing_values = df.isnull().sum()

missing_values

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [7]:
data_types = df.dtypes
data_types

Unnamed: 0,0
age,int64
workclass,object
fnlwgt,int64
education,object
education_num,int64
marital_status,object
occupation,object
relationship,object
race,object
sex,object


In [8]:
# Selecting numerical features
numerical_features = df.select_dtypes(include=['int64']).columns

In [9]:
# Apply Standard Scaling
scaler_standard = StandardScaler()
standard_scaled_data = scaler_standard.fit_transform(df[numerical_features])

In [10]:
# Apply Min-Max Scaling
scaler_minmax = MinMaxScaler()
minmax_scaled_data = scaler_minmax.fit_transform(df[numerical_features])

In [11]:
# Converting scaled data back to DataFrame for easier interpretation
df_standard_scaled = pd.DataFrame(standard_scaled_data, columns=numerical_features)
# Displaying the first few rows of each scaled DataFrame
df_standard_scaled.head()




Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


In [12]:
df_minmax_scaled = pd.DataFrame(minmax_scaled_data, columns=numerical_features)
df_minmax_scaled.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959
1,0.452055,0.048238,0.8,0.0,0.0,0.122449
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959
3,0.493151,0.151068,0.4,0.0,0.0,0.397959
4,0.150685,0.221488,0.8,0.0,0.0,0.397959


In [None]:
# step 2


In [13]:
# Identify categorical features
categorical_features = df.select_dtypes(include=['object']).columns

In [14]:
# Count unique values in each categorical feature
unique_counts = df[categorical_features].nunique()
unique_counts

Unnamed: 0,0
workclass,9
education,16
marital_status,7
occupation,15
relationship,6
race,5
sex,2
native_country,42
income,2


In [15]:
# Separate categorical features based on the number of unique values
one_hot_encoder_features = unique_counts[unique_counts <= 5].index
label_encoding_features = unique_counts[unique_counts > 5].index




In [16]:
one_hot_encoder_features

Index(['race', 'sex', 'income'], dtype='object')

In [17]:
label_encoding_features

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'native_country'],
      dtype='object')

In [18]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder




In [19]:

# Initialize encoders
one_hot_encoder = OneHotEncoder(sparse_output=False,drop='first')  # Drop first to avoid dummy variable trap
label_encoder = LabelEncoder()



In [20]:
# Apply One-Hot Encoding
one_hot_encoded = pd.DataFrame(one_hot_encoder.fit_transform([one_hot_encoder_features]),
                               columns=one_hot_encoder.get_feature_names_out(one_hot_encoder_features))


In [21]:
# Apply Label Encoding
label_encoded_df = df[label_encoding_features].apply(label_encoder.fit_transform)


In [22]:
# Combine encoded data with the original DataFrame (dropping original encoded columns)
df_encoded = pd.concat([df.drop(columns=one_hot_encoder_features.tolist() + label_encoding_features.tolist()),
                        one_hot_encoded, label_encoded_df], axis=1)



In [23]:
df_encoded.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass,education,marital_status,occupation,relationship,native_country
0,39,77516,13,2174,0,40,7,9,4,1,1,39
1,50,83311,13,0,0,13,6,9,2,4,0,39
2,38,215646,9,0,0,40,4,11,0,6,1,39
3,53,234721,7,0,0,40,4,1,2,6,0,39
4,28,338409,13,0,0,40,4,9,2,10,5,5


In [None]:
# step 3

In [24]:
# Create new features based on the instructions

In [25]:
# Feature 1: Age Bucket
df['age_bucket'] = pd.cut(df['age'], bins=[0, 25, 35, 45, 55, 65, 100], labels=['<25', '25-35', '35-45', '45-55', '55-65', '>65'])

In [26]:
# Feature 2: Hours per Week Bucket
df['hours_per_week_bucket'] = pd.cut(df['hours_per_week'], bins=[0, 20, 40, 60, 80, 100], labels=['<20', '20-40', '40-60', '60-80', '>80'])

In [27]:
# Apply a log transformation to a skewed numerical feature (capital-gain)
df['capital_gain_log'] = df['capital_gain'].apply(lambda x: np.log(x + 1))  # Add 1 to avoid log(0)


In [28]:
# Isolation Forest to detect and remove outliers
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df['outlier'] = iso_forest.fit_predict(df.select_dtypes(include=[np.number]))

In [29]:
# Remove outliers (keeping only inliers)
df_no_outliers = df[df['outlier'] == 1].drop(columns=['outlier'])

In [30]:

# Correlation matrix for alternative analysis
corr_matrix = df_no_outliers.select_dtypes(include=['number']).corr()

In [31]:

# Display the newly created features, transformed data, and correlation matrix
df[['age', 'age_bucket', 'hours_per_week', 'hours_per_week_bucket', 'capital_gain', 'capital_gain_log']].head(), corr_matrix


(   age age_bucket  hours_per_week hours_per_week_bucket  capital_gain  \
 0   39      35-45              40                 20-40          2174   
 1   50      45-55              13                   <20             0   
 2   38      35-45              40                 20-40             0   
 3   53      45-55              40                 20-40             0   
 4   28      25-35              40                 20-40             0   
 
    capital_gain_log  
 0          7.684784  
 1          0.000000  
 2          0.000000  
 3          0.000000  
 4          0.000000  ,
                        age    fnlwgt  education_num  capital_gain  \
 age               1.000000 -0.081131       0.026841      0.057457   
 fnlwgt           -0.081131  1.000000      -0.042110     -0.017505   
 education_num     0.026841 -0.042110       1.000000      0.088089   
 capital_gain      0.057457 -0.017505       0.088089      1.000000   
 capital_loss      0.020255 -0.022668       0.051260     -0.03844

In [None]:
# step 4

In [32]:
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(contamination=0.05, random_state=42)
df['outlier'] = iso_forest.fit_predict(df.select_dtypes(include=[np.number]))

# Remove outliers
df_no_outliers = df[df['outlier'] == 1].drop(columns=['outlier'])


In [33]:
correlation_matrix = df_no_outliers.select_dtypes(include=['number']).corr()
print(correlation_matrix)


                       age    fnlwgt  education_num  capital_gain  \
age               1.000000 -0.081129       0.026878      0.057395   
fnlwgt           -0.081129  1.000000      -0.042128     -0.017581   
education_num     0.026878 -0.042128       1.000000      0.088408   
capital_gain      0.057395 -0.017581       0.088408      1.000000   
capital_loss      0.020260 -0.022667       0.051249     -0.038424   
hours_per_week    0.082586 -0.017293       0.131428      0.050879   
capital_gain_log  0.049950 -0.019929       0.073062      0.903731   

                  capital_loss  hours_per_week  capital_gain_log  
age                   0.020260        0.082586          0.049950  
fnlwgt               -0.022667       -0.017293         -0.019929  
education_num         0.051249        0.131428          0.073062  
capital_gain         -0.038424        0.050879          0.903731  
capital_loss          1.000000        0.020485         -0.043941  
hours_per_week        0.020485        1.00000

In [34]:
!pip install ppscore

Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas<2.0.0,>=1.0.0 (from ppscore)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: ppscore
  Building wheel for ppscore (setup.py) ... [?25l[?25hdone
  Created wheel for ppscore: filename=ppscore-1.3.0-py2.py3-none-any.whl size=13166 sha256=073c6a239b7e21372c48b3beb5427d77296c4df25ec5551921b112730d4656d7
  Stored in directory: /root/.cache/pip/wheels/7e/45/08/bb444e1bca6a2bc3795707de9edd87ec1976dd8b3570fa7abf
Successfully built ppscore
Installing collected packages: pandas, ppscore
  Attempting uninstall: pandas
    Found existing installation: pandas

In [35]:
import ppscore as pps


In [36]:
pps_matrix = pps.matrix(df_no_outliers)
print(pps_matrix[['x', 'y', 'ppscore']])



                    x                      y   ppscore
0                 age                    age  1.000000
1                 age              workclass  0.015215
2                 age                 fnlwgt  0.000000
3                 age              education  0.059009
4                 age          education_num  0.000000
..                ...                    ...       ...
319  capital_gain_log         native_country  0.000000
320  capital_gain_log                 income  0.196718
321  capital_gain_log             age_bucket  0.000000
322  capital_gain_log  hours_per_week_bucket  0.013580
323  capital_gain_log       capital_gain_log  1.000000

[324 rows x 3 columns]




In [37]:
numerical_df = df_no_outliers.select_dtypes(include=['number'])

In [38]:
correlation_matrix = numerical_df.corr()
print(correlation_matrix)

                       age    fnlwgt  education_num  capital_gain  \
age               1.000000 -0.081129       0.026878      0.057395   
fnlwgt           -0.081129  1.000000      -0.042128     -0.017581   
education_num     0.026878 -0.042128       1.000000      0.088408   
capital_gain      0.057395 -0.017581       0.088408      1.000000   
capital_loss      0.020260 -0.022667       0.051249     -0.038424   
hours_per_week    0.082586 -0.017293       0.131428      0.050879   
capital_gain_log  0.049950 -0.019929       0.073062      0.903731   

                  capital_loss  hours_per_week  capital_gain_log  
age                   0.020260        0.082586          0.049950  
fnlwgt               -0.022667       -0.017293         -0.019929  
education_num         0.051249        0.131428          0.073062  
capital_gain         -0.038424        0.050879          0.903731  
capital_loss          1.000000        0.020485         -0.043941  
hours_per_week        0.020485        1.00000