1. Data Exploration and Preprocessing:
Load the dataset and conduct basic data exploration (summary statistics, missing values, data types).
Handle missing values as per the best practices (imputation, removal, etc.).
Apply scaling techniques to numerical features:
Standard Scaling
Min-Max Scaling
Discuss the scenarios where each scaling technique is preferred and why.


In [40]:
import pandas as pd

In [41]:
data=pd.read_csv("adult_with_headers.csv")
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [42]:
#dropping irrelavant column 
data.drop(columns=["fnlwgt","capital_gain","capital_loss"],inplace=True)
data

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,<=50K


In [43]:
data.describe()

Unnamed: 0,age,education_num,hours_per_week
count,32561.0,32561.0,32561.0
mean,38.581647,10.080679,40.437456
std,13.640433,2.57272,12.347429
min,17.0,1.0,1.0
25%,28.0,9.0,40.0
50%,37.0,10.0,40.0
75%,48.0,12.0,45.0
max,90.0,16.0,99.0


In [44]:
# verifying missing values 
data.isnull().sum()

age               0
workclass         0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [45]:
#verifying duplicated values 
data.duplicated().sum()

4243

In [46]:
#dropping duplicated values 
data.drop_duplicates(inplace=True)
data

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...
32554,53,Private,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,>50K
32555,22,Private,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,40,United-States,<=50K
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K


In [47]:
data.duplicated().sum()

0

In [48]:
#identify categorical column 
categorical_col=["workclass" ,"education","occupation"]

In [49]:
# Apply One-Hot Encoding to categorical variables with less than 5 categories
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import warnings 
warnings.filterwarnings("ignore")
one_hot_enc=OneHotEncoder(sparse=False,drop='first')

In [50]:
encoded=one_hot_enc.fit_transform(data[categorical_col])
encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [51]:
# create the dataframe of encoded 
data_encoded=pd.DataFrame(encoded,columns=one_hot_enc.get_feature_names_out(categorical_col))
data_encoded

Unnamed: 0,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 11th,education_ 12th,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28313,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28314,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
28315,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
28316,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# concate the encoded data into original data
data=pd.concat([data.drop(columns=categorical_col),data_encoded],axis=1)
data

Unnamed: 0,age,education_num,marital_status,relationship,race,sex,hours_per_week,native_country,income,workclass_ Federal-gov,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,39.0,13.0,Never-married,Not-in-family,White,Male,40.0,United-States,<=50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,50.0,13.0,Married-civ-spouse,Husband,White,Male,13.0,United-States,<=50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,38.0,9.0,Divorced,Not-in-family,White,Male,40.0,United-States,<=50K,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,53.0,7.0,Married-civ-spouse,Husband,Black,Male,40.0,United-States,<=50K,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28.0,13.0,Married-civ-spouse,Wife,Black,Female,40.0,Cuba,<=50K,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28309,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28311,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
28312,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
28315,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [53]:
#Use Label Encoding for categorical variables with more than 5 categories.
lab_enc=LabelEncoder()

In [54]:
data["age"]=lab_enc.fit_transform(data["age"])
data["education_num"]=lab_enc.fit_transform(data["education_num"])
data["marital_status"]=lab_enc.fit_transform(data["marital_status"])
data["relationship"]=lab_enc.fit_transform(data["relationship"])
data["hours_per_week"]=lab_enc.fit_transform(data["hours_per_week"])
data["race"]=lab_enc.fit_transform(data["race"])
data["sex"]=lab_enc.fit_transform(data["sex"])
data["income"]=lab_enc.fit_transform(data["income"])
data["native_country"]=lab_enc.fit_transform(data["native_country"])
data

Unnamed: 0,age,education_num,marital_status,relationship,race,sex,hours_per_week,native_country,income,workclass_ Federal-gov,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,22,12,4,1,4,1,39,39,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,33,12,2,0,4,1,12,39,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,21,8,0,1,4,1,39,39,0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,36,6,2,0,2,1,39,39,0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,12,2,5,2,0,39,5,0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28309,73,16,7,6,5,2,94,42,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28311,73,16,7,6,5,2,94,42,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
28312,73,16,7,6,5,2,94,42,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
28315,73,16,7,6,5,2,94,42,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [55]:
#Apply scaling techniques to numerical features:
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [56]:
# initializing the scaler 
sta_sca=StandardScaler()
min_max=MinMaxScaler()

In [57]:
# identitfy how many numerical column are there 
numerical_col=data.select_dtypes(include=["float64","int64"]).columns
numerical_col

Index(['age', 'education_num', 'hours_per_week', 'workclass_ Federal-gov',
       'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc',
       'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 11th',
       'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th',
       'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm',
       'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate',
       'education_ HS-grad', 'education_ Masters', 'education_ Preschool',
       'education_ Prof-school', 'education_ Some-college',
       'occupation_ Adm-clerical', 'occupation_ Armed-Forces',
       'occupation_ Craft-repair', 'occupation_ Exec-managerial',
       'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners',
       'occupation_ Machine-op-inspct', 'occupation_ Other-service',
       'occupation_ Priv-house-serv', 'occupation_ Prof-specialty',
       'occ

In [58]:
#Fit the scaler to the numerical data and transform it
scaled_column=sta_sca.fit_transform(data[numerical_col])
scaled_column

array([[-0.27943738,  0.66020542, -0.30856416, ..., -0.35522966,
        -0.17687002, -0.22950952],
       [ 0.25993093,  0.66020542, -1.61547584, ..., -0.35522966,
        -0.17687002, -0.22950952],
       [-0.32847087, -0.55302314, -0.30856416, ..., -0.35522966,
        -0.17687002, -0.22950952],
       ...,
       [ 2.22127026,  1.87343399,  2.35366332, ..., -0.35522966,
         5.65386945, -0.22950952],
       [ 2.22127026,  1.87343399,  2.35366332, ..., -0.35522966,
         5.65386945, -0.22950952],
       [ 2.22127026,  1.87343399,  2.35366332, ..., -0.35522966,
        -0.17687002, -0.22950952]])

In [59]:
# create a dataframe of this scaler_column
scaled_df=pd.DataFrame(scaled_column,columns=numerical_col)
scaled_df

Unnamed: 0,age,education_num,hours_per_week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,-0.279437,0.660205,-0.308564,-0.184887,-0.27604,-0.015724,-1.422267,-0.198549,-0.307821,4.624475,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
1,0.259931,0.660205,-1.615476,-0.184887,-0.27604,-0.015724,-1.422267,-0.198549,3.248638,-0.216241,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
2,-0.328471,-0.553023,-0.308564,-0.184887,-0.27604,-0.015724,0.703103,-0.198549,-0.307821,-0.216241,...,-0.187122,4.842957,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
3,0.407031,-1.159637,-0.308564,-0.184887,-0.27604,-0.015724,0.703103,-0.198549,-0.307821,-0.216241,...,-0.187122,4.842957,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
4,-0.818806,0.660205,-0.308564,-0.184887,-0.27604,-0.015724,0.703103,-0.198549,-0.307821,-0.216241,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,2.554874,-0.150352,-0.35523,-0.176870,-0.22951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31723,2.221270,1.873434,2.353663,-0.184887,-0.27604,-0.015724,-1.422267,-0.198549,3.248638,-0.216241,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
31724,2.221270,1.873434,2.353663,-0.184887,-0.27604,-0.015724,0.703103,-0.198549,-0.307821,-0.216241,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,2.81508,-0.176870,-0.22951
31725,2.221270,1.873434,2.353663,-0.184887,-0.27604,-0.015724,0.703103,-0.198549,-0.307821,-0.216241,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,5.653869,-0.22951
31726,2.221270,1.873434,2.353663,-0.184887,-0.27604,-0.015724,0.703103,-0.198549,-0.307821,-0.216241,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,5.653869,-0.22951


In [60]:
#drop the original numerical_col and concanate this scaled_df into org datasets
data.drop(columns=numerical_col,inplace=True)
data

Unnamed: 0,marital_status,relationship,race,sex,native_country,income
0,4,1,4,1,39,0
1,2,0,4,1,39,0
2,0,1,4,1,39,0
3,2,0,2,1,39,0
4,2,5,2,0,5,0
...,...,...,...,...,...,...
28309,7,6,5,2,42,2
28311,7,6,5,2,42,2
28312,7,6,5,2,42,2
28315,7,6,5,2,42,2


In [61]:
data=pd.concat([data,scaled_df],axis=1)
data

Unnamed: 0,marital_status,relationship,race,sex,native_country,income,age,education_num,hours_per_week,workclass_ Federal-gov,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,4.0,1.0,4.0,1.0,39.0,0.0,-0.279437,0.660205,-0.308564,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
1,2.0,0.0,4.0,1.0,39.0,0.0,0.259931,0.660205,-1.615476,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
2,0.0,1.0,4.0,1.0,39.0,0.0,-0.328471,-0.553023,-0.308564,-0.184887,...,-0.187122,4.842957,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
3,2.0,0.0,2.0,1.0,39.0,0.0,0.407031,-1.159637,-0.308564,-0.184887,...,-0.187122,4.842957,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
4,2.0,5.0,2.0,0.0,5.0,0.0,-0.818806,0.660205,-0.308564,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,2.554874,-0.150352,-0.35523,-0.176870,-0.22951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31704,,,,,,,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,2.81508,-0.176870,-0.22951
31708,,,,,,,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,4.018959,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
31720,,,,,,,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,-0.248821,2.920319,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
31725,,,,,,,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,5.653869,-0.22951


In [62]:
# filling nan values with [0]
cleaned_data=data.fillna(0)
cleaned_data

Unnamed: 0,marital_status,relationship,race,sex,native_country,income,age,education_num,hours_per_week,workclass_ Federal-gov,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,4.0,1.0,4.0,1.0,39.0,0.0,-0.279437,0.660205,-0.308564,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
1,2.0,0.0,4.0,1.0,39.0,0.0,0.259931,0.660205,-1.615476,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
2,0.0,1.0,4.0,1.0,39.0,0.0,-0.328471,-0.553023,-0.308564,-0.184887,...,-0.187122,4.842957,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
3,2.0,0.0,2.0,1.0,39.0,0.0,0.407031,-1.159637,-0.308564,-0.184887,...,-0.187122,4.842957,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
4,2.0,5.0,2.0,0.0,5.0,0.0,-0.818806,0.660205,-0.308564,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,2.554874,-0.150352,-0.35523,-0.176870,-0.22951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31704,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,2.81508,-0.176870,-0.22951
31708,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,4.018959,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
31720,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,-0.248821,2.920319,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
31725,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,5.653869,-0.22951


In [63]:
data=pd.concat([data,cleaned_data],axis=0)
data

Unnamed: 0,marital_status,relationship,race,sex,native_country,income,age,education_num,hours_per_week,workclass_ Federal-gov,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,4.0,1.0,4.0,1.0,39.0,0.0,-0.279437,0.660205,-0.308564,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
1,2.0,0.0,4.0,1.0,39.0,0.0,0.259931,0.660205,-1.615476,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
2,0.0,1.0,4.0,1.0,39.0,0.0,-0.328471,-0.553023,-0.308564,-0.184887,...,-0.187122,4.842957,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
3,2.0,0.0,2.0,1.0,39.0,0.0,0.407031,-1.159637,-0.308564,-0.184887,...,-0.187122,4.842957,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
4,2.0,5.0,2.0,0.0,5.0,0.0,-0.818806,0.660205,-0.308564,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,2.554874,-0.150352,-0.35523,-0.176870,-0.22951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31704,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,2.81508,-0.176870,-0.22951
31708,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,4.018959,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
31720,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,-0.248821,2.920319,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951
31725,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.187122,-0.206485,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,5.653869,-0.22951


Feature Engineering:
Create at least 2 new features that could be beneficial for the model. Explain the rationale behind your choices.
Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice.


In [64]:
#feature1
data['Total_income']=data["income"]*data["hours_per_week"]
# rationale 
print("Total income provides a comprehensive measure of an individual's earning capacity.It can help the model understand the overall financial status of individuals more accurately.")

Total income provides a comprehensive measure of an individual's earning capacity.It can help the model understand the overall financial status of individuals more accurately.


In [65]:
# feature2 
def categorize_exp(age):
    if age <30 :
        return "Junior"
    elif 30<= age <50:
        "Mid_level"
    else:
        "Senior"
        
data["Experience_level"]=data['age'].apply(categorize_exp)
data

Unnamed: 0,marital_status,relationship,race,sex,native_country,income,age,education_num,hours_per_week,workclass_ Federal-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,Total_income,Experience_level
0,4.0,1.0,4.0,1.0,39.0,0.0,-0.279437,0.660205,-0.308564,-0.184887,...,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951,-0.0,Junior
1,2.0,0.0,4.0,1.0,39.0,0.0,0.259931,0.660205,-1.615476,-0.184887,...,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951,-0.0,Junior
2,0.0,1.0,4.0,1.0,39.0,0.0,-0.328471,-0.553023,-0.308564,-0.184887,...,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951,-0.0,Junior
3,2.0,0.0,2.0,1.0,39.0,0.0,0.407031,-1.159637,-0.308564,-0.184887,...,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951,-0.0,Junior
4,2.0,5.0,2.0,0.0,5.0,0.0,-0.818806,0.660205,-0.308564,-0.184887,...,-0.248821,-0.342428,-0.072237,2.554874,-0.150352,-0.35523,-0.176870,-0.22951,-0.0,Junior
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31704,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,2.81508,-0.176870,-0.22951,0.0,Junior
31708,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,4.018959,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951,0.0,Junior
31720,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.248821,2.920319,-0.072237,-0.391409,-0.150352,-0.35523,-0.176870,-0.22951,0.0,Junior
31725,0.0,0.0,0.0,0.0,0.0,0.0,2.221270,1.873434,2.353663,-0.184887,...,-0.248821,-0.342428,-0.072237,-0.391409,-0.150352,-0.35523,5.653869,-0.22951,0.0,Junior


In [66]:
print("Experience level provides insight into an individual's career stage and potential earning capacity.")

Experience level provides insight into an individual's career stage and potential earning capacity.


In [67]:
#Feature Selection:
#Use the Isolation Forest algorithm to identify and remove outliers. Discuss how outliers can affect model performance.
#Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.


In [68]:
from sklearn.ensemble import IsolationForest 

In [69]:
# taking only numerical data
num_data=data.select_dtypes(include=["number"])

In [70]:
model=IsolationForest()

In [71]:
num_data.dropna(inplace=True)

In [72]:
#fit the model 
model.fit(num_data)

In [73]:
# predict the outliers 
outliers=model.predict(num_data)

In [74]:
# drop the outliers
data=num_data[outliers==1]