In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler,LabelEncoder,OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [89]:
df = pd.read_csv("adult_with_headers.csv")

In [90]:
df.shape

(32561, 15)

In [91]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [92]:
df.duplicated().sum()

24

In [93]:
df.drop_duplicates(inplace=True,ignore_index=True)

In [94]:
df.nunique()

Unnamed: 0,0
age,73
workclass,9
fnlwgt,21648
education,16
education_num,16
marital_status,7
occupation,15
relationship,6
race,5
sex,2


In [95]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [96]:
df['capital_diff'] = df['capital_gain'] - df['capital_loss']
df['age_group'] = pd.cut(df['age'],
                         bins=[0, 25, 45, 65, 100],
                         labels=['Young', 'Adult', 'Senior', 'Old'])
df['work_hour_level'] = pd.cut(df['hours_per_week'],
                               bins=[0, 20, 40, 60, 100],
                               labels=['Low', 'Medium', 'High', 'Very High'])
df['is_married'] = df['marital_status'].apply(lambda x: 1 if 'Married' in x else 0)

In [97]:
df.drop(df[['capital_gain','capital_loss','hours_per_week','marital_status','age']],axis=1,inplace=True)

In [98]:
df.head()

Unnamed: 0,workclass,fnlwgt,education,education_num,occupation,relationship,race,sex,native_country,income,capital_diff,age_group,work_hour_level,is_married
0,State-gov,77516,Bachelors,13,Adm-clerical,Not-in-family,White,Male,United-States,<=50K,2174,Adult,Medium,0
1,Self-emp-not-inc,83311,Bachelors,13,Exec-managerial,Husband,White,Male,United-States,<=50K,0,Senior,Low,1
2,Private,215646,HS-grad,9,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K,0,Adult,Medium,0
3,Private,234721,11th,7,Handlers-cleaners,Husband,Black,Male,United-States,<=50K,0,Senior,Medium,1
4,Private,338409,Bachelors,13,Prof-specialty,Wife,Black,Female,Cuba,<=50K,0,Adult,Medium,1


In [99]:
df.workclass.value_counts()

Unnamed: 0,workclass
Private,22673
Self-emp-not-inc,2540
Local-gov,2093
?,1836
State-gov,1298
Self-emp-inc,1116
Federal-gov,960
Without-pay,14
Never-worked,7


In [100]:
df['workclass']=df['workclass'].str.strip()
df['workclass'].replace('?',np.nan,inplace=True)
df.dropna(subset=['workclass'],inplace=True)

In [101]:
df['occupation']=df['occupation'].str.strip()
df['occupation'].replace('?',np.nan,inplace=True)
df.dropna(subset=['occupation'],inplace=True)

In [102]:
df.relationship.value_counts()

Unnamed: 0,relationship
Husband,12698
Not-in-family,7852
Own-child,4521
Unmarried,3270
Wife,1435
Other-relative,918


In [103]:
df.duplicated().sum()

69

In [104]:
df.drop_duplicates(inplace=True,ignore_index=True)

In [105]:
label = LabelEncoder()

In [106]:
df.columns = df.columns.str.strip()

In [107]:
(df['age_group'])=label.fit_transform(df['age_group'])
(df['race'])=label.fit_transform(df['race'])
(df['work_hour_level'])=label.fit_transform(df['work_hour_level'])
(df['sex'])=label.fit_transform(df['sex'])
(df['income'])=label.fit_transform(df['income'])

In [108]:
df.head()

Unnamed: 0,workclass,fnlwgt,education,education_num,occupation,relationship,race,sex,native_country,income,capital_diff,age_group,work_hour_level,is_married
0,State-gov,77516,Bachelors,13,Adm-clerical,Not-in-family,4,1,United-States,0,2174,0,2,0
1,Self-emp-not-inc,83311,Bachelors,13,Exec-managerial,Husband,4,1,United-States,0,0,2,1,1
2,Private,215646,HS-grad,9,Handlers-cleaners,Not-in-family,4,1,United-States,0,0,0,2,0
3,Private,234721,11th,7,Handlers-cleaners,Husband,2,1,United-States,0,0,2,2,1
4,Private,338409,Bachelors,13,Prof-specialty,Wife,2,0,Cuba,0,0,0,2,1


In [109]:
df['workclass'].drop_duplicates(inplace=True)

In [110]:
df=pd.concat([df.drop('workclass',axis=1), pd.get_dummies(df['workclass'],drop_first=True,sparse=True,dtype=int)],axis=1)
df=pd.concat([df.drop('education',axis=1), pd.get_dummies(df['education'],drop_first=True,sparse=True,dtype=int)],axis=1)
df=pd.concat([df.drop('education_num',axis=1), pd.get_dummies(df['education_num'],drop_first=True,sparse=True,dtype=int)],axis=1)
df=pd.concat([df.drop('relationship',axis=1), pd.get_dummies(df['relationship'],drop_first=True,sparse=True,dtype=int)],axis=1)
df=pd.concat([df.drop('native_country',axis=1), pd.get_dummies(df['native_country'],drop_first=True,sparse=True,dtype=int)],axis=1)

In [111]:
df=pd.concat([df.drop('occupation',axis=1), pd.get_dummies(df['occupation'],drop_first=True,sparse=True,dtype=int)],axis=1)

In [112]:
df.head()

Unnamed: 0,fnlwgt,race,sex,income,capital_diff,age_group,work_hour_level,is_married,Local-gov,Private,...,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving
0,77516,4,1,0,2174,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,83311,4,1,0,0,2,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,215646,4,1,0,0,0,2,0,0,1,...,0,1,0,0,0,0,0,0,0,0
3,234721,2,1,0,0,2,2,1,0,1,...,0,1,0,0,0,0,0,0,0,0
4,338409,2,0,0,0,0,2,1,0,1,...,0,0,0,0,0,1,0,0,0,0


In [113]:
df.shape

(30625, 103)

In [114]:
minax = MinMaxScaler()

In [115]:
df[['fnlwgt','capital_diff','race','work_hour_level','age_group']]=minax.fit_transform(df[['fnlwgt','age_group','capital_diff','race','work_hour_level']]).round(3)

In [116]:
df.head()

Unnamed: 0,fnlwgt,race,sex,income,capital_diff,age_group,work_hour_level,is_married,Local-gov,Private,...,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving
0,0.043,0.063,1,0,0.0,0.667,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.047,0.042,1,0,0.667,0.333,1.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.137,0.042,1,0,0.0,0.667,1.0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
3,0.15,0.042,1,0,0.667,0.667,0.5,1,0,1,...,0,1,0,0,0,0,0,0,0,0
4,0.221,0.042,0,0,0.0,0.667,0.5,1,0,1,...,0,0,0,0,0,1,0,0,0,0


In [117]:
import ppscore as pps

In [118]:
pps_score = pps.predictors(df,y='income')
pps_score = pps_score.sort_values('ppscore',ascending=False)
print(pps_score.head(10))

             x       y  ppscore        case  is_valid_score  \
0       fnlwgt  income        0  regression            True   
64        Hong  income        0  regression            True   
74   Nicaragua  income        0  regression            True   
73      Mexico  income        0  regression            True   
72        Laos  income        0  regression            True   
71       Japan  income        0  regression            True   
70     Jamaica  income        0  regression            True   
69       Italy  income        0  regression            True   
68     Ireland  income        0  regression            True   
67        Iran  income        0  regression            True   

                 metric  baseline_score  model_score                    model  
0   mean absolute error          0.2558     0.380828  DecisionTreeRegressor()  
64  mean absolute error          0.2558     0.380713  DecisionTreeRegressor()  
74  mean absolute error          0.2558     0.380798  DecisionTree

In [123]:
top_future = pps_score[pps_score['ppscore']>0.05]

In [120]:
future_df = df[top_future]
target_df = df['income']

In [121]:
x_train,x_test,y_train,y_test = train_test_split(future_df,target_df,test_size=0.80,random_state=150)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(6125, 103)
(24500, 103)
(6125,)
(24500,)


In [122]:
x_train.columns = x_train.columns.astype(str)
x_test.columns = x_test.columns.astype(str)
model = RandomForestClassifier()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
accuracy_score(y_test,y_pred)

0.7480816326530613

###### Standard Scaling (Z-score)

Best when data follows a normal distribution or algorithms assume Gaussian distribution.

Used in Linear Regression, Logistic Regression, SVM, PCA, Neural Networks.

Handles outliers better than Min-Max (doesn’t squash everything to 0–1).

Pros: Robust when features have very different scales.

Cons: Outliers can distort mean/variance, but effect is less severe than with Min-Max.

###### Min-Max Scaling

Best when you need features strictly bounded in a range (e.g., [0, 1]).

Often used in Neural Networks (sigmoid/tanh activations), KNN, Clustering.

Sensitive to outliers (a single extreme value can stretch the scale).

Pros: Keeps relative relationships between values.

Cons: Very sensitive to outliers → one extreme value can compress everything else into a narrow range.