1. Data Exploration and Preprocessing:

In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

In [23]:
# Load dataset
df = pd.read_csv("adult_with_headers.csv")

# Replace '?' with NaN for proper missing value handling
df.replace('?', np.nan, inplace=True)

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)

# Handle missing values: Fill categorical NaNs with most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# Handle missing values: Fill numerical NaNs with median
num_imputer = SimpleImputer(strategy='median')
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols] = num_imputer.fit_transform(df[numerical_cols])



Missing values per column:
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [25]:
# Scaling numerical features
scaler_standard = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[numerical_cols] = scaler_standard.fit_transform(df_standard_scaled[numerical_cols])

scaler_minmax = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[numerical_cols] = scaler_minmax.fit_transform(df_minmax_scaled[numerical_cols])


In [27]:
# Encoding categorical variables
one_hot_cols = [col for col in categorical_cols if df[col].nunique() <= 5]
label_encoded_cols = [col for col in categorical_cols if df[col].nunique() > 5]

In [29]:
# One-Hot Encoding
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
one_hot_encoded = pd.DataFrame(one_hot_encoder.fit_transform(df[one_hot_cols]))
one_hot_encoded.columns = one_hot_encoder.get_feature_names_out(one_hot_cols)
df = df.drop(columns=one_hot_cols).reset_index(drop=True)
df = pd.concat([df, one_hot_encoded], axis=1)

In [31]:
# Label Encoding
label_encoder = LabelEncoder()
for col in label_encoded_cols:
    df[col] = label_encoder.fit_transform(df[col])

In [33]:
# Final dataset after preprocessing
print("Processed Data Sample:\n", df.head())

Processed Data Sample:
     age  workclass    fnlwgt  education  education_num  marital_status  \
0  39.0          7   77516.0          9           13.0               4   
1  50.0          6   83311.0          9           13.0               2   
2  38.0          4  215646.0         11            9.0               0   
3  53.0          4  234721.0          1            7.0               2   
4  28.0          4  338409.0          9           13.0               2   

   occupation  relationship  capital_gain  capital_loss  hours_per_week  \
0           1             1        2174.0           0.0            40.0   
1           4             0           0.0           0.0            13.0   
2           6             1           0.0           0.0            40.0   
3           6             0           0.0           0.0            40.0   
4          10             5           0.0           0.0            40.0   

   native_country  race_ Asian-Pac-Islander  race_ Black  race_ Other  \
0      