In [2]:
import pandas as pd

In [10]:
df = pd.read_csv("adult_with_headers.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [14]:
# Check missing values
print(df.isna().sum())

# Drop rows with missing values
df_drop = df.dropna()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [16]:
#Impute numerical columns with median
num_cols = df.select_dtypes(include=['int64','float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())


In [20]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Select numerical columns
num_cols = df.select_dtypes(include=['int64','float64']).columns
print(num_cols)

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')


In [22]:
# Standard Scaling
std_scaler = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[num_cols] = std_scaler.fit_transform(df[num_cols])

df_standard_scaled.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.030671,State-gov,-1.063611,Bachelors,1.134739,Never-married,Adm-clerical,Not-in-family,White,Male,0.148453,-0.21666,-0.035429,United-States,<=50K
1,0.837109,Self-emp-not-inc,-1.008707,Bachelors,1.134739,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.14592,-0.21666,-2.222153,United-States,<=50K
2,-0.042642,Private,0.245079,HS-grad,-0.42006,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
3,1.057047,Private,0.425801,11th,-1.197459,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
4,-0.775768,Private,1.408176,Bachelors,1.134739,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.14592,-0.21666,-0.035429,Cuba,<=50K


In [24]:
# Min-Max Scaling
mm_scaler = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[num_cols] = mm_scaler.fit_transform(df[num_cols])

df_minmax_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,State-gov,0.044302,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.122449,United-States,<=50K
2,0.287671,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,0.397959,United-States,<=50K
3,0.493151,Private,0.151068,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,0.397959,United-States,<=50K
4,0.150685,Private,0.221488,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,0.397959,Cuba,<=50K


In [26]:
""" Standard Scaling:It is a preprocessing technique that transforms features so they have a mean of 0 and a standard deviation of 1. Itâ€™s often called Z-score normalization and is widely used in machine learning to ensure features contribute equally to model training
MinMax scaling: It is a technique also called as normalization rescales features to a fixed range, usually . Itâ€™s a simple but powerful technique to make features comparable when they have different units or magnitudes"""

' Standard Scaling:It is a preprocessing technique that transforms features so they have a mean of 0 and a standard deviation of 1. Itâ€™s often called Z-score normalization and is widely used in machine learning to ensure features contribute equally to model training\nMinMax scaling: It is a technique also called as normalization rescales features to a fixed range, usually . Itâ€™s a simple but powerful technique to make features comparable when they have different units or magnitudes'

In [28]:
# Identify categorical columns
cat_cols = df.select_dtypes(include=['object']).columns

# Filter columns with < 5 unique categories
small_cat_cols = [col for col in cat_cols if df[col].nunique() < 5]

print("Columns to One-Hot Encode:", small_cat_cols)

Columns to One-Hot Encode: ['sex', 'income']


In [31]:
# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=small_cat_cols, drop_first=False)

print(df_encoded.head())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race  \
0        Never-married        Adm-clerical   Not-in-family   White   
1   Married-civ-spouse     Exec-managerial         Husband   White   
2             Divorced   Handlers-cleaners   Not-in-family   White   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black   
4   Married-civ-spouse      Prof-specialty            Wife   Black   

   capital_gain  capital_loss  hours_per_week  native_country  sex_ Female  \
0          2174             0              40   United-States        False   
1             0             0         

In [33]:
from sklearn.preprocessing import LabelEncoder

In [35]:
# Initialize Label Encoder
le = LabelEncoder()

# Apply Label Encoding to each categorical column
df_label_encoded = df.copy()
for col in cat_cols:
    df_label_encoded[col] = le.fit_transform(df[col])

print("Label Encoding Completed!")
print(df_label_encoded.head())

Label Encoding Completed!
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  sex  capital_gain  capital_loss  \
0           1             1     4    1          2174             0   
1           4             0     4    1             0             0   
2           6             1     4    1             0             0   
3           6             0     2    1             0             0   
4          10             5     2    0             0             0   

   hours_per_week  native_country  income  
0              40              39       0  
1              13              39     

In [39]:
# -----------------------------
#Summary statistics (numerical)
# -----------------------------
print("ðŸ“Œ SUMMARY STATISTICS (NUMERICAL FEATURES):")
print(df.describe(), "\n")

# -----------------------------
#Summary statistics (categorical)
# -----------------------------
print("ðŸ“Œ SUMMARY STATISTICS (CATEGORICAL FEATURES):")
print(df.describe(include=['object']), "\n")

# -----------------------------
#Missing values check
# -----------------------------
print("ðŸ“Œ MISSING VALUES PER COLUMN:")
print(df.isna().sum(), "\n")
# -----------------------------
# Identify numerical & categorical columns
# -----------------------------
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

print("ðŸ“Œ NUMERICAL COLUMNS:", list(num_cols))
print("ðŸ“Œ CATEGORICAL COLUMNS:", list(cat_cols))

ðŸ“Œ SUMMARY STATISTICS (NUMERICAL FEATURES):
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000   

ðŸ“Œ SUMMARY ST

In [42]:
# Feature 1: Age Group
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 25, 45, 65, 100],
    labels=['Young', 'Adult', 'Mid-Age', 'Senior']
)

# Feature 2: Total Capital
df['total_capital'] = df['capital_gain'] + df['capital_loss']


In [44]:
import numpy as np

# Apply log1p transformation to skewed numerical features
df['log_capital_gain'] = np.log1p(df['capital_gain'])
df['log_capital_loss'] = np.log1p(df['capital_loss'])


In [45]:
"""Justification:The variables capital_gain and capital_loss are highly positively skewed, containing mostly zeros and a few extremely large values. This can distort learning in many machine learning models. Applying a log(1 + x) transformation reduces the skewness by compressing the large outliers and spreading out smaller values. This helps the model learn more balanced patterns and improves overall predictive performance"""

'Justification:The variables capital_gain and capital_loss are highly positively skewed, containing mostly zeros and a few extremely large values. This can distort learning in many machine learning models. Applying a log(1 + x) transformation reduces the skewness by compressing the large outliers and spreading out smaller values. This helps the model learn more balanced patterns and improves overall predictive performance'