In [9]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Read the raw census extract; the path is relative to the working directory.
df = pd.read_csv("adult_with_headers (1).csv")

# Normalise every header: trim surrounding whitespace, lower-case it and turn
# hyphens into underscores, so later steps can use names such as
# `capital_gain` and `hours_per_week`.
df.columns = [
    header.strip().lower().replace('-', '_')
    for header in df.columns
]

print("Dataset Loaded Successfully")
print(f"Shape: {df.shape}")

## BASIC DATA EXPLORATION

print("\n--- Dataset Info ---")
df.info()

print("\n--- Summary Statistics ---")
print(df.describe())

# "?" is treated as the missing-value marker and converted to NaN so the
# imputation step can see it. The pattern is anchored and tolerates
# surrounding whitespace: an exact comparison against "?" silently misses
# padded cells such as " ?", which would then survive as an ordinary
# category and never be imputed. Regex replacement only inspects string
# cells, so numeric columns are left untouched.
df.replace(r"^\s*\?\s*$", np.nan, regex=True, inplace=True)

print("\n--- Missing Values Before Imputation ---")
print(df.isnull().sum())

## HANDLE MISSING VALUES

# Partition the columns by dtype; each group gets its own imputation strategy.
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Numerical → Median Imputation
# (fit learns the per-column medians, transform fills the gaps with them)
num_imputer = SimpleImputer(strategy='median').fit(df[num_cols])
df[num_cols] = num_imputer.transform(df[num_cols])

# Categorical → Mode Imputation
# (fit learns the most frequent value per column)
cat_imputer = SimpleImputer(strategy='most_frequent').fit(df[cat_cols])
df[cat_cols] = cat_imputer.transform(df[cat_cols])

print("\n--- Missing Values After Imputation ---")
remaining_missing = df.isnull().sum()
print(remaining_missing)

## SCALING TECHNIQUES

# Each scaler is fitted on the numeric columns of `df` and its output is
# written into a separate copy, so `df` itself stays unscaled.

# Standard Scaling
standard_scaler = StandardScaler().fit(df[num_cols])
df_standard_scaled = df.copy()
df_standard_scaled[num_cols] = standard_scaler.transform(df[num_cols])

# Min-Max Scaling
minmax_scaler = MinMaxScaler().fit(df[num_cols])
df_minmax_scaled = df.copy()
df_minmax_scaled[num_cols] = minmax_scaler.transform(df[num_cols])

print("\nScaling applied successfully")

## ENCODING TECHNIQUES

# Count distinct values once per categorical column, then split the columns
# at a threshold of 5 distinct values.
cardinality = {col: df[col].nunique() for col in cat_cols}
low_cardinality = [col for col, n_unique in cardinality.items() if n_unique < 5]
high_cardinality = [col for col, n_unique in cardinality.items() if n_unique >= 5]

print("\nLow Cardinality Columns:", low_cardinality)
print("High Cardinality Columns:", high_cardinality)

# One-Hot Encoding (Low Cardinality)
# drop_first=True drops one indicator per feature to avoid a redundant column.
df_encoded = pd.get_dummies(df, columns=low_cardinality, drop_first=True)

# Label Encoding (High Cardinality)
# Fit a separate encoder per column and keep it in `label_encoders`.
# Refitting a single shared encoder would overwrite its `classes_` on every
# iteration, leaving no way to invert the codes or apply the same mapping to
# new data for any column but the last one.
label_encoder = LabelEncoder()  # kept so later code using this name still works
label_encoders = {}
for col in high_cardinality:
    column_encoder = LabelEncoder()
    df_encoded[col] = column_encoder.fit_transform(df_encoded[col])
    label_encoders[col] = column_encoder

print("\nEncoding completed")
print("Encoded Shape:", df_encoded.shape)

## FEATURE ENGINEERING

# Capital Balance: capital gains net of capital losses.
if 'capital_gain' in df_encoded.columns and 'capital_loss' in df_encoded.columns:
    df_encoded['capital_balance'] = (
        df_encoded['capital_gain'] - df_encoded['capital_loss']
    )

# Work Intensity
if 'hours_per_week' in df_encoded.columns:
    # Fixed integer codes: 0 = full_time (hours_per_week >= 40), 1 = part_time.
    # These match what a LabelEncoder fitted on the labels 'full_time' and
    # 'part_time' yields when both are present (classes are sorted
    # alphabetically). Assigning them explicitly keeps the meaning of each
    # code stable even if the data contains only one of the two groups, and
    # avoids refitting a shared encoder as a side effect.
    works_full_time = df_encoded['hours_per_week'] >= 40
    df_encoded['work_intensity'] = np.where(works_full_time, 0, 1).astype('int64')

print("\nFeature engineering completed")

## FEATURE TRANSFORMATION

# Log transformation for skewed feature
# log1p computes log(1 + x), so zero values map to 0 instead of -inf.
if 'capital_gain' in df_encoded:
    capital_gain = df_encoded['capital_gain']
    df_encoded['log_capital_gain'] = np.log1p(capital_gain)

print("\nLog transformation applied")

# Final report: structure first, then the first rows of the processed frame.
print("\n--- Final Dataset Info ---")
df_encoded.info()

print("\n--- Final Dataset Preview ---")
preview = df_encoded.head()
print(preview)



Dataset Loaded Successfully
Shape: (32561, 15)

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB

--- Summary Statistics 


# EXPLANATION
SCALING:
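- Standard Scaling (zero mean, unit variance) is preferred when features are
  roughly normally distributed and when algorithms that are sensitive to
  feature scale, such as Logistic Regression, SVM, or PCA, are used.
- Min-Max Scaling is preferred when values must lie in a bounded range (0–1),
  which is useful for Neural Networks and distance-based algorithms; note that
  it is sensitive to outliers, because the minimum and maximum set the range.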

ENCODING:
- One-Hot Encoding avoids ordinal bias and is best for low-cardinality features.
- Label Encoding is memory-efficient (one integer column per feature) but assigns
  arbitrary integer codes that a model may misread as an ordinal relationship.

FEATURE ENGINEERING:
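- capital_balance (capital_gain minus capital_loss) gives the net financial impact.
- work_intensity captures employment behavior by separating full-time work
  (hours_per_week of 40 or more) from part-time work.
- Log transformation (log1p applied to capital_gain) reduces skewness and
  stabilizes variance; log1p is used because it is well defined when the value is 0.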