In [1]:
import pandas as pd # for data manipulation
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# for preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("adult.csv")

In [3]:
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)
print("Initial Data Info after cleaning missing values:")
df.info()

Initial Data Info after cleaning missing values:
<class 'pandas.core.frame.DataFrame'>
Index: 45222 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              45222 non-null  int64 
 1   workclass        45222 non-null  object
 2   fnlwgt           45222 non-null  int64 
 3   education        45222 non-null  object
 4   educational-num  45222 non-null  int64 
 5   marital-status   45222 non-null  object
 6   occupation       45222 non-null  object
 7   relationship     45222 non-null  object
 8   race             45222 non-null  object
 9   gender           45222 non-null  object
 10  capital-gain     45222 non-null  int64 
 11  capital-loss     45222 non-null  int64 
 12  hours-per-week   45222 non-null  int64 
 13  native-country   45222 non-null  object
 14  income           45222 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


In [4]:
numerical_features = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country'] # education is ordinal but education-num is present
target_feature = 'income'

# Separate features and target
X = df.drop(target_feature, axis=1) # axis=1 means look for the column, 0 will look for row
y = df[target_feature]

# Features vs. Target
# By "dropping" the target feature, you are creating a version of the dataset that contains everything except the answer key.

# If you do not drop the target feature from X, your model will have the "answer" inside its input data. This is called Data Leakage.

# The Result: The model would achieve 100% accuracy during training because it is simply looking at the income column to predict income, but it would be completely useless when trying to predict data where the income is unknown.

In [5]:
# Target variable 'income' is binary, so LabelEncoder is appropriate
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("\nTarget Variable Mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

# We'll use educational-num instead of text 'education' as it's already encoded ordinally.
categorical_features.remove('education') 


Target Variable Mapping: {'<=50K': np.int64(0), '>50K': np.int64(1)}


In [6]:
# Use ColumnTransformer to apply different preprocessing steps to different columns

# Create the preprocessing pipelines for numerical and categorical data
# Ensure categorical columns are NOT in numerical_features list.
# Set sparse_output=False in OneHotEncoder to prevent sparse matrix errors.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough' # Keep other columns as they are (none in this case)
)

# Apply the preprocessing
X_processed = preprocessor.fit_transform(X)
print(f"\nShape of original data (rows, cols): {X.shape}")
print(f"Shape of processed data (rows, cols): {X_processed.shape}")


Shape of original data (rows, cols): (45222, 14)
Shape of processed data (rows, cols): (45222, 89)


In [7]:
print("--- Readiness Comparison (Numerical Features) ---")
# Before Scaling - Using .agg to avoid KeyError
stats_before = df[numerical_features].agg(['mean', 'std']).T
print("Before Scaling (Raw Data):\n", stats_before)

X_processed_array = preprocessor.fit_transform(X)

# After Scaling
# Reconstruct a temporary DF of the scaled numerical part to check stats
X_scaled_num = pd.DataFrame(X_processed_array[:, :len(numerical_features)], columns=numerical_features)
stats_after = X_scaled_num.agg(['mean', 'std']).T
print("\nAfter Scaling (StandardScaler):\n", stats_after.round(2)) 
# Stats should now show Mean ≈ 0 and Std ≈ 1

--- Readiness Comparison (Numerical Features) ---
Before Scaling (Raw Data):
                           mean            std
age                  38.547941      13.217870
fnlwgt           189734.734311  105639.195134
educational-num      10.118460       2.552881
capital-gain       1101.430344    7506.430084
capital-loss         88.595418     404.956092
hours-per-week       40.938017      12.007508

After Scaling (StandardScaler):
                  mean  std
age              -0.0  1.0
fnlwgt            0.0  1.0
educational-num   0.0  1.0
capital-gain     -0.0  1.0
capital-loss      0.0  1.0
hours-per-week    0.0  1.0


In [8]:
print("\n--- Impact of Scaling on ML Algorithms ---")
print("*   **Distance-based algorithms (KNN, SVM, K-Means)**: Highly affected; scaling ensures all features contribute equally to distance calculations.")
print("*   **Gradient Descent-based algorithms (Logistic Regression, Neural Networks)**: Highly affected; scaling leads to faster convergence and numerical stability.")
print("*   **Tree-based algorithms (Decision Trees, Random Forests, XGBoost)**: Generally invariant to scaling as they rely on split points and relative order, not magnitude.")


--- Impact of Scaling on ML Algorithms ---
*   **Distance-based algorithms (KNN, SVM, K-Means)**: Highly affected; scaling ensures all features contribute equally to distance calculations.
*   **Gradient Descent-based algorithms (Logistic Regression, Neural Networks)**: Highly affected; scaling leads to faster convergence and numerical stability.
*   **Tree-based algorithms (Decision Trees, Random Forests, XGBoost)**: Generally invariant to scaling as they rely on split points and relative order, not magnitude.


In [9]:
# Convert the processed NumPy array X_processed back to a pandas DataFrame for saving with meaningful column names
# Get feature names after one-hot encoding
final_feature_names = preprocessor.get_feature_names_out()

processed_df = pd.DataFrame(
    X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed, 
    columns=final_feature_names
)
processed_df[target_feature] = y_encoded

output_filename = 'adult_processed.csv'
processed_df.to_csv(output_filename, index=False)
print(f"\nSuccessfully saved processed dataset to '{output_filename}'")


Successfully saved processed dataset to 'adult_processed.csv'
