In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import pandas as pd
import numpy as np

# Step 0: Load the dataset from a local CSV file
adult_df = pd.read_csv("adult_with_headers.csv")

# Placeholder strings that stand for a missing entry. Counting and replacement
# both use this one list so the two steps cannot drift apart.
missing_markers = ['?', ' ?']

# Step 1: Basic data exploration (taken on the raw frame, before any cleaning)
summary_stats = adult_df.describe(include='all')
data_types = adult_df.dtypes
missing_values = adult_df.isin(missing_markers).sum()

# Turn every placeholder into np.nan so pandas treats it as missing.
# Rebinding the name (instead of inplace=True) keeps the step explicit.
adult_df = adult_df.replace(missing_markers, np.nan)

# Step 2: Handle missing values - drop rows with any missing values
adult_df_cleaned = adult_df.dropna()

# Step 3: Collect the names of the numeric columns and slice them out once
numerical_features = adult_df_cleaned.select_dtypes(include=[np.number]).columns.tolist()
numeric_values = adult_df_cleaned[numerical_features]

# Standard (z-score) scaling; each output column gets a "_std" suffix
standard_scaler = StandardScaler()
std_names = [f"{col}_std" for col in numerical_features]
standard_scaled = pd.DataFrame(standard_scaler.fit_transform(numeric_values), columns=std_names)

# Min-max scaling; each output column gets a "_mm" suffix
minmax_scaler = MinMaxScaler()
mm_names = [f"{col}_mm" for col in numerical_features]
minmax_scaled = pd.DataFrame(minmax_scaler.fit_transform(numeric_values), columns=mm_names)

# Put original and scaled values side by side (first 5 rows). The original
# slice is re-indexed from 0 so it lines up with the freshly built frames.
comparison_parts = [numeric_values.reset_index(drop=True), standard_scaled, minmax_scaled]
scaled_comparison = pd.concat(comparison_parts, axis=1).head()

summary_stats, data_types, missing_values, scaled_comparison


(                 age workclass        fnlwgt education  education_num  \
 count   32561.000000     32561  3.256100e+04     32561   32561.000000   
 unique           NaN         9           NaN        16            NaN   
 top              NaN   Private           NaN   HS-grad            NaN   
 freq             NaN     22696           NaN     10501            NaN   
 mean       38.581647       NaN  1.897784e+05       NaN      10.080679   
 std        13.640433       NaN  1.055500e+05       NaN       2.572720   
 min        17.000000       NaN  1.228500e+04       NaN       1.000000   
 25%        28.000000       NaN  1.178270e+05       NaN       9.000000   
 50%        37.000000       NaN  1.783560e+05       NaN      10.000000   
 75%        48.000000       NaN  2.370510e+05       NaN      12.000000   
 max        90.000000       NaN  1.484705e+06       NaN      16.000000   
 
              marital_status       occupation relationship    race    sex  \
 count                 32561     

In [9]:
#Data Exploration and Preprocessing Summary
#1. Summary Statistics & Data Types
#Dataset contains 32,561 entries.

#Key numerical columns: age, fnlwgt, education_num, capital_gain, capital_loss, hours_per_week.

#Several categorical columns like workclass, education, marital_status, etc.

#2. Missing Values
#Represented as '?' in the dataset. Counts per affected column:

#workclass: 1,836 missing

#occupation: 1,843 missing

#native_country: 583 missing

#Handled by row-wise removal (dropna()), reducing the dataset to clean entries for processing.

#3. Scaling
#🔹 Standard Scaling: Transforms data to mean = 0, std = 1 using z-score.

#Preferred when:

#Data has normal distribution.

#Algorithms assume Gaussian data (e.g., Logistic Regression, SVM, Linear Regression).

#🔹 Min-Max Scaling: Scales values to [0, 1].

#Preferred when:

#Features need to be within a fixed range.

#Algorithms are distance-based or sensitive to input magnitude (e.g., KNN, Neural Networks).

#4. Comparison Table (First 5 Rows)
#We compared original, Standard Scaled, and Min-Max Scaled versions of numerical features. Sample shown for age, capital_gain, etc.

In [11]:
from sklearn.preprocessing import LabelEncoder

# Gather the object-typed columns, leaving the target ('income') out for now
categorical_cols = adult_df_cleaned.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('income')

# Number of distinct values per categorical column drives the encoding choice
cat_unique_counts = adult_df_cleaned[categorical_cols].nunique()
few_levels = cat_unique_counts.le(5)
many_levels = cat_unique_counts.gt(5)

# Columns with at most 5 distinct values: one-hot encode
one_hot_cols = cat_unique_counts.loc[few_levels].index.tolist()
one_hot_encoded = pd.get_dummies(adult_df_cleaned[one_hot_cols], prefix=one_hot_cols)

# Columns with more than 5 distinct values: label encode on a copy and keep
# each fitted encoder under its column name
label_cols = cat_unique_counts.loc[many_levels].index
label_encoded = adult_df_cleaned.copy()
label_encoders = {}
for col in label_cols:
    encoder = LabelEncoder()
    label_encoded[col] = encoder.fit_transform(label_encoded[col])
    label_encoders[col] = encoder

# First rows of both encodings, side by side, for inspection
summary_parts = [label_encoded[label_cols].head(), one_hot_encoded.head()]
encoded_summary = pd.concat(summary_parts, axis=1)

cat_unique_counts, encoded_summary


(workclass          7
 education         16
 marital_status     7
 occupation        14
 relationship       6
 race               5
 sex                2
 native_country    41
 dtype: int64,
    workclass  education  marital_status  occupation  relationship  \
 0          5          9               4           0             1   
 1          4          9               2           3             0   
 2          2         11               0           5             1   
 3          2          1               2           5             0   
 4          2          9               2           9             5   
 
    native_country  race_ Amer-Indian-Eskimo  race_ Asian-Pac-Islander  \
 0              38                     False                     False   
 1              38                     False                     False   
 2              38                     False                     False   
 3              38                     False                     False   
 4               

In [13]:
import numpy as np

# Work on a copy so the feature columns added below do not end up in adult_df
df_fe = adult_df.copy()

# New feature 1: age_group
# pd.cut uses right-closed intervals by default, so these edges produce
# (0, 29] -> Young, (29, 59] -> Middle-aged, (59, inf) -> Senior.
# For whole-number ages that means Young: <= 29, Middle-aged: 30-59, Senior: >= 60.
df_fe['age_group'] = pd.cut(df_fe['age'], bins=[0, 29, 59, np.inf], labels=['Young', 'Middle-aged', 'Senior'])

# New feature 2: capital_net
# Difference between capital_gain and capital_loss
df_fe['capital_net'] = df_fe['capital_gain'] - df_fe['capital_loss']

# Skewness of selected numerical columns, used to pick a column to transform
skewed_data = df_fe[['capital_gain', 'capital_loss', 'fnlwgt']].skew()

# Log-transform capital_gain. log1p(x) = log(1 + x) is applied to every row and
# maps 0 to 0, so zero gains need no special casing. The ufunc is called on the
# whole column at once instead of once per row through .apply.
df_fe['capital_gain_log'] = np.log1p(df_fe['capital_gain'])

# Display the new columns and skewness
df_fe[['age', 'age_group', 'capital_gain', 'capital_gain_log', 'capital_loss', 'capital_net']].head(), skewed_data


(   age    age_group  capital_gain  capital_gain_log  capital_loss  capital_net
 0   39  Middle-aged          2174          7.684784             0         2174
 1   50  Middle-aged             0          0.000000             0            0
 2   38  Middle-aged             0          0.000000             0            0
 3   53  Middle-aged             0          0.000000             0            0
 4   28        Young             0          0.000000             0            0,
 capital_gain    11.953848
 capital_loss     4.594629
 fnlwgt           1.446980
 dtype: float64)

In [15]:
from sklearn.ensemble import IsolationForest

# Numeric columns the outlier detector will look at
numerical_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
numerical_data = adult_df.loc[:, numerical_cols]

# Fit an Isolation Forest (contamination 0.01, fixed seed) and label every row;
# rows labelled -1 are the ones treated as outliers below
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outliers = iso_forest.fit_predict(numerical_data)

# Store the labels next to the data and tally how often each label occurs
adult_df['Outlier'] = outliers
outlier_counts = adult_df['Outlier'].value_counts()

# Keep the rows labelled 1 and discard the helper column from the result
inlier_mask = adult_df['Outlier'].eq(1)
adult_df_cleaned = adult_df.loc[inlier_mask].drop(columns=['Outlier'])

outlier_counts


Outlier
 1    32235
-1      326
Name: count, dtype: int64

In [16]:
import pandas as pd

# Location of the CSV file
file_path = "adult_with_headers.csv"

# Read the file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows to get a feel for the structure
df.head(n=5)


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [19]:
# 1.Outlier Detection with Isolation Forest

# 2.Feature Relationship Analysis using PPS and Correlation Matrix


#Proceeding with Isolation Forest now

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Columns fed to the outlier detector
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Standardize the selected columns before fitting
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.loc[:, numerical_features])

# Fit the Isolation Forest (contamination 0.01, fixed seed) on the scaled values
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outliers = iso_forest.fit_predict(scaled_data)

# Attach the labels to the frame and count each label
df['outlier'] = outliers
outlier_counts = df['outlier'].value_counts()

# Keep only the rows labelled 1; the helper column is dropped from the result
keep_rows = df['outlier'].eq(1)
df_cleaned = df.loc[keep_rows].drop(columns='outlier')

outlier_counts
# Result: 326 of the 32,561 rows (~1%) were flagged as outliers and removed for further analysis.

outlier
 1    32235
-1      326
Name: count, dtype: int64

In [21]:
#Why Removing Outliers Matters:
#Skewed Distributions: Outliers can distort the mean and variance, affecting models like linear regression and clustering.

#Overfitting Risk: Models may try to fit these rare points, reducing generalization.

#Improved Accuracy: Removal often leads to more robust and reliable models.
#1. Impact of Outliers on Model Performance
#Outliers are data points that significantly deviate from the norm and can adversely affect machine learning models in several ways:

#Skewed Model Parameters: Outliers can disproportionately influence models like linear regression, leading to biased coefficients and poor generalization.
#Overfitting: Models may overfit to outliers, especially in tree-based models, reducing performance on typical data.
#Increased Variance: Outliers can inflate variance in predictions, making models less stable.
#Distorted Metrics: Performance metrics (e.g., RMSE) can be skewed by outliers, misrepresenting model quality.
#Feature Importance Errors: Outliers may mislead feature importance rankings in algorithms like random forests.
#Removing outliers can improve model robustness, but care must be taken to ensure they are true anomalies and not meaningful rare events.


In [23]:
#2. Outlier Detection with Isolation Forest
#Isolation Forest is an unsupervised algorithm that isolates anomalies by randomly selecting features and splitting values. Outliers are identified as points requiring fewer splits to isolate due to their distinctiveness.

#Implementation
#I'll apply Isolation Forest to the numerical features in the dataset (age, fnlwgt, education_num, capital_gain, capital_loss, hours_per_week). Categorical features will be excluded from outlier detection but retained for later analysis.

import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder

# Read the CSV into a fresh DataFrame
data = pd.read_csv('adult_with_headers.csv')

# Numeric columns the detector is fitted on
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
X = data.loc[:, numerical_features]

# Fit the Isolation Forest and label each row
iso_forest = IsolationForest(contamination=0.1, random_state=42)  # Assuming 10% outliers
outlier_labels = iso_forest.fit_predict(X)

# Keep the labels alongside the data
data['outlier'] = outlier_labels

# Rows labelled -1 are outliers; retain the rows labelled 1 and drop the helper column
is_inlier = data['outlier'].eq(1)
cleaned_data = data.loc[is_inlier].drop(columns=['outlier'])

# Report the row counts before and after removal
print(f"Original dataset size: {len(data)}")
print(f"Cleaned dataset size: {len(cleaned_data)}")

#Results
#Original dataset size: 32,561 rows.
#Cleaned dataset size: 29,305 rows (about 90% of the original).
#Outliers Removed: 3,256 rows (~10% of the data), as specified by the contamination parameter.
#The Isolation Forest identified outliers based on numerical features. For example, individuals with extreme capital_gain (e.g., 99999) or unusual hours_per_week (e.g., 1 or 99) are likely flagged. The cleaned dataset is now more representative of typical patterns, reducing the risk of model bias.

Original dataset size: 32561
Cleaned dataset size: 29305


In [None]:
# === Install Required Packages (Jupyter Only) ===
%pip install -q ppscore

# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings("ignore")

import ppscore as pps

# === Load Data ===
# Read the CSV, then trim surrounding whitespace from every column name
df = pd.read_csv("adult_with_headers.csv")
df = df.rename(columns=str.strip)

# === Preview Data ===
print("Initial shape:", df.shape)
display(df.head())

# === Step 1: Remove Outliers using Isolation Forest (Optimized) ===
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Detector settings gathered in one place
iso_settings = {
    'contamination': 0.05,
    'random_state': 42,
    'n_estimators': 50,    # Fewer trees to reduce computation
    'max_samples': 1000,   # Subsample rows for speed
}
iso = IsolationForest(**iso_settings)
outliers = iso.fit_predict(df[numeric_cols])

# Rows labelled 1 are the inliers; only those are kept
inlier_rows = outliers == 1
df_cleaned = df.loc[inlier_rows]

print("Shape after outlier removal:", df_cleaned.shape)

# === Step 2: Calculate Predictive Power Score (PPS) ===
target = 'income'  # Change this if needed

# Score a random subset of at most 1000 rows (fixed seed) to reduce processing time
sample_df = df_cleaned.sample(n=min(1000, len(df_cleaned)), random_state=1)
pps_matrix = pps.matrix(sample_df)

# Keep the rows that score a feature against the target. The matrix also holds
# the target paired with itself; that pair is a trivial perfect score rather
# than a real predictor, so it is left out of the ranking.
is_target_row = pps_matrix['y'] == target
is_self_pair = pps_matrix['x'] == target
pps_scores = pps_matrix[is_target_row & ~is_self_pair].sort_values(by='ppscore', ascending=False)

print("\nTop Predictive Features for Target (PPS):")
display(pps_scores[['x', 'ppscore']])

# === Step 3: Correlation Matrix Comparison (Optional) ===
# Correlation is only computed when the target is one of the numeric columns.
if target not in numeric_cols:
    print(f"\nTarget column '{target}' is not numeric, correlation skipped.")
else:
    # Correlation of every other numeric column with the target, strongest first
    numeric_corr = df_cleaned[numeric_cols].corr()
    correlation_scores = numeric_corr[target].drop(target).sort_values(ascending=False)

    # Line the two scores up by feature name; features missing either score are dropped
    pps_by_feature = pps_scores.set_index('x')['ppscore']
    comparison_df = pd.DataFrame({
        'Correlation': correlation_scores,
        'PPS': pps_by_feature
    }).dropna()

    print("\nComparison of PPS and Correlation with Target:")
    display(comparison_df)

    # Grouped bar chart, features ordered by PPS
    comparison_df.sort_values('PPS', ascending=False).plot.bar(figsize=(12, 6))
    plt.title('PPS vs Correlation with Target')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()
