# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
import ppscore as pps

# Preprocessing and feature-selection walkthrough: load the CSV, explore it,
# impute missing values, engineer features from the raw values, scale, encode,
# drop outliers, then report predictive power scores and correlations.

# Load the dataset
data = pd.read_csv('adult_with_headers.csv')

# Basic Data Exploration
print("Summary Statistics:")
print(data.describe(include='all'))

print("\nMissing Values:")
print(data.isnull().sum())

print("\nData Types:")
print(data.dtypes)

# Handle Missing Values
# Median for numeric columns, most frequent value for everything else.
# This loop does nothing when the dataset has no missing values.
for column in data.columns[data.isnull().any()]:
    if pd.api.types.is_numeric_dtype(data[column]):
        fill_value = data[column].median()
    else:
        modes = data[column].mode()
        if modes.empty:
            # Column is entirely missing; there is nothing to impute from.
            continue
        fill_value = modes.iloc[0]
    data[column] = data[column].fillna(fill_value)

# NOTE(review): 'age', 'hours-per-week' and 'income' are assumed to match the
# CSV header exactly -- confirm against the "Data Types" output above.
numerical_features = ['age', 'hours-per-week']

# Feature Engineering
# Derived features are built from the raw values, *before* scaling: the age
# bins are expressed in years and log1p needs non-negative hours, neither of
# which holds once the columns have been standardized.
data['age_group'] = pd.cut(
    data['age'],
    bins=[0, 18, 35, 50, 65, np.inf],  # open-ended top bin to match '66+'
    labels=['0-18', '19-35', '36-50', '51-65', '66+'],
    include_lowest=True,
)

hours = data['hours-per-week']
if pd.api.types.is_numeric_dtype(data['income']):
    # Rows with zero hours would divide by zero; their ratio is defined as 0
    # so the column stays finite for the outlier detector below.
    data['income_to_job_ratio'] = (data['income'] / hours.where(hours > 0)).fillna(0.0)
else:
    print("\nSkipping income_to_job_ratio: 'income' is not numeric.")

# Log Transformation on skewed numerical features
data['log_hours_per_week'] = np.log1p(hours.clip(lower=0))

# Apply Scaling Techniques
# Standard Scaling (zero mean, unit variance). Swap in MinMaxScaler() here
# instead if a bounded [0, 1] range is preferred.
scaler_standard = StandardScaler()
data[numerical_features] = scaler_standard.fit_transform(data[numerical_features])

# Encoding Techniques
# Categorical columns are detected by dtype and split by cardinality.
categorical_columns = data.select_dtypes(exclude=['number', 'bool']).columns
category_counts = data[categorical_columns].nunique()
one_hot_columns = category_counts[category_counts < 5].index.tolist()
label_columns = category_counts[category_counts >= 5].index.tolist()

# One-Hot Encoding for categorical variables with <5 categories
if one_hot_columns:
    data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)

# Label Encoding for categorical variables with 5 or more categories
# The encoder is refit per column; after the loop it holds the last column's
# classes. astype(str) keeps the labels uniformly typed for the encoder.
label_encoder = LabelEncoder()
for column in label_columns:
    data[column] = label_encoder.fit_transform(data[column].astype(str))

# Feature Selection
# Outlier Detection with Isolation Forest
# random_state is fixed so the same rows are flagged on every run.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(data.select_dtypes(include='number'))

# Remove outliers (fit_predict marks them with -1)
data = data[outliers != -1]

# Predictive Power Score
# pps.matrix scores every column pair; pps.score needs explicit x and y.
pps_matrix = pps.matrix(data)
print("\nPredictive Power Score Matrix:")
print(pps_matrix)

# Correlation Matrix
# Restricted to numeric/bool columns so any remaining non-numeric column does
# not make corr() raise.
correlation_matrix = data.select_dtypes(include=['number', 'bool']).corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)
