# Data Preprocessing and Feature Engineering

In [None]:
# Read the CSV into the `data` DataFrame used by every later cell,
# then preview the first rows to sanity-check the parse.
import pandas as pd

data = pd.read_csv(filepath_or_buffer='/mnt/data/adult_with_headers.csv')
data.head(n=5)

## Summary Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the quartiles are spelled out but match the defaults.
data.describe(percentiles=[0.25, 0.5, 0.75])

## Missing Values

In [None]:
# Count NaN/None cells per column. Placeholder strings are NOT counted:
# only values pandas itself treats as missing show up here.
data.isna().sum(axis=0)

## Scaling the Data

In [None]:

# Build two scaled copies of the numeric columns: one standardized
# (StandardScaler) and one rescaled to a fixed range (MinMaxScaler).
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

# Only int64/float64 columns are scaled; object (categorical) columns are
# handled separately by the encoding step.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

# fit_transform returns a bare NumPy array, so both the column labels and the
# row index must be restored explicitly. Passing index=data.index keeps the
# scaled frames row-aligned with `data` even if `data` does not carry a
# default RangeIndex (e.g. after filtering or dropping rows).
standard_scaled_data = pd.DataFrame(
    scaler_standard.fit_transform(data[numeric_columns]),
    columns=numeric_columns,
    index=data.index,
)
minmax_scaled_data = pd.DataFrame(
    scaler_minmax.fit_transform(data[numeric_columns]),
    columns=numeric_columns,
    index=data.index,
)

standard_scaled_data.head()

## Encoding Categorical Data

In [None]:

# Encode every object-typed column of `data` into `encoded_data`:
#   * fewer than 5 distinct values -> one-hot columns named "<col>_<value>"
#   * otherwise                    -> integer codes from a LabelEncoder
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

categorical_columns = data.select_dtypes(include=['object']).columns

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
# The encoder is not used below (pd.get_dummies does the one-hot step); the
# name is kept available for any code that relies on it.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# Kept defined even when no column takes the label-encoding branch.
label_encoder = LabelEncoder()
# One fitted encoder per label-encoded column, so the integer codes can be
# mapped back to the original categories later (inverse_transform / classes_).
label_encoders = {}

encoded_data = data.copy()

for col in categorical_columns:
    if data[col].nunique() < 5:
        # Low cardinality: replace the column with its indicator columns.
        one_hot_encoded = pd.get_dummies(data[col], prefix=col)
        encoded_data = pd.concat([encoded_data, one_hot_encoded], axis=1).drop(columns=col)
    else:
        # Use a fresh encoder per column; refitting a single shared instance
        # would silently discard the mapping of every earlier column.
        label_encoder = LabelEncoder()
        encoded_data[col] = label_encoder.fit_transform(data[col])
        label_encoders[col] = label_encoder

encoded_data.head()

## Feature Engineering

In [None]:

# Derive two categorical features by bucketing 'age' and 'hours_per_week'.
# pd.cut intervals are right-inclusive by default, e.g. (0, 25], (25, 50];
# values outside the outermost edges become NaN.
encoded_data['age_bins'] = pd.cut(
    x=encoded_data['age'],
    bins=[0, 25, 50, 75, 100],
    labels=['Young', 'Middle-aged', 'Senior', 'Old'],
)
encoded_data['hours_per_week_bins'] = pd.cut(
    x=encoded_data['hours_per_week'],
    bins=[0, 20, 40, 60, 100],
    labels=['Low', 'Medium', 'High', 'Very High'],
)
encoded_data.head()

## Outlier Detection with Isolation Forest

In [None]:

# Flag anomalous rows with an Isolation Forest fitted on the (unscaled)
# numeric columns, and store the flag next to the encoded features.
from sklearn.ensemble import IsolationForest

# contamination=0.1 sets the expected share of outliers to 10% of the rows.
# random_state is pinned because the forest is built from random sub-samples
# and splits; without it the flagged rows change from run to run.
iso_forest = IsolationForest(contamination=0.1, random_state=42)

# fit_predict returns one label per row: -1 for outliers, 1 for inliers.
outliers = iso_forest.fit_predict(data[numeric_columns])

# Assigned positionally; encoded_data was copied from `data`, so the rows
# are in the same order.
encoded_data['outlier'] = outliers
encoded_data.head()