# Data Preprocessing and Feature Engineering

In [None]:

import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Read the source CSV into a DataFrame and preview its first rows
csv_path = '/mnt/data/adult_with_headers.csv'
data = pd.read_csv(csv_path)
data.head()
    

## Summary Statistics

In [None]:
# Display pandas' default descriptive statistics for the loaded DataFrame.
data.describe()

## Missing Values

In [None]:
# Count missing (NaN/None) entries per column; `isna` is the same check as `isnull`.
data.isna().sum()

## Scaling the Data

In [None]:

# Pick out the numeric (float64 / int64) columns once; both scalers use the same input.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
numeric_frame = data[numeric_columns]

# Standardisation: fit on the numeric columns and wrap the result back into a DataFrame.
scaler_standard = StandardScaler()
standard_scaled_data = pd.DataFrame(
    scaler_standard.fit_transform(numeric_frame),
    columns=numeric_columns,
)

# Min-max scaling of the same columns.
scaler_minmax = MinMaxScaler()
minmax_scaled_data = pd.DataFrame(
    scaler_minmax.fit_transform(numeric_frame),
    columns=numeric_columns,
)

standard_scaled_data.head()
    

## Encoding Categorical Data

In [None]:

# Columns stored as Python objects (strings) are treated as categorical.
categorical_columns = data.select_dtypes(include=['object']).columns

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the new spelling first and fall back to
# the old one on older installations. The encoder is only constructed here;
# the loop below does its one-hot encoding with pd.get_dummies.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
label_encoder = LabelEncoder()

# Work on a copy so the raw `data` frame stays untouched.
encoded_data = data.copy()

for col in categorical_columns:
    if data[col].nunique() < 5:
        # Low-cardinality column: replace it with one indicator column per
        # category, named "<col>_<category>".
        one_hot_encoded = pd.get_dummies(data[col], prefix=col)
        encoded_data = pd.concat([encoded_data, one_hot_encoded], axis=1).drop(columns=col)
    else:
        # Higher-cardinality column: overwrite it in place with integer codes.
        # The shared encoder is re-fitted per column, so afterwards it only
        # remembers the classes of the last column it processed.
        encoded_data[col] = label_encoder.fit_transform(data[col])

encoded_data.head()
    

## Feature Engineering

In [None]:

# Feature engineering: discretise both 'age' and 'hours_per_week' into
# labelled ranges stored as new columns.
age_edges = [0, 25, 50, 75, 100]
age_labels = ['Young', 'Middle-aged', 'Senior', 'Old']
encoded_data['age_bins'] = pd.cut(encoded_data['age'], bins=age_edges, labels=age_labels)

hours_edges = [0, 20, 40, 60, 100]
hours_labels = ['Low', 'Medium', 'High', 'Very High']
encoded_data['hours_per_week_bins'] = pd.cut(
    encoded_data['hours_per_week'],
    bins=hours_edges,
    labels=hours_labels,
)

encoded_data.head()
    

## Outlier Detection with Isolation Forest

In [None]:

# Flag outliers on the raw numeric columns with an Isolation Forest.
# contamination=0.1 tells the model to treat roughly 10% of rows as outliers.
# random_state is fixed (matching the seed used for the train/test split and
# the random forest) so the outlier flags are reproducible between runs.
iso_forest = IsolationForest(contamination=0.1, random_state=42)

# fit_predict returns one label per row: -1 for outliers, 1 for inliers.
outliers = iso_forest.fit_predict(data[numeric_columns])
encoded_data['outlier'] = outliers
encoded_data.head()
    

## Feature Importance Visualization

In [None]:

# Ensuring that categorical features are properly encoded for model training.
# The binned columns are cast to str first so missing bins become the literal
# category 'nan' instead of making the encoder fail.
encoded_data['age_bins'] = label_encoder.fit_transform(encoded_data['age_bins'].astype(str))
encoded_data['hours_per_week_bins'] = label_encoder.fit_transform(encoded_data['hours_per_week_bins'].astype(str))

# Dropping unnecessary columns and preparing data for classification.
# The encoding step replaces a low-cardinality 'income' column with dummy
# columns named 'income_<value>'. Those must be removed as well, otherwise the
# target leaks into the features and dominates the importances.
# NOTE(review): assumes no genuine feature column is named 'income_...' - confirm.
target_columns = [
    col for col in encoded_data.columns
    if col == 'income' or str(col).startswith('income_')
]
X = encoded_data.drop(columns=['outlier'] + target_columns, errors='ignore')
y = data['income']

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training a Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Getting feature importances
feature_importances = rf_model.feature_importances_
features = X.columns

# Creating a DataFrame for feature importances, most important first
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)

# Visualizing feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importance from Random Forest Classifier')
plt.show()
    