In [None]:
# Data Preprocessing, Feature Engineering, and Feature Selection: Adult Income Prediction Dataset
# The dataset, often referred to as the "Adult" dataset,
# is a popular dataset in machine learning that contains census data for predicting whether a person earns more than $50K per year based on attributes like age, education, occupation, and more.

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv(r'D:\HI448116_Santosh_Karpe\FY25\DOCS\III\Ass\ASA - SK\adult_with_headers.csv')

# Basic data exploration
print(df.head())  # Display first 5 rows
print(df.info())  # Data types and non-null count
print(df.describe())  # Summary statistics for numerical columns

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [2]:
# Check for missing values
print(df.isnull().sum())

# Impute missing values (example: median imputation for numerical columns)
df.fillna(df.median(), inplace=True)

# Or, drop rows with missing values
df.dropna(inplace=True)

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


TypeError: Cannot convert [[' State-gov' ' Self-emp-not-inc' ' Private' ... ' Private' ' Private'
  ' Self-emp-inc']
 [' Bachelors' ' Bachelors' ' HS-grad' ... ' HS-grad' ' HS-grad'
  ' HS-grad']
 [' Never-married' ' Married-civ-spouse' ' Divorced' ... ' Widowed'
  ' Never-married' ' Married-civ-spouse']
 ...
 [' Male' ' Male' ' Male' ... ' Female' ' Male' ' Female']
 [' United-States' ' United-States' ' United-States' ... ' United-States'
  ' United-States' ' United-States']
 [' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']] to numeric

In [None]:
#Scaling Techniques
#To scale numerical features, we’ll apply Standard Scaling and Min-Max Scaling.
#Scaling ensures that features with larger ranges do not dominate models that rely on distance-based metrics (e.g., KNN, SVM).
#Standard Scaling
#Standard scaling standardizes the data by subtracting the mean and dividing by the standard deviation. 
#This centers the data around 0 with a unit variance.

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_scaled_standard = df.copy()
df_scaled_standard[['age', 'education_num']] = scaler.fit_transform(df[['age', 'education_num']])

In [None]:
# Min-Max Scaling
# Min-max scaling rescales features to a range of [0, 1], making it ideal for algorithms that require data within this range (e.g., neural networks).

In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_scaled_minmax = df.copy()
df_scaled_minmax[['age', 'education_num']] = scaler.fit_transform(df[['age', 'education_num']])

In [None]:
#Encoding Techniques
#One-Hot Encoding for Categorical Variables with Less than 5 Categories
#One-hot encoding converts categorical variables into binary (0 or 1) features. 
#It is useful for categorical variables that have fewer unique categories (less than 5).

In [11]:
df_encoded = pd.get_dummies(df, columns=['workclass', 'education', ], drop_first=True)

In [None]:
#Label Encoding for Categorical Variables with More than 5 Categories

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['occupation_encoded'] = le.fit_transform(df['occupation'])

In [None]:
#3. Feature Engineering
#Creating New Features
#Feature engineering involves creating new features that could provide valuable information to the model.
#Feature 1: Age Group Create a feature to categorize the individuals into age groups:

In [13]:
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 30, 40, 50, 60, 100], labels=['<18', '18-30', '30-40', '40-50', '50-60', '60+'])


In [None]:
#Feature 2: Work Hours Over 40 Create a feature to indicate if the person works more than 40 hours per week:

In [15]:
df['work_over_40'] = df['hours_per_week'].apply(lambda x: 1 if x > 40 else 0)

In [None]:
#Log Transformation of Skewed Features
#Skewed numerical features (like income) can be transformed using a log transformation to normalize the data distribution, 
#which can help improve model performance, especially for linear models.

In [16]:
import numpy as np

# Apply log transformation
df['log_hours_per_week'] = np.log1p(df['hours_per_week'])

In [None]:
#Feature Selection
#Outlier Detection with Isolation Forest
#Outliers can severely affect model performance, especially with distance-based models. 
#One technique to detect and remove outliers is using the Isolation Forest algorithm.

In [18]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest
iso_forest = IsolationForest(contamination=0.05)
outliers = iso_forest.fit_predict(df[['age', 'hours_per_week']])

# Mark outliers with -1 and inliers with 1
df['outlier'] = outliers
df_no_outliers = df[df['outlier'] == 1]

In [None]:
#PPS (Predictive Power Score)
#The PPS score quantifies the predictive power of a feature with respect to the target variable.
#It helps identify features that have the most influence on the target and can guide feature selection.

In [None]:
import ppscore as pps

# Compute PPS matrix
pps_matrix = pps.matrix(df)
print(pps_matrix)