# DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

### Objective:
This assignment aims to equip you with practical skills in data preprocessing, feature engineering, and feature selection techniques, which are crucial for building efficient machine learning models. You will work with a provided dataset to apply various techniques such as scaling, encoding, and feature selection methods including isolation forest and PPS score analysis.
Dataset:
Given "Adult" dataset, which predicts whether income exceeds $50K/yr based on census data.

Tasks:
1. Data Exploration and Preprocessing:
2. Encoding Techniques:
3. Feature Engineering:
4. Feature Selection:

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
import ppscore as pps
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset
data = pd.read_csv("C:\\Users\\SHUBHAM GARKAL\\Downloads\\EDA2\\EDA2\\adult_with_headers.csv")
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
# Data Exploration and Preprocessing

# Summary statistics
summary_stats = data.describe()
print(summary_stats)

                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [4]:
# Check for missing values
missing_values = data.isnull().sum()
print(missing_values)

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [5]:
# Handle missing values

# Let's assume we'll impute missing values with the mean for numerical features
data.fillna(data.mean(), inplace=True)

In [6]:
# Scaling techniques
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

In [7]:
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
data[numerical_features] = scaler_standard.fit_transform(data[numerical_features])
data[numerical_features] = scaler_minmax.fit_transform(data[numerical_features])

In [8]:
# Encoding Techniques

# One-Hot Encoding for categorical variables with less than 5 categories
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']
for feature in categorical_features:
    if len(data[feature].unique()) < 5:
        data = pd.get_dummies(data, columns=[feature])

In [9]:
# Label Encoding for categorical variables with more than 5 categories
label_encoder = LabelEncoder()
for feature in categorical_features:
    if len(data[feature].unique()) >= 5:
        data[feature] = label_encoder.fit_transform(data[feature])

In [10]:
# Feature Engineering

# Create new features
data['capital_diff'] = data['capital_gain'] - data['capital_loss']
data['working_hours_per_age'] = data['hours_per_week'] / data['age']

In [11]:
# Apply log transformation to skewed numerical feature
skewed_feature = 'capital_gain'  # Example numerical feature
data[skewed_feature] = np.log(data[skewed_feature] + 1)  # Add 1 to avoid log(0)

In [12]:
# Feature Selection
# Isolation Forest for outlier detection and removal
isolation_forest = IsolationForest(contamination=0.05)
outlier_labels = isolation_forest.fit_predict(data[numerical_features])
data = data[outlier_labels == 1]

In [13]:
# PPS analysis
pps_matrix = pps.matrix(data)
print(pps_matrix)

                         x                      y  ppscore            case  \
0                      age                    age      1.0  predict_itself   
1                      age              workclass      0.0      regression   
2                      age                 fnlwgt      0.0      regression   
3                      age              education      0.0      regression   
4                      age          education_num      0.0      regression   
..                     ...                    ...      ...             ...   
284  working_hours_per_age         hours_per_week      0.0   unknown_error   
285  working_hours_per_age         native_country      0.0   unknown_error   
286  working_hours_per_age                 income      0.0   unknown_error   
287  working_hours_per_age           capital_diff      0.0   unknown_error   
288  working_hours_per_age  working_hours_per_age      1.0  predict_itself   

     is_valid_score               metric  baseline_score  model

In [14]:
# Compare with correlation matrix
correlation_matrix = data.corr()
print(correlation_matrix)

                            age  workclass    fnlwgt  education  \
age                    1.000000   0.012895 -0.080479  -0.002302   
workclass              0.012895   1.000000 -0.017496   0.017946   
fnlwgt                -0.080479  -0.017496  1.000000  -0.025604   
education             -0.002302   0.017946 -0.025604   1.000000   
education_num          0.031230   0.043622 -0.043758   0.350001   
marital_status        -0.287492  -0.060295  0.028393  -0.033912   
occupation            -0.015841   0.246716 -0.001455  -0.026481   
relationship          -0.260976  -0.092034  0.008098  -0.010283   
race                   0.026185   0.049826 -0.024027   0.012913   
capital_gain           0.067878   0.007267 -0.027906   0.017938   
capital_loss          -0.002070  -0.005915 -0.003949   0.017400   
hours_per_week         0.097206   0.128098 -0.017551   0.052112   
native_country         0.001294  -0.004776 -0.052704   0.063389   
capital_diff           0.018829   0.007498 -0.003242  -0.01222