## **Preprocessing Based on EDA**

Based on Evidence from EDA, I will follow this:
1. Missing Value cross-checking
2. Outlier Handling (As pushed from EDA)
3. Skewness Correlation
4. Feature Engineering

In [10]:
# Importing the necessary libraries

import warnings
warnings.filterwarnings("ignore")

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

# Preprocessing libraries
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix

# Statistical libraries
from scipy import stats
from scipy.stats import zscore, skew

# libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries imported successfully!")

Libraries imported successfully!


In [11]:
# Loading the data from EDA
data = pd.read_csv("cleaned_EDA_data.csv")
data.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


**1. Missing Value Cross-Checking**

In [12]:
missing_values = data.isna().sum()
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values as expected from EDA.")

No missing values as expected from EDA.


**2. Outlier Handling**

What to take note of is that, so features needs to be engineered before handling outlier. Hence, I will be engineering some features below.

- Age of estab.
- Annual Wage


Feature Engineering

In [13]:
# For creating the Age of estab.
from datetime import datetime

current_year = datetime.now().year
data["Age_of_estab"] = current_year - data["yr_of_estab"]


# For the Annual Wages normalization

# Define conversion factors to yearly wage
conversion_map = {
    'Hour': 2080,       # 40 hrs × 52 weeks
    'Week': 52,
    'Month': 12,
    'Year': 1
}

# Create annual wage using .map()
data['annual_wage'] = data['prevailing_wage'] * data['unit_of_wage'].map(conversion_map)
data['monthly_wage'] = data['annual_wage'] / 12
data['weekly_wage'] = data['annual_wage'] / 52
data['hourly_wage'] = data['annual_wage'] / 2080

In [14]:
data.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status,Age_of_estab,annual_wage,monthly_wage,weekly_wage,hourly_wage
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied,18,1231782.032,102648.502667,23688.116,592.2029
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified,23,83425.65,6952.1375,1604.339423,40.108486
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied,17,122996.86,10249.738333,2365.324231,59.133106
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied,128,83434.03,6952.835833,1604.500577,40.112514
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified,20,149907.39,12492.2825,2882.834423,72.070861


**Log-Transform Skewed Variables (EDA Recommendation)**

In [15]:
# Log transform skewed variables as recommended by EDA

skewed_vars = ['no_of_employees', 'yr_of_estab', 'annual_wage', 'monthly_wage', 'weekly_wage', 'hourly_wage']

for var in skewed_vars:
    if var in data.columns:
        # Check if variable has zero or negative missing_values
        min_var = data[var].min()
        if min_var <= 0:
            # Use log1p for variables with zeros
            data[f'{var}_log'] = np.log1p(data[var])
            print(f"{var}: Applied log1p transformation (had {min_var:.3f} minimum value)")
        else:
            # Use log for positive values only
            data[f'{var}_log'] = np.log(data[var])
            print(f"✓ {var}: Applied log transformation")
        
        # Check skewness before and after
        original_skew = skew(data[var])
        transformed_skew = skew(data[f'{var}_log'])
        print(f"  Original skewness: {original_skew:.3f} → Transformed skewness: {transformed_skew:.3f}")

print(f"\nDataset shape after log transformation: {data.shape}")
print("New log-transformed columns:", [col for col in data.columns if '_log' in col])

        

no_of_employees: Applied log1p transformation (had -26.000 minimum value)
  Original skewness: 12.265 → Transformed skewness: nan
✓ yr_of_estab: Applied log transformation
  Original skewness: -2.037 → Transformed skewness: -2.082
✓ annual_wage: Applied log transformation
  Original skewness: 8.976 → Transformed skewness: 0.014
✓ monthly_wage: Applied log transformation
  Original skewness: 8.976 → Transformed skewness: 0.014
✓ weekly_wage: Applied log transformation
  Original skewness: 8.976 → Transformed skewness: 0.014
✓ hourly_wage: Applied log transformation
  Original skewness: 8.976 → Transformed skewness: 0.014

Dataset shape after log transformation: (25480, 22)
New log-transformed columns: ['no_of_employees_log', 'yr_of_estab_log', 'annual_wage_log', 'monthly_wage_log', 'weekly_wage_log', 'hourly_wage_log']


**Outlier Treatment (EDA Recommendation)**

Based on EDA findings, handle outliers using IQR-capping methods

In [None]:
# Outlier treatment based on EDA recommendations
print("=== OUTLIER TREATMENT (IQR_CAPPING METHOD) ===")

# Let's define numerical columns excluding the target
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
if 'case_status' in numerical_cols:
    numerical_cols.remove('case_status')

print(f"Treating outliers in {len(numerical_cols)} numerical features...\n")

# Apply IQR-capping method
outliers_capped = 0
for col in numerical_cols:
    Q1 = data[col]   .quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Count outliers before capping
    outliers_before = ((data[col] < lower_bound) | (data[col] > upper_bound)).sum()
    

    if outliers_before > 0:
        # Cap outliers
        data[col] = np.where(data[col] < lower_bound, lower_bound, data[col])
        data[col] = np.where(data[col] > upper_bound, upper_bound, data[col])
        outliers_capped += outliers_before
        print(f"✓ {col}: Capped {outliers_before} outliers\n")

print(f"\nTotal outliers capped: {outliers_capped}")
print(f"Dataset shape after outlier treatment: {data.shape}")


=== OUTLIER TREATMENT (IQR_CAPPING METHOD) ===
Treating outliers in 14 numerical features...
✓ no_of_employees: Capped 1556 outliers

✓ yr_of_estab: Capped 3260 outliers

✓ prevailing_wage: Capped 427 outliers

✓ Age_of_estab: Capped 3260 outliers

✓ annual_wage: Capped 2387 outliers

✓ monthly_wage: Capped 2387 outliers

✓ weekly_wage: Capped 2387 outliers

✓ hourly_wage: Capped 2387 outliers

✓ no_of_employees_log: Capped 1890 outliers

✓ yr_of_estab_log: Capped 3291 outliers

✓ annual_wage_log: Capped 2838 outliers

✓ monthly_wage_log: Capped 2838 outliers

✓ weekly_wage_log: Capped 2838 outliers

✓ hourly_wage_log: Capped 2838 outliers


Total outliers capped: 34584
Dataset shape after outlier treatment: (25480, 22)
