# DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

# Data Loading & Basic Exploration

In [2]:
# Step 1: Import libraries
import pandas as pd
import numpy as np

# Step 2: Load the dataset
df = pd.read_csv('/Users/reddyharshayadhav/Downloads/Data Trasformation/adult_with_headers (1).csv')

# Step 3: Basic exploration
print("First 5 rows:")
print(df.head())
print("\nSummary statistics:")
print(df.describe(include='all'))
print("\nMissing values by column:")
print(df.isnull().sum())
print("\nData types:")
print(df.dtypes)

First 5 rows:
   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United

# Handling Missing Values

In [3]:
# Step 4: Handle missing values properly (no errors/warnings)

import pandas as pd
import numpy as np

for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        # Only impute with median if column is numeric
        df[col] = df[col].fillna(df[col].median())
    else:
        # Impute categorical with mode
        df[col] = df[col].fillna(df[col].mode()[0])

print("\nMissing values after imputation:")
print(df.isnull().sum())


Missing values after imputation:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


# Scaling Numerical Features

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Step 5: Select numeric columns
num_cols = df.select_dtypes(include=np.number).columns

# Standard Scaling
scaler_std = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[num_cols] = scaler_std.fit_transform(df[num_cols])

# Min-Max Scaling
scaler_mm = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[num_cols] = scaler_mm.fit_transform(df[num_cols])

# Display sample of scaled data
print("Standard Scaled data (first 5 rows):")
print(df_standard_scaled[num_cols].head())
print("Min-Max Scaled data (first 5 rows):")
print(df_minmax_scaled[num_cols].head())

Standard Scaled data (first 5 rows):
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.030671 -1.063611       1.134739      0.148453      -0.21666   
1  0.837109 -1.008707       1.134739     -0.145920      -0.21666   
2 -0.042642  0.245079      -0.420060     -0.145920      -0.21666   
3  1.057047  0.425801      -1.197459     -0.145920      -0.21666   
4 -0.775768  1.408176       1.134739     -0.145920      -0.21666   

   hours_per_week  
0       -0.035429  
1       -2.222153  
2       -0.035429  
3       -0.035429  
4       -0.035429  
Min-Max Scaled data (first 5 rows):
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.301370  0.044302       0.800000       0.02174           0.0   
1  0.452055  0.048238       0.800000       0.00000           0.0   
2  0.287671  0.138113       0.533333       0.00000           0.0   
3  0.493151  0.151068       0.400000       0.00000           0.0   
4  0.150685  0.221488       0.800000       0.00000        

# 2.Encoding Categorical Features

In [5]:
from sklearn.preprocessing import LabelEncoder

# Step 6: One-Hot Encoding for categorical features with < 5 categories
cat_cols = [col for col in df.select_dtypes(include='object').columns if df[col].nunique() <= 5]
df_onehot = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Step 7: Label Encoding for other categorical features
le = LabelEncoder()
cat_cols_others = [col for col in df.select_dtypes(include='object').columns if col not in cat_cols]
df_label = df_onehot.copy()
for col in cat_cols_others:
    df_label[col] = le.fit_transform(df_label[col])

print("Encoded Data Sample:")
print(df_label.head())

Encoded Data Sample:
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  capital_gain  capital_loss  hours_per_week  \
0           1             1          2174             0              40   
1           4             0             0             0              13   
2           6             1             0             0              40   
3           6             0             0             0              40   
4          10             5             0             0              40   

   native_country  race_ Asian-Pac-Islander  race_ Black  race_ Other  \
0              39           

Discussion:
One-Hot Encoding creates binary columns for each category—best for variables with a few categories and avoids introducing order where it doesn’t exist.
Label Encoding assigns numbers to categories, best for tree-based algorithms but can create fake order for linear models.

One-Hot Encoding

Pros:

No Ordinal Relationship: Prevents introduction of artificial ordinal relationships among categorical values, which is essential for linear models and algorithms sensitive to feature scales.

Interpretability: Each category gets its own column, making the encoding easily interpretable.

Compatibility: Works well with algorithms that can’t handle categorical variables directly, such as linear regression and logistic regression.

Cons:

Dimensionality Explosion: Creates a new column for each category, which can lead to a very high-dimensional dataset if a feature has many unique values.

Increased Memory and Runtime: The resulting sparse matrices can slow down training and use more memory.

Not Suitable for High Cardinality: Not efficient for categorical features with a large number of categories.

Label Encoding

Pros:

Simplicity: Converts categories into numeric codes, which is easy to implement and requires less memory.

No Extra Columns: Maintains the original number of columns in the dataset, resulting in efficient storage and computation.

Works Well with Tree-Based Models: Algorithms such as decision trees and random forests treat label values as distinct categories, so no ordinal relationship is imposed.

Cons:

Artificial Ordinal Relationship: May introduce unintended ordinal relationships among categories, which can negatively affect models sensitive to feature value order (e.g., linear models).

Misinterpretation Risk: Linear or distance-based algorithms might assume that higher label values have greater importance or relationship.

# 3.Feature Engineering

In [6]:
# Example 1: Age group feature
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 45, 65, 100], labels=['Youth', 'Adult', 'Mid-Age', 'Senior'])

# Example 2: Working hours category
# Use the correct column name - check your DataFrame's columns!
df['hours_per_week_group'] = pd.cut(df['hours_per_week'], bins=[0, 20, 40, 60, np.inf], labels=['Part-time', 'Full-time', 'Overtime', 'Extreme'])

print("New feature samples:")
print(df[['age', 'age_group', 'hours_per_week', 'hours_per_week_group']].head())

New feature samples:
   age age_group  hours_per_week hours_per_week_group
0   39     Adult              40            Full-time
1   50   Mid-Age              13            Part-time
2   38     Adult              40            Full-time
3   53   Mid-Age              40            Full-time
4   28     Adult              40            Full-time


# Data Transformation (e.g., Log transformation for skewed features)

In [7]:
# Step 9: Check columns and skewness
print(df.columns.tolist())  # Check actual column names

# Use the correct column name below
print("Skewness before transformation:", df['capital_gain'].skew())
df['capital_gain_log'] = np.log1p(df['capital_gain'])
print("Skewness after log transformation:", df['capital_gain_log'].skew())

['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income', 'age_group', 'hours_per_week_group']
Skewness before transformation: 11.953847687699799
Skewness after log transformation: 3.096143524467517


Justification for Log Transformation on a Skewed Numerical Feature

I applied a log transformation to the capital_gain feature because its distribution was highly skewed, with most individuals having low or zero values and a few with very large gains. Such skewness can hinder model performance and cause instability during training, especially for algorithms sensitive to data distribution. By using a log transformation, I effectively reduced the impact of outliers, compressed the range of values, and made the feature distribution more normal-like. This helps improve the predictive accuracy and stability of machine learning models, ensures that extreme values do not disproportionately influence outcomes, and enhances interpretability. Therefore, the log transformation is an appropriate preprocessing step for numerical features with significant skewness.