In [1]:
import pandas as pd
import numpy  as np

In [None]:
# Load the dataset from 'adult_with_headers.csv' (path is relative to the
# notebook's working directory) and preview the first five rows.
df = pd.read_csv('adult_with_headers.csv')
df.head()

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Number of missing (NaN/None) values per column.
# NOTE(review): placeholder strings (e.g. '?') would not be counted here —
# confirm how missing values are encoded in this file.
df.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Standard Scaling:

In [None]:
# Standardise the numeric columns (zero mean, unit variance).
# The work is done on a copy so the original `df` keeps its raw values.
from sklearn.preprocessing import StandardScaler

numerical = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

df1 = df.copy()
scaler = StandardScaler().fit(df1[numerical])
df1[numerical] = scaler.transform(df1[numerical])

In [None]:
# Min-Max Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numeric columns to the [0, 1] range.
# The scaler is fitted on the raw values in `df` and the result is written to
# its own frame: previously it was fitted on the already-standardised `df1`
# and overwrote it, so the two scalings could not be compared side by side.
minmax_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[numerical] = minmax_scaler.fit_transform(df[numerical])

In [None]:
# Discussion:

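### Standard Scaling (z-score) is preferred when features are roughly normally distributed or when the algorithm expects zero-centred, unit-variance inputs (e.g. linear/logistic regression, SVM, PCA). It is less distorted by outliers than Min-Max Scaling, but it is not robust to them.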

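### Min-Max Scaling is preferred when the data has known bounds and the algorithm needs features on a fixed range such as [0, 1] (e.g. k-NN, neural networks). Because it uses the observed minimum and maximum, a single extreme value compresses all other values into a narrow band.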

In [None]:
# 2. Encoding Techniques

In [None]:
# One-hot encode the low-cardinality categorical variables.
# drop_first=True drops one dummy per variable, so each variable with k
# categories contributes k-1 indicator columns.
one_hot_columns = ['sex', 'race']
df = pd.get_dummies(data=df, columns=one_hot_columns, drop_first=True)

In [None]:
# Preview the first rows of the frame at this point in the notebook.
df.head()

In [None]:
# Use Label Encoding for Categorical Variables with More Than 5 Categories

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to integer-encode, in the same order as before.
df_label_columns = [
    'education',
    'occupation',
    'workclass',
    'marital_status',
    'relationship',
    'native_country',
]

# Keep one fitted encoder per column. A single shared LabelEncoder is refit on
# every call, which discards the code -> category mapping of all earlier
# columns and makes the encoded values impossible to decode later
# (e.g. df_label_encoders['education'].inverse_transform(...)).
df_label_encoders = {}
for column in df_label_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    df_label_encoders[column] = label_encoder

# `label_encoder` still refers to the encoder fitted on 'native_country', as before.
# 'income' is left unencoded (its encoding was commented out in the original cell).

In [None]:
# Random preview of the encoded frame. Seeded so the same rows are shown on
# every Restart & Run All, and kept small so the output stays readable.
df.sample(10, random_state=42)

In [None]:
#  Discuss the Pros and Cons of One-Hot Encoding and Label Encoding

In [None]:
# One-Hot Encoding:

## Pros: Preserves the categorical nature of the data, avoids ordinal assumptions.

## Cons: Can lead to high dimensionality (curse of dimensionality) if there are many categories.

# Label Encoding:

## Pros: Keeps a single column per feature (no added dimensionality); suitable for ordinal data when the assigned codes follow the real category order.

## Cons: Introduces ordinal relationships where none exist, which can mislead the model.

In [None]:
# Feature Engineering

In [None]:
# 1. hours_per_week -> categorical workload band
#
# pd.cut uses right-closed intervals, so the bands are:
#   part-time:      0–30 hours/week
#   full-time:      31–40 hours/week
#   overtime:       41–60 hours/week
#   extra-overtime: 61+ hours/week
#
# include_lowest=True puts exactly 0 hours in the first band, and the
# open-ended upper edge puts values above 100 in the last band. With the
# previous bins [0, 30, 40, 60, 100] both cases silently became NaN, which
# contradicted the documented "0–30" and "61+" ranges.
hours_bins = [0, 30, 40, 60, np.inf]
hours_labels = ['part-time', 'full-time', 'overtime', 'extra-overtime']
df['hours_per_week_category'] = pd.cut(
    df['hours_per_week'],
    bins=hours_bins,
    labels=hours_labels,
    include_lowest=True,
)

# Seeded, small preview so the displayed rows are reproducible.
df['hours_per_week_category'].sample(10, random_state=42)

In [None]:
# capital_net: capital gain minus capital loss for each row.
#   value > 0 -> net gain
#   value < 0 -> net loss
capital_net = df['capital_gain'].sub(df['capital_loss'])
df['capital_net'] = capital_net
df['capital_net']

In [None]:
# age_group: bucket age into coarse life-stage bands.
#
# pd.cut uses right-closed intervals, so the bands are:
#   young:      0–25 years
#   mid-career: 26–40 years
#   senior:     41–60 years
#   retired:    61+ years
#
# include_lowest=True keeps age 0 in the first band and the open-ended upper
# edge keeps ages above 100 in the last band. With the previous bins
# [0, 25, 40, 60, 100] both cases silently became NaN, which contradicted the
# documented "0–25" and "61+" ranges.
age_bins = [0, 25, 40, 60, np.inf]
age_labels = ['young', 'mid-career', 'senior', 'retired']
df['age_group'] = pd.cut(
    df['age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)
df['age_group']


In [None]:
# Transforming Skewed Features

In [None]:
# Log transformation of capital_gain and capital_loss.
# log1p computes log(1 + x), so rows with a value of 0 map to 0 instead of -inf.
for skewed_column in ('capital_gain', 'capital_loss'):
    df[f'{skewed_column}_log'] = np.log1p(df[skewed_column])

df['capital_gain_log']

In [None]:
# Display the log1p-transformed capital_loss column.
df['capital_loss_log'] 

In [None]:
# Square-root transformation of fnlwgt, stored as a new column next to the original.
fnlwgt_root = np.sqrt(df['fnlwgt'])
df['fnlwgt_sqrt'] = fnlwgt_root
df['fnlwgt_sqrt']

In [None]:
#  Feature Selection:

In [None]:
# Isolation Forest for Outlier Detection

In [None]:
# Re-inspect dtypes and non-null counts now that the encoded and engineered columns exist.
df.info()

In [3]:
# Reload the raw CSV into `df1` — presumably so outlier detection and feature
# selection start from unmodified data rather than the engineered `df`.
# NOTE: this rebinds `df1`, which earlier held the scaled copy of `df`.
df1 = pd.read_csv('adult_with_headers.csv')
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Integer-encode the categorical feature columns of df1.
# 'sex' is not in this list because it is one-hot encoded below: label-encoding
# it first and then one-hot encoding the resulting 0/1 codes only produced an
# opaque 'sex_1' column with no extra information.
df1_label_columns = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'native_country',
]

# One fitted encoder per column so every integer code can be mapped back to
# its category; a single shared encoder only remembers the last column it saw.
df1_label_encoders = {}
for column in df1_label_columns:
    label_encoder = LabelEncoder()
    df1[column] = label_encoder.fit_transform(df1[column])
    df1_label_encoders[column] = label_encoder

# `label_encoder` still refers to the encoder fitted on 'native_country', as before.

# One-hot encode 'sex' directly from its original labels; drop_first keeps a
# single indicator column named after the retained category.
df1 = pd.get_dummies(df1, columns=['sex'], drop_first=True)
df1.head()

In [None]:
from sklearn.ensemble import IsolationForest

# Initialize Isolation Forest: contamination=0.05 flags about 5% of the rows as
# outliers; random_state makes the result reproducible.
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fit on the feature columns only (the target 'income' is excluded).
# The labels are kept in a separate Series instead of a new df1 column, so that
#   - re-running this cell does not feed a previous 'anomaly' column back into
#     the model as a feature,
#   - the "original" shape printed below is not inflated by a helper column,
#   - later steps that consume df1 do not treat the helper column as a feature.
# errors='ignore' also handles a df1 that already carries 'anomaly' from an
# earlier run of the previous version of this cell.
features = df1.drop(columns=['income']).drop(columns=['anomaly'], errors='ignore')
anomaly = pd.Series(
    iso_forest.fit_predict(features),  # 1 = inlier, -1 = outlier
    index=df1.index,
    name='anomaly',
)

# Remove outliers
df_cleaned = df1.loc[anomaly == 1].drop(columns=['anomaly'], errors='ignore')

print(f"Original dataset shape: {df1.shape}")
print(f"Cleaned dataset shape: {df_cleaned.shape}")

In [None]:
!pip install ppscore

In [None]:
# Predictive Power Score (PPS) on the full (pre-outlier-removal) data
import ppscore as pps

# Score real features only: the outlier step may have added an 'anomaly'
# helper column to df1, and it must not be ranked as a predictor of income.
pps_input = df1.drop(columns=['anomaly'], errors='ignore')

# Calculate PPS matrix (one row per (x, y) column pair)
pps_matrix = pps.matrix(pps_input)

# Keep only the rows where the predicted variable is the target
pps_target = pps_matrix[pps_matrix['y'] == 'income']
print(pps_target.sort_values(by='ppscore', ascending=False))

In [None]:
import ppscore as pps

# Calculate PPS matrix
pps_matrix = pps.matrix(df_cleaned)

# Filter for the target variable
pps_target = pps_matrix[pps_matrix['y'] == 'income']

# Display features sorted by PPS score
print(pps_target.sort_values(by='ppscore', ascending=False))