# Importing Library 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.simplefilter('ignore')

In [2]:
# pip install numpy pandas matplotlib seaborn ppscore

## 1. Data Exploration and Preprocessing:

Load the dataset and conduct basic data exploration (summary statistics, missing values, data types).

In [3]:
df = pd.read_csv("adult_with_headers.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
print("Length of the Dataset :" +str(len(df)))

Length of the Dataset :32561


In [5]:
df.shape

(32561, 15)

In [6]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [7]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

Handle missing values as per the best practices (imputation, removal, etc.).

In [8]:
df.dropna()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [9]:
# Example: Impute missing values
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)



Apply scaling techniques to numerical features:

Standard Scaling

Min-Max Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


In [11]:
from sklearn.preprocessing import MinMaxScaler

minmax_scaler = MinMaxScaler()
df[numerical_cols] = minmax_scaler.fit_transform(df[numerical_cols])


Discuss the scenarios where each scaling technique is preferred and why.

Answer :- 
Standard Scaling
    Preferred if data is roughly Gaussian.Ensures each feature contributes equally in models sensitive to variance.

Min-Max Scaling : - 
    Preferred when distribution is not normal.Preserves shape of the original distribution but compresses into [0,1].
Useful when algorithm relies on absolute magnitudes.


## 2. Encoding Techniques:

Apply One-Hot Encoding to categorical variables with less than 5 categories.

In [12]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

onehot_cols = [col for col in categorical_cols if col in df.columns and df[col].nunique() <= 5]
df = pd.get_dummies(df, columns=onehot_cols, drop_first=True)


In [13]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Male,income_ >50K
0,0.30137,State-gov,0.044302,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,0.02174,0.0,0.397959,United-States,0,0,0,1,1,0
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,0.0,0.0,0.122449,United-States,0,0,0,1,1,0
2,0.287671,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,0.0,0.0,0.397959,United-States,0,0,0,1,1,0
3,0.493151,Private,0.151068,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,0.0,0.0,0.397959,United-States,0,1,0,0,1,0
4,0.150685,Private,0.221488,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,0.0,0.0,0.397959,Cuba,0,1,0,0,0,0


Label Encoding (>5 categories)

In [14]:
from sklearn.preprocessing import LabelEncoder

# Only include columns that exist in df and have >5 unique values
label_cols = [col for col in categorical_cols if col in df.columns and df[col].nunique() > 5]

le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])


## Discuss the pros and cons of One-Hot Encoding and Label Encoding.
Advantages and Disadvantages of One Hot Encoding

Advantages of Using One Hot Encoding
1.  It allows the use of categorical variables in models that require numerical input.
2.  It can improve model performance by providing more information to the model about the categorical variable.
3.  It can help to avoid the problem of ordinality which can occur when a categorical variable has a natural ordering (e.g. "small", "medium", "large").

Disadvantages of Using One Hot Encoding
1.  It can lead to increased dimensionality as a separate column is created for each category in the variable. This can make the model more complex and   slow to train.
2.  It can lead to sparse data as most observations will have a value of 0 in most of the one-hot encoded columns.
3.  It can lead to overfitting especially if there are many categories in the variable and the sample size is relatively small.

Advantages and Disadvantages of Label Encoding
1.  Encoding ordinal features: Numbers can capture the inherent order of categories.
2.  Using tree-based algorithms: Models like decision trees or random forests are insensitive to numerical order assumptions.
3.  Memory efficiency is critical: Each category is stored as a single integer, unlike one-hot encoding which expands data into multiple columns.

Disadvantages : 
1.  Nominal data misinterpretation: Encoded integers can imply false order; one-hot encoding is safer for nominal features.
    Missing values: These must be handled prior to encoding.
2.  Unseen categories in test data: Encoders will fail if new categories appear; handle this with a default value or ensure training includes all possible categories.
3.  High cardinality: Features with many unique categories may still require additional feature engineering.

## 3. Feature Engineering:

Create at least 2 new features that could be beneficial for the model. Explain the rationale behind your choices.

In [15]:
df['hours_per_week_log'] = df['hours_per_week'] * df['education_num']  # Example


In [16]:
df['capital_gain_ratio'] = df['capital_gain'] / (df['capital_gain'] + df['capital_loss'] + 1)


Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice.

In [17]:
# Capital net feature
print(df['capital_gain'].skew())


# Log transformation
import numpy as np
df['capital_gain_log'] = np.log1p(df['capital_gain'])


11.953847687699781


4. Feature Selection:

Use the Isolation Forest algorithm to identify and remove outliers. Discuss how outliers can affect model performance.

In [18]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(contamination=0.01, random_state=42)
outliers = iso.fit_predict(df[numerical_cols])
df['outlier'] = outliers
df = df[df['outlier'] == 1]  # Keep non-outliers


In [19]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country',
       'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White',
       'sex_ Male', 'income_ >50K', 'hours_per_week_log', 'capital_gain_ratio',
       'capital_gain_log', 'outlier'],
      dtype='object')

Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.

In [20]:
pip install ppscore

Note: you may need to restart the kernel to use updated packages.


In [21]:
import ppscore as pps

# Correlation matrix
corr = df.corr()
print(corr['income_ >50K'])  # Relationship with target

# PPS matrix
pps_matrix = pps.matrix(df)
print(pps_matrix[pps_matrix['y']=='income_ >50K'])


age                         0.228907
workclass                   0.048180
fnlwgt                     -0.010217
education                   0.076781
education_num               0.326149
marital_status             -0.198139
occupation                  0.074256
relationship               -0.248573
capital_gain                0.309171
capital_loss                0.147054
hours_per_week              0.224111
native_country              0.016644
race_ Asian-Pac-Islander    0.009379
race_ Black                -0.087336
race_ Other                -0.032277
race_ White                 0.084295
sex_ Male                   0.214057
income_ >50K                1.000000
hours_per_week_log          0.366137
capital_gain_ratio          0.317669
capital_gain_log            0.314397
outlier                          NaN
Name: income_ >50K, dtype: float64
                            x             y  ppscore            case  \
17                        age  income_ >50K      0.0      regression   
39     