# ***ASSIGNMENT - 9***

## Loading the Dataset

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

data = pd.read_csv("adult_with_headers.csv")

print("\nFirst 5 rows of the dataset:")
print(data.head())


First 5 rows of the dataset:
   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0         

## Data Exploration

In [2]:
print("\nDataset Information:")
print(data.info())

print("\nSummary Statistics:")
print(data.describe(include='all'))

# Check missing values
print("\nMissing Values in Each Column:")
print(data.isnull().sum())


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None

Summary Statistics:
                 age workclass        fnlwgt 

# **Task - 1**

## Handling Missing Values

In [3]:
# Replace '?' with NaN (Adult dataset uses '?' for missing values)
data.replace("?", np.nan, inplace=True)

# Identify numerical and categorical columns
numerical_cols = data.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = data.select_dtypes(include=["object"]).columns

# Impute numerical columns with median
for col in numerical_cols:
    data[col].fillna(data[col].median(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

print("\nMissing Values (after handling):")
print(data.isnull().sum())


Missing Values (after handling):
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


## Scaling Techniques

In [4]:
# Standard Scaling:
standard_scaler = StandardScaler()
data_standard_scaled = data.copy()
data_standard_scaled[numerical_cols] = standard_scaler.fit_transform(
    data_standard_scaled[numerical_cols]
)

# Min-Max Scaling:
minmax_scaler = MinMaxScaler()
data_minmax_scaled = data.copy()
data_minmax_scaled[numerical_cols] = minmax_scaler.fit_transform(
    data_minmax_scaled[numerical_cols]
)

print("\nStandard Scaled Numerical Features (First 5 Rows):")
print(data_standard_scaled[numerical_cols].head())
print("\nMin-Max Scaled Numerical Features (First 5 Rows):")
print(data_minmax_scaled[numerical_cols].head())



Standard Scaled Numerical Features (First 5 Rows):
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.030671 -1.063611       1.134739      0.148453      -0.21666   
1  0.837109 -1.008707       1.134739     -0.145920      -0.21666   
2 -0.042642  0.245079      -0.420060     -0.145920      -0.21666   
3  1.057047  0.425801      -1.197459     -0.145920      -0.21666   
4 -0.775768  1.408176       1.134739     -0.145920      -0.21666   

   hours_per_week  
0       -0.035429  
1       -2.222153  
2       -0.035429  
3       -0.035429  
4       -0.035429  

Min-Max Scaled Numerical Features (First 5 Rows):
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.301370  0.044302       0.800000       0.02174           0.0   
1  0.452055  0.048238       0.800000       0.00000           0.0   
2  0.287671  0.138113       0.533333       0.00000           0.0   
3  0.493151  0.151068       0.400000       0.00000           0.0   
4  0.150685  0.221488       

### **Standard Scaling** is preferred when:

- Data follows a normal distribution.
- Algorithms like Logistic Regression, SVM, KNN are used.

### **Mean-Max Scalling** is preferred when:
- Data needs to be bounded (0 to 1).
-  Distance-based algorithms or neural networks are used.

# **Task - 2**

## Encoding Techniques

In [5]:
encoded_data = data.copy()
# One-Hot Encoding for categorical columns with < 5 unique values
low_cardinality_cols = [col for col in categorical_cols if data[col].nunique() < 5]

encoded_data = pd.get_dummies(
    encoded_data,
    columns=low_cardinality_cols,
    drop_first=True
)

# Label Encoding for categorical columns with > 5 unique values
high_cardinality_cols = [col for col in categorical_cols if col not in low_cardinality_cols]

label_encoder = LabelEncoder()

for col in high_cardinality_cols:
    encoded_data[col] = label_encoder.fit_transform(encoded_data[col])

### ***Pros & Cons :***
One-Hot encoding:
 - No ordinal relationship introduced.
 - Increases dimensionality.

 Label Encoding:
 - Memory efficient.
 - Introduces ordinal relationship.

# **Task - 3**

## Feature Engineering

In [6]:
# Feature 1: Capital Gain Indicator
encoded_data["capital_gain_flag"] = encoded_data["capital_gain"].apply(
    lambda x: 1 if x > 0 else 0
)

# Feature 2: Age Group Feature
encoded_data["age_group"] = pd.cut(
    encoded_data["age"],
    bins=[0, 25, 45, 65, 100],
    labels=["Young", "Adult", "Senior", "Elder"]
)

# Encode age_group
encoded_data["age_group"] = label_encoder.fit_transform(encoded_data["age_group"])

### **Rationale:**
- Capital gain flag captures income-related behaviour.
- Age groups capture life-stage income patterns.

## Feature Transformation

In [7]:
# Log Transformation on Capital Gain (skewed feature)
encoded_data["log_capital_gain"] = np.log1p(encoded_data["capital_gain"])

### ***Justification***
- Capital gain is highly right-skewed.
- Log tranformatin reduces skewness and improves model learnig.

## Final Dataset Overview

In [8]:
print("\nFinal Preprocessed Dataset Shape:")
print(encoded_data.shape)

print("\nFinal Dataset Sample:")
print(encoded_data.head())


Final Preprocessed Dataset Shape:
(32561, 18)

Final Dataset Sample:
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           1             1     4          2174             0              40   
1           4             0     4             0             0              13   
2           6             1     4             0             0              40   
3           6             0     2             0             0              40   
4          10             5     2             0             0              40   

   native_countr