In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder
from sklearn.ensemble import IsolationForest


In [5]:
# Load the dataset
data=pd.read_csv('adult_with_headers.csv')

In [7]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## 1.Data Exploration and Preprocessing

In [11]:
# Check for invalid placeholders and replace '?' with NaN

In [13]:
data_cleaned=data.replace('?',np.nan)

In [15]:
data_cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [17]:
# Handle Missing values by removing rows with missing data
data_cleaned.dropna(inplace=True)

In [19]:
data_cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [21]:
# Display summery Statistics
def summarize_data(df):
    print('\n Summary Statistics:')
    print(df.describe(include='all'))
    print('\n Missing Values:')
    print(df.isnull().sum())

summarize_data(data_cleaned)


 Summary Statistics:
                 age workclass        fnlwgt education  education_num  \
count   32561.000000     32561  3.256100e+04     32561   32561.000000   
unique           NaN         9           NaN        16            NaN   
top              NaN   Private           NaN   HS-grad            NaN   
freq             NaN     22696           NaN     10501            NaN   
mean       38.581647       NaN  1.897784e+05       NaN      10.080679   
std        13.640433       NaN  1.055500e+05       NaN       2.572720   
min        17.000000       NaN  1.228500e+04       NaN       1.000000   
25%        28.000000       NaN  1.178270e+05       NaN       9.000000   
50%        37.000000       NaN  1.783560e+05       NaN      10.000000   
75%        48.000000       NaN  2.370510e+05       NaN      12.000000   
max        90.000000       NaN  1.484705e+06       NaN      16.000000   

             marital_status       occupation relationship    race    sex  \
count                 325

In [33]:
# Scaling numerical features
numerical_cols=['age','fnlwgt','education_num','hours_per_week']
scaler_standard=StandardScaler()
scaler_minmax=MinMaxScaler()


In [35]:
# Perform scaling before modifying numerical columns
data_cleaned_standard_scaled = pd.DataFrame(
    scaler_standard.fit_transform(data_cleaned[numerical_cols]),
    columns=[f"{col}_standard" for col in numerical_cols]
)

data_cleaned_minmax_scaled = pd.DataFrame(
    scaler_minmax.fit_transform(data_cleaned[numerical_cols]),
    columns=[f"{col}_minmax" for col in numerical_cols]
)


In [37]:
# Combine Scaled data back with the original dataset
data_cleaned=pd.concat([data_cleaned,data_cleaned_standard_scaled,data_cleaned_minmax_scaled],axis=1)

In [39]:
data_cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,...,native_country,income,age_standard,fnlwgt_standard,education_num_standard,hours_per_week_standard,age_minmax,fnlwgt_minmax,education_num_minmax,hours_per_week_minmax
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,United-States,<=50K,0.030671,-1.063611,1.134739,-0.035429,0.301370,0.044302,0.800000,0.397959
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,United-States,<=50K,0.837109,-1.008707,1.134739,-2.222153,0.452055,0.048238,0.800000,0.122449
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,United-States,<=50K,-0.042642,0.245079,-0.420060,-0.035429,0.287671,0.138113,0.533333,0.397959
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,United-States,<=50K,1.057047,0.425801,-1.197459,-0.035429,0.493151,0.151068,0.400000,0.397959
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,Cuba,<=50K,-0.775768,1.408176,1.134739,-0.035429,0.150685,0.221488,0.800000,0.397959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,...,United-States,<=50K,-0.849080,0.639741,0.746039,-0.197409,0.136986,0.166404,0.733333,0.377551
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,...,United-States,>50K,0.103983,-0.335433,-0.420060,-0.035429,0.315068,0.096500,0.533333,0.397959
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,...,United-States,<=50K,1.423610,-0.358777,-0.420060,-0.035429,0.561644,0.094827,0.533333,0.397959
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,...,United-States,<=50K,-1.215643,0.110960,-0.420060,-1.655225,0.068493,0.128499,0.533333,0.193878


In [None]:
## Discuss the scenarios where each scaling technique is preferred and why.
#Scaling Techniques: Scenarios and Preferences
#Standard Scaling:
#Scenario: Data follows a normal distribution or is assumed to be approximately Gaussian.
#Why: Standard scaling centers data around a mean of 0 with a standard deviation of 1, which is ideal for algorithms like Support Vector Machines (SVM) and Principal Component Analysis (PCA) that are sensitive to feature magnitudes.

#Min-Max Scaling:
#Scenario: Data needs to be normalized to a fixed range (e.g., [0, 1]) or when working with algorithms that don't assume a normal distribution (e.g., Neural Networks, KNN).
#Why: Min-Max Scaling preserves the data's original distribution while transforming values into a uniform range, which helps avoid dominance of large magnitude features. It’s also useful when feature scaling is required for bounded inputs like image pixel intensities.

## 2.Encoding Techniques

In [42]:
# Identify Categorical Columns
categorical_cols=data_cleaned.select_dtypes(include=['object']).columns

In [44]:
categorical_cols

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')

In [46]:
# Apply One_hot encoding for variables with <5 categories
one_hot_cols=[col for col in categorical_cols if data_cleaned[col].nunique()<5]
data_cleaned=pd.get_dummies(data_cleaned,columns=one_hot_cols,drop_first=True)

In [52]:
# Apply Label Encoding for variables with>=5 categories
label_encoder = LabelEncoder()
label_columns = [col for col in categorical_cols if col not in one_hot_cols]
for col in label_columns:
    data_cleaned[col] = label_encoder.fit_transform(data_cleaned[col])

In [54]:
data_cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,...,age_standard,fnlwgt_standard,education_num_standard,hours_per_week_standard,age_minmax,fnlwgt_minmax,education_num_minmax,hours_per_week_minmax,sex_ Male,income_ >50K
0,39,7,77516,9,13,4,1,1,4,2174,...,0.030671,-1.063611,1.134739,-0.035429,0.301370,0.044302,0.800000,0.397959,True,False
1,50,6,83311,9,13,2,4,0,4,0,...,0.837109,-1.008707,1.134739,-2.222153,0.452055,0.048238,0.800000,0.122449,True,False
2,38,4,215646,11,9,0,6,1,4,0,...,-0.042642,0.245079,-0.420060,-0.035429,0.287671,0.138113,0.533333,0.397959,True,False
3,53,4,234721,1,7,2,6,0,2,0,...,1.057047,0.425801,-1.197459,-0.035429,0.493151,0.151068,0.400000,0.397959,True,False
4,28,4,338409,9,13,2,10,5,2,0,...,-0.775768,1.408176,1.134739,-0.035429,0.150685,0.221488,0.800000,0.397959,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,...,-0.849080,0.639741,0.746039,-0.197409,0.136986,0.166404,0.733333,0.377551,False,False
32557,40,4,154374,11,9,2,7,0,4,0,...,0.103983,-0.335433,-0.420060,-0.035429,0.315068,0.096500,0.533333,0.397959,True,True
32558,58,4,151910,11,9,6,1,4,4,0,...,1.423610,-0.358777,-0.420060,-0.035429,0.561644,0.094827,0.533333,0.397959,False,False
32559,22,4,201490,11,9,4,1,3,4,0,...,-1.215643,0.110960,-0.420060,-1.655225,0.068493,0.128499,0.533333,0.193878,True,False


In [None]:
##•	Discuss the pros and cons of One-Hot Encoding and Label Encoding.
# One-Hot Encoding:
#Pros:
#No Ordinal Relationship Assumed: Treats each category as a separate entity, making it useful when no inherent order exists.
#Prevents Model Bias: Each category gets equal weight in the model, reducing bias.
#Versatility: Works well with many machine learning models like decision trees, linear models, and neural networks.
#Cons:
#High Dimensionality: For categories with many levels, it leads to a large number of features, which can increase computational cost and risk overfitting.
#Sparsity: Many zeros in the feature matrix, which can make storage inefficient.

##Label Encoding:
#Pros:
#Efficient Storage: Converts categories into a smaller range of integers, leading to lower dimensionality.
#Simple and Fast: Easier to implement and computationally efficient.
#Cons:
#Imposes Ordinality: May mislead the model into thinking that there's a rank or order to the categories, which is not always correct.
#Not Suitable for Non-Ordinal Categories: If the categories are nominal (no order), label encoding may cause inaccurate relationships in models.


## 3.Feature Engineering

In [57]:
# Create two new features
data_cleaned['capital_diff'] = data_cleaned['capital_gain'] - data_cleaned['capital_loss']
data_cleaned['hours_per_week_bins'] = pd.cut(data_cleaned['hours_per_week'], bins=[0, 20, 40, 60, 100], labels=['Low', 'Medium', 'High', 'Very High'])

In [63]:
# Apply a log transformation to skewed features
data_cleaned['capital_gain_log'] = np.log1p(data_cleaned['capital_gain'])


## 4.Feature Seletion

In [66]:
# Isolation Forest for outlier detection
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outlier_preds = iso_forest.fit_predict(data_cleaned[numerical_cols])


In [68]:
outlier_preds

array([1, 1, 1, ..., 1, 1, 1])

In [70]:
# Remove Outliers
data_cleaned=data_cleaned[outlier_preds==1]

In [72]:
data_cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,...,hours_per_week_standard,age_minmax,fnlwgt_minmax,education_num_minmax,hours_per_week_minmax,sex_ Male,income_ >50K,capital_diff,hours_per_week_bins,capital_gain_log
0,39,7,77516,9,13,4,1,1,4,2174,...,-0.035429,0.301370,0.044302,0.800000,0.397959,True,False,2174,Medium,7.684784
1,50,6,83311,9,13,2,4,0,4,0,...,-2.222153,0.452055,0.048238,0.800000,0.122449,True,False,0,Low,0.000000
2,38,4,215646,11,9,0,6,1,4,0,...,-0.035429,0.287671,0.138113,0.533333,0.397959,True,False,0,Medium,0.000000
3,53,4,234721,1,7,2,6,0,2,0,...,-0.035429,0.493151,0.151068,0.400000,0.397959,True,False,0,Medium,0.000000
4,28,4,338409,9,13,2,10,5,2,0,...,-0.035429,0.150685,0.221488,0.800000,0.397959,False,False,0,Medium,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,...,-0.197409,0.136986,0.166404,0.733333,0.377551,False,False,0,Medium,0.000000
32557,40,4,154374,11,9,2,7,0,4,0,...,-0.035429,0.315068,0.096500,0.533333,0.397959,True,True,0,Medium,0.000000
32558,58,4,151910,11,9,6,1,4,4,0,...,-0.035429,0.561644,0.094827,0.533333,0.397959,False,False,0,Medium,0.000000
32559,22,4,201490,11,9,4,1,3,4,0,...,-1.655225,0.068493,0.128499,0.533333,0.193878,True,False,0,Low,0.000000


In [80]:
# Correlation matrix analysis
# Ensure only numeric columns are used for correlation
numeric_cols = data_cleaned.select_dtypes(include=[np.number]).columns
correlation_matrix = data_cleaned[numeric_cols].corr()
correlation_matrix_filtered = correlation_matrix[(correlation_matrix > 0.2) | (correlation_matrix < -0.2)]


In [82]:
# Output results
print("\nFiltered Correlation Matrix:")
print(correlation_matrix_filtered)


Filtered Correlation Matrix:
                              age  workclass  fnlwgt  education  \
age                      1.000000        NaN     NaN        NaN   
workclass                     NaN   1.000000     NaN        NaN   
fnlwgt                        NaN        NaN     1.0        NaN   
education                     NaN        NaN     NaN   1.000000   
education_num                 NaN        NaN     NaN   0.349249   
marital_status          -0.282289        NaN     NaN        NaN   
occupation                    NaN   0.247565     NaN        NaN   
relationship            -0.264710        NaN     NaN        NaN   
race                          NaN        NaN     NaN        NaN   
capital_gain                  NaN        NaN     NaN        NaN   
capital_loss                  NaN        NaN     NaN        NaN   
hours_per_week                NaN        NaN     NaN        NaN   
native_country                NaN        NaN     NaN        NaN   
age_standard             1.00000