# DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder,LabelEncoder
from sklearn.ensemble import IsolationForest

# 1. Data exploration and data processing

In [3]:
# load the dataset
df=pd.read_csv('adult_with_headers.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
df.describe(include='all')

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


In [7]:
# check the missing values
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

# Scaling numerical features

In [9]:
numerical_features= df.select_dtypes(include=['int64','float64']).columns
numerical_features

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')

## Standard Scaler

In [11]:
standard_sca=StandardScaler()

In [12]:
df_standard_scaled=df.copy()

In [19]:
# apply standard scaler to numerical features

In [24]:
df_standard_scaled[numerical_features]=standard_sca.fit_transform(df[numerical_features])

In [26]:
df_standard_scaled[numerical_features]

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...
32556,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [28]:
df_standard_scaled.head() # after using standard scaling

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.030671,State-gov,-1.063611,Bachelors,1.134739,Never-married,Adm-clerical,Not-in-family,White,Male,0.148453,-0.21666,-0.035429,United-States,<=50K
1,0.837109,Self-emp-not-inc,-1.008707,Bachelors,1.134739,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.14592,-0.21666,-2.222153,United-States,<=50K
2,-0.042642,Private,0.245079,HS-grad,-0.42006,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
3,1.057047,Private,0.425801,11th,-1.197459,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
4,-0.775768,Private,1.408176,Bachelors,1.134739,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.14592,-0.21666,-0.035429,Cuba,<=50K


## Min-Max Scaler

In [31]:
minmax_scaler=MinMaxScaler()

In [33]:
df_minmax_scaled=df.copy()

In [35]:
df_minmax_scaled[numerical_features]=minmax_scaler.fit_transform(df[numerical_features])

In [37]:
df_minmax_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,State-gov,0.044302,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.122449,United-States,<=50K
2,0.287671,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,0.397959,United-States,<=50K
3,0.493151,Private,0.151068,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,0.397959,United-States,<=50K
4,0.150685,Private,0.221488,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,0.397959,Cuba,<=50K


In [39]:
#standard scaler and min-max scaler values
print(df_minmax_scaled.head())
print(df_standard_scaled.head())

        age          workclass    fnlwgt   education  education_num  \
0  0.301370          State-gov  0.044302   Bachelors       0.800000   
1  0.452055   Self-emp-not-inc  0.048238   Bachelors       0.800000   
2  0.287671            Private  0.138113     HS-grad       0.533333   
3  0.493151            Private  0.151068        11th       0.400000   
4  0.150685            Private  0.221488   Bachelors       0.800000   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0       0.02174           

# 2.Encoding Techniques:

# One-Hot Encoding to Categorical Columns

In [42]:
# Apply One-Hot Encoding to categorical variables with less than 5 categories.

In [44]:
#lets apply one hot encoding to 2 categorical columns
one_hot_colums=['sex','income']

In [46]:
ohe=OneHotEncoder()

In [48]:
print(ohe)

OneHotEncoder()


In [50]:
ohe.fit_transform(df[['sex','income']]).toarray()

array([[0., 1., 1., 0.],
       [0., 1., 1., 0.],
       [0., 1., 1., 0.],
       ...,
       [1., 0., 1., 0.],
       [0., 1., 1., 0.],
       [1., 0., 0., 1.]])

In [52]:
df_ohe=ohe.fit_transform(df[['sex','income']]).toarray()

In [54]:
ohe.categories_

[array([' Female', ' Male'], dtype=object),
 array([' <=50K', ' >50K'], dtype=object)]

In [56]:
ohe.get_feature_names_out(one_hot_colums)

array(['sex_ Female', 'sex_ Male', 'income_ <=50K', 'income_ >50K'],
      dtype=object)

In [58]:
# lets put one hot encoded values into dataframe df1.
df1=pd.DataFrame(df_ohe,columns=ohe.get_feature_names_out(one_hot_colums))
df1

Unnamed: 0,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,0.0,1.0,1.0,0.0
1,0.0,1.0,1.0,0.0
2,0.0,1.0,1.0,0.0
3,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,0.0
...,...,...,...,...
32556,1.0,0.0,1.0,0.0
32557,0.0,1.0,0.0,1.0
32558,1.0,0.0,1.0,0.0
32559,0.0,1.0,1.0,0.0


In [60]:
#lets join df and df1 together and make it into new dataframe

In [62]:
df_new=df.join(df1)

In [64]:
df_new.drop(columns=one_hot_colums)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,2174,0,40,United-States,0.0,1.0,1.0,0.0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,13,United-States,0.0,1.0,1.0,0.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,40,United-States,0.0,1.0,1.0,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,40,United-States,0.0,1.0,1.0,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,40,Cuba,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,0,0,38,United-States,1.0,0.0,1.0,0.0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,0,0,40,United-States,0.0,1.0,0.0,1.0
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,0,0,40,United-States,1.0,0.0,1.0,0.0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,0,0,20,United-States,0.0,1.0,1.0,0.0


# Label Encoding to more than 5 categorical columns

In [67]:
#Use Label Encoding for categorical variables with more than 5 categories.

In [73]:
categorical_cols=df.select_dtypes(include=['object']).columns
categorical_cols

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')

In [100]:
columns_to_label_encode = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']

In [75]:
label_encoder=LabelEncoder()

In [102]:
# apply label encoding to each selected column

for col in columns_to_label_encode:
    df[col]=label_encoder.fit_transform(df[col])
print(df.head())

   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  sex  capital_gain  capital_loss  \
0           1             1     4    1          2174             0   
1           4             0     4    1             0             0   
2           6             1     4    1             0             0   
3           6             0     2    1             0             0   
4          10             5     2    0             0             0   

   hours_per_week  native_country  income  
0              40              39       0  
1              13              39       0  
2              40   

In [104]:
df  # after applyin label encoding

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,0,0,38,39,0
32557,40,4,154374,11,9,2,7,0,4,1,0,0,40,39,1
32558,58,4,151910,11,9,6,1,4,4,0,0,0,40,39,0
32559,22,4,201490,11,9,4,1,3,4,1,0,0,20,39,0


## Pros and Cons:

In [None]:

# One-Hot Encoding:
# Pros: Preserves all categories without imposing any order. Useful for algorithms that work better with non-ordinal data.
# Cons: Can lead to high-dimensional data when there are many categories.


In [107]:
# Label Encoding:
# Pros: Simple, works well with ordinal data. Suitable when categories have a meaningful order.
# Cons: May confuse models into thinking that one category is "greater" or "lesser" than another due to assigned numerical values.

# 3.Feature Engineering

In [110]:
# Create at least 2 new features that could be beneficial for the model.

In [None]:
# We can create new features based on existing data that could potentially improve model performance.

In [116]:
# Feature 1: Work Experience Category based on age
df['work_experience'] = df['age'].apply(lambda age: 'early_career' if age < 30 else 'mid_career' if age <= 50 else 'late_career')

In [126]:
# Feature 2: Family Size based on relationship status
df['family_size'] = df['relationship'].apply(lambda x: 'large_family' if x in ['Husband', 'Wife'] else 'small_family')


In [128]:
# Display the new features
print(df[['age', 'work_experience', 'relationship', 'family_size']].head())


   age work_experience  relationship   family_size
0   39      mid_career             1  small_family
1   50      mid_career             0  small_family
2   38      mid_career             1  small_family
3   53     late_career             0  small_family
4   28    early_career             5  small_family


In [122]:
# 1. Work Experience Category (based on age):
# Group people into three categories based on age.


In [130]:
# 2. Family Size (based on relationship status):
# Categorize family size as either "large" or "small" based on relationship status.

In [138]:
# lets make another 2 features

In [140]:
# feature 3: Net capital (capital_gain - capital_loss)
df['net_capital'] = df['capital_gain'] - df['capital_loss']

In [142]:
# Feature 4: Interaction between age and hours_per_week
df['age_hours_interaction'] = df['age'] * df['hours_per_week']

In [144]:
print(df[['net_capital', 'age_hours_interaction']].head())

   net_capital  age_hours_interaction
0         2174                   1560
1            0                    650
2            0                   1520
3            0                   2120
4            0                   1120


In [146]:
#Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice.

In [148]:
# Apply log transformation to 'capital_gain' to reduce skewness
df['capital_gain_log'] = np.log1p(df['capital_gain'])  # log1p is used to handle zeros (log(1+x))

In [150]:
# Display the original and transformed feature
print(df[['capital_gain', 'capital_gain_log']].head())

   capital_gain  capital_gain_log
0          2174          7.684784
1             0          0.000000
2             0          0.000000
3             0          0.000000
4             0          0.000000


In [156]:
# lets take 'work experience' feature for log transformation

In [158]:
# Step 1: Map 'work_experience' to numerical values
experience_mapping = {'early_career': 1, 'mid_career': 2, 'late_career': 3}
df['work_experience_numeric'] = df['work_experience'].map(experience_mapping)

In [162]:
# Step 2: Apply log transformation to the mapped 'work_experience' values
df['work_experience_log'] = np.log1p(df['work_experience_numeric'])

In [164]:
# Display the original and transformed 'work_experience' columns
print(df[['work_experience', 'work_experience_numeric', 'work_experience_log']].head())

  work_experience  work_experience_numeric  work_experience_log
0      mid_career                        2             1.098612
1      mid_career                        2             1.098612
2      mid_career                        2             1.098612
3     late_career                        3             1.386294
4    early_career                        1             0.693147


# 4. Feature Selection:

## Isolation Forest - to detect outliers

In [175]:
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
numerical_columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'net_capital',
       'age_hours_interaction', 'capital_gain_log', 'work_experience_numeric',
       'work_experience_log'],
      dtype='object')

In [177]:
iso_forest= IsolationForest(contamination=0.05,random_state=50) # setting contamination to 5% for exmaple

In [181]:
outliers= iso_forest.fit_predict(df[numerical_columns])
outliers

array([ 1,  1,  1, ...,  1,  1, -1])

In [193]:
# remove outliers
df_cleaned=df[outliers != -1] #The model predicts outliers (-1 for outliers and 1 for inliers).
df_cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,...,native_country,income,work_experience,family_size,net_capital,age_hours_interaction,capital_gain_log,work_experience_numeric,work_experience_log,anomaly
0,39,7,77516,9,13,4,1,1,4,1,...,39,0,mid_career,small_family,2174,1560,7.684784,2,1.098612,1
1,50,6,83311,9,13,2,4,0,4,1,...,39,0,mid_career,small_family,0,650,0.000000,2,1.098612,1
2,38,4,215646,11,9,0,6,1,4,1,...,39,0,mid_career,small_family,0,1520,0.000000,2,1.098612,1
3,53,4,234721,1,7,2,6,0,2,1,...,39,0,late_career,small_family,0,2120,0.000000,3,1.386294,1
4,28,4,338409,9,13,2,10,5,2,0,...,5,0,early_career,small_family,0,1120,0.000000,1,0.693147,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,22,4,310152,15,10,4,11,1,4,1,...,39,0,early_career,small_family,0,880,0.000000,1,0.693147,1
32556,27,4,257302,7,12,2,13,5,4,0,...,39,0,early_career,small_family,0,1026,0.000000,1,0.693147,1
32557,40,4,154374,11,9,2,7,0,4,1,...,39,1,mid_career,small_family,0,1600,0.000000,2,1.098612,1
32558,58,4,151910,11,9,6,1,4,4,0,...,39,0,late_career,small_family,0,2320,0.000000,3,1.386294,1


In [187]:
df_cleaned.shape   #size of dataset after removing outliers

(30933, 22)

In [189]:
df.shape   #original dataset size

(32561, 23)

In [196]:
# Display the number of rows before and after removing outliers

print(f"Original dataset size: {df.shape[0]}")
print(f"Cleaned dataset size: {df_cleaned.shape[0]}")

Original dataset size: 32561
Cleaned dataset size: 30933


In [None]:
#Apply the PPS (Predictive Power Score) to find and discuss the relationships between features.

In [218]:
# !pip install ppscore  --- installed pps and commented out 

Collecting ppscoreNote: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  Building wheel for pandas (pyproject.toml) did not run successfully.
  exit code: 1
  
  [2514 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build\lib.win-amd64-cpython-312\pandas
  copying pandas\conftest.py -> build\lib.win-amd64-cpython-312\pandas
  copying pandas\testing.py -> build\lib.win-amd64-cpython-312\pandas
  copying pandas\_typing.py -> build\lib.win-amd64-cpython-312\pandas
  copying pandas\_version.py -> build\lib.win-amd64-cpython-312\pandas
  copying pandas\__init__.py -> build\lib.win-amd64-cpython-312\pandas
  creating build\lib.win-amd64-cpython-312\pandas\api
  copying pandas\api\__init__.py -> build\lib.win-amd64-cpython-312\pandas\api
  creating build\lib.win-amd64-cpython-312\pandas\arrays
  copying pandas\arrays\__init__.py -> build\lib.win-amd64-cpython-312\pandas\arrays
  creating build\lib.win-amd64-cpython-312\pandas\compat
  copying pandas\compat\chainmap.py -> build\lib.win-


  Using cached ppscore-1.3.0-py2.py3-none-any.whl
Collecting pandas
  Using cached pandas-1.5.3.tar.gz (5.2 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: still running...
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: pandas
  Building wheel for pandas (pyproject.toml): started
  Building wheel for pandas (pyproject.toml): finished with status 'error'
Failed to build pandas
