##### DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

#### Explore Data

In [3]:
import pandas as pd

In [4]:
##Load Dataset
df = pd.read_csv("adult_with_headers.csv")

In [5]:
#check first few rows datasets
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
#check for shape of the dataset
df.shape

(32561, 15)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [8]:
#check basic statistics
df.describe(include='all')

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


#### Data Cleaning

In [9]:
##check for missing values
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [10]:
#checked for duplicate values
df.duplicated().sum()

np.int64(24)

In [11]:
df = df.drop_duplicates()

In [12]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [13]:
df['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

#### Feature Engineering

In [14]:
df['age'] = pd.to_numeric(df['age'], errors='coerce')

In [15]:
## get all the categorical features
categorical_features = df.select_dtypes('O').columns
categorical_features

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')

In [16]:
## create a loop to get all number of unique values in each categorical variable
for column in categorical_features:
    unique_values = df[column].nunique()
    print(f"Unique values in '{column}':")
    
    print(unique_values)
   

Unique values in 'workclass':
9
Unique values in 'education':
16
Unique values in 'marital_status':
7
Unique values in 'occupation':
15
Unique values in 'relationship':
6
Unique values in 'race':
5
Unique values in 'sex':
2
Unique values in 'native_country':
42
Unique values in 'income':
2


In [17]:
df['sex'].value_counts()

sex
Male      21775
Female    10762
Name: count, dtype: int64

In [18]:
df['income'].value_counts()

income
<=50K    24698
>50K      7839
Name: count, dtype: int64

In [19]:
## Encode categorical features
from sklearn.preprocessing import OneHotEncoder

In [20]:
catleassthan2_features = df[['sex','income']]

In [21]:
lst = []
for column in categorical_features:
    unique_values = df[column].nunique()
    #print(f"Unique values in '{column}':")
    #print(unique_values )
    if unique_values >= 5:
        lst.append(column)


In [22]:
lst

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'native_country']

In [23]:
##let's apply One-Hot Encoding to categorical variables with less than 5 categories
#we have 2 variabls with less than 5 categories
encoder = OneHotEncoder(sparse_output=False)
ohe = encoder.fit_transform(catleassthan2_features)

In [24]:
ohe_df = pd.DataFrame(ohe, columns=encoder.get_feature_names_out())
ohe_df.head()

Unnamed: 0,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,0.0,1.0,1.0,0.0
1,0.0,1.0,1.0,0.0
2,0.0,1.0,1.0,0.0
3,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,0.0


In [25]:
##let's apply label Encoding to categorical variables with more than or equaul to 5 categories
from sklearn.preprocessing import LabelEncoder

In [37]:
encoder = LabelEncoder()
df['workclass'] = encoder.fit_transform(df['workclass'] )
df['education'] = encoder.fit_transform(df['education'] )
df['marital_status'] = encoder.fit_transform(df['marital_status'] )
df['occupation'] = encoder.fit_transform(df['occupation'] )
df['relationship'] = encoder.fit_transform(df['relationship'] )
df['race'] = encoder.fit_transform(df['race'] )
df['native_country'] = encoder.fit_transform(df['native_country'] )

In [42]:
df_encoded=df.join(ohe_df).drop(df[['sex','income']],axis=1)

In [43]:
df_encoded

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,0.0,1.0,1.0,0.0
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,0.0,1.0,1.0,0.0
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,0.0,1.0,1.0,0.0
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,0.0,1.0,1.0,0.0
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,0,38,39,,,,
32557,40,4,154374,11,9,2,7,0,4,0,0,40,39,,,,
32558,58,4,151910,11,9,6,1,4,4,0,0,40,39,,,,
32559,22,4,201490,11,9,4,1,3,4,0,0,20,39,,,,


In [44]:
# ##Apply scaling techniques to numerical features:
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [47]:
scaler = StandardScaler()
scaled_df=scaler.fit_transform(df_encoded)
scaled_df

array([[ 0.03038995,  2.14997135, -1.06356884, ...,  0.70301655,
         0.56331973, -0.56331973],
       [ 0.8369732 ,  1.46335725, -1.00866848, ...,  0.70301655,
         0.56331973, -0.56331973],
       [-0.0429358 ,  0.09012905,  0.24503992, ...,  0.70301655,
         0.56331973, -0.56331973],
       ...,
       [ 1.42357919,  0.09012905, -0.35877886, ...,         nan,
                nan,         nan],
       [-1.2161478 ,  0.09012905,  0.11092955, ...,         nan,
                nan,         nan],
       [ 0.9836247 ,  0.77674315,  0.92981187, ...,         nan,
                nan,         nan]])

In [46]:
m_scaler = MinMaxScaler()
m_scaler.fit_transform(df_encoded)

array([[0.30136986, 0.875     , 0.0443019 , ..., 1.        , 1.        ,
        0.        ],
       [0.45205479, 0.75      , 0.0482376 , ..., 1.        , 1.        ,
        0.        ],
       [0.28767123, 0.5       , 0.13811345, ..., 1.        , 1.        ,
        0.        ],
       ...,
       [0.56164384, 0.5       , 0.09482688, ...,        nan,        nan,
               nan],
       [0.06849315, 0.5       , 0.12849934, ...,        nan,        nan,
               nan],
       [0.47945205, 0.625     , 0.18720338, ...,        nan,        nan,
               nan]])

### Feature Selection

In [48]:
## Let's apply Isolation Forest algorithm to identify and remove outliers. 
from sklearn.ensemble import IsolationForest

In [49]:
iso_forest = IsolationForest(random_state=12)

In [50]:
iso_forest.fit(df_encoded)

In [53]:
#predict the outliers
outliers = iso_forest.predict(df_encoded)

In [54]:
df_encoded['is_outlier'] = outliers

In [55]:
# Remove the outliers
df_encoded_no_outliers = df_encoded[df_encoded['is_outlier'] == 1]

In [59]:
df_encoded_no_outliers

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,is_outlier
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,0.0,1.0,1.0,0.0,1
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,0.0,1.0,1.0,0.0,1
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,0.0,1.0,1.0,0.0,1
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,0.0,1.0,1.0,0.0,1
5,37,4,284582,12,14,2,4,5,4,0,0,40,39,1.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,0,38,39,,,,,1
32557,40,4,154374,11,9,2,7,0,4,0,0,40,39,,,,,1
32558,58,4,151910,11,9,6,1,4,4,0,0,40,39,,,,,1
32559,22,4,201490,11,9,4,1,3,4,0,0,20,39,,,,,1
