In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns




In [2]:
penguins = sns.load_dataset('penguins')
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [3]:
penguins.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [4]:
num_cols = penguins.select_dtypes('number').columns
print("Numberical Columns:", num_cols.tolist())

Numberical Columns: ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']


In [5]:
cat_cols = penguins.select_dtypes('object').columns
print("Categorical Columns:", cat_cols.tolist())

Categorical Columns: ['species', 'island', 'sex']


In [6]:
num_imp = SimpleImputer(strategy='mean') #default strategy is 'mean'
cat_imp = SimpleImputer(strategy='most_frequent')

In [7]:
penguins[num_cols] = num_imp.fit_transform(penguins[num_cols])
penguins.isnull().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         0
flipper_length_mm     0
body_mass_g           0
sex                  11
dtype: int64

In [8]:
penguins[['sex']] = cat_imp.fit_transform(penguins[['sex']])
penguins.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [9]:
sex_enc = LabelEncoder()
penguins['sex'] = sex_enc.fit_transform(penguins['sex'])


In [10]:
print(penguins['sex'])

0      1
1      0
2      0
3      1
4      0
      ..
339    1
340    0
341    1
342    0
343    1
Name: sex, Length: 344, dtype: int64


Dropping the first column hels avoid the dummy variable trap.
When we perform One-Hot Encoding, a new column is created for every category. If you encode n categories, you will get n columns. But one of those  columns can always be predicted from the combination of the other columns. This is known as the dummy variable trap, and it creates multicollinearity (which is a problem for linear models)

Multicollinearity is a  is a statistical phenomenon that occurs when :
One or more independent variables (features) are strongly correlated with other independent variables.

In [11]:
cat_enc = OneHotEncoder(drop='first')
dummy_cols = cat_enc.fit_transform(penguins[['species', 'island']]).toarray()
dummy_df = pd.DataFrame(dummy_cols)

In [12]:
dummy_df

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0
...,...,...,...,...
339,0.0,1.0,0.0,0.0
340,0.0,1.0,0.0,0.0
341,0.0,1.0,0.0,0.0
342,0.0,1.0,0.0,0.0


In [13]:
penguins['species'].value_counts()

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

In [14]:
penguins['island'].value_counts()

island
Biscoe       168
Dream        124
Torgersen     52
Name: count, dtype: int64

In [15]:
cat_enc = OneHotEncoder(drop='first')
dummy_cols = cat_enc.fit_transform(penguins[cat_cols]).toarray()
dummy_df = pd.DataFrame(dummy_cols)

In [16]:
dummy_df

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,1.0
4,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
339,0.0,1.0,0.0,0.0,1.0
340,0.0,1.0,0.0,0.0,0.0
341,0.0,1.0,0.0,0.0,1.0
342,0.0,1.0,0.0,0.0,0.0


In [17]:
clean_df = pd.concat([penguins, dummy_df], axis=1).drop(columns=cat_cols)
clean_df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,0,1,2,3,4
0,39.10000,18.70000,181.000000,3750.000000,0.0,0.0,0.0,1.0,1.0
1,39.50000,17.40000,186.000000,3800.000000,0.0,0.0,0.0,1.0,0.0
2,40.30000,18.00000,195.000000,3250.000000,0.0,0.0,0.0,1.0,0.0
3,43.92193,17.15117,200.915205,4201.754386,0.0,0.0,0.0,1.0,1.0
4,36.70000,19.30000,193.000000,3450.000000,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
339,43.92193,17.15117,200.915205,4201.754386,0.0,1.0,0.0,0.0,1.0
340,46.80000,14.30000,215.000000,4850.000000,0.0,1.0,0.0,0.0,0.0
341,50.40000,15.70000,222.000000,5750.000000,0.0,1.0,0.0,0.0,1.0
342,45.20000,14.80000,212.000000,5200.000000,0.0,1.0,0.0,0.0,0.0


Scaling Numeric Values


In [18]:
scaler = StandardScaler()
clean_df[num_cols] = scaler.fit_transform(clean_df[num_cols])
clean_df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,0,1,2,3,4
0,-8.870812e-01,7.877425e-01,-1.422488,-0.565789,0.0,0.0,0.0,1.0,1.0
1,-8.134940e-01,1.265563e-01,-1.065352,-0.503168,0.0,0.0,0.0,1.0,0.0
2,-6.663195e-01,4.317192e-01,-0.422507,-1.192003,0.0,0.0,0.0,1.0,0.0
3,-1.307172e-15,1.806927e-15,0.000000,0.000000,0.0,0.0,0.0,1.0,1.0
4,-1.328605e+00,1.092905e+00,-0.565361,-0.941517,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
339,-1.307172e-15,1.806927e-15,0.000000,0.000000,0.0,1.0,0.0,0.0,1.0
340,5.294731e-01,-1.450118e+00,1.006038,0.811880,0.0,1.0,0.0,0.0,0.0
341,1.191758e+00,-7.380718e-01,1.506028,1.939064,0.0,1.0,0.0,0.0,1.0
342,2.351241e-01,-1.195816e+00,0.791756,1.250229,0.0,1.0,0.0,0.0,0.0


In [19]:
#PIPELINE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [20]:
df = sns.load_dataset('penguins')
num_cols = df.select_dtypes('number').columns
cat_cols = df.select_dtypes('object').columns

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),   
    ('onehot', OneHotEncoder(drop='first')) 
])

In [21]:
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

In [22]:
preprocessor.fit_transform(df)

array([[-0.88708123,  0.78774251, -1.42248782, ...,  0.        ,
         1.        ,  1.        ],
       [-0.81349399,  0.12655633, -1.06535169, ...,  0.        ,
         1.        ,  0.        ],
       [-0.66631952,  0.43171918, -0.42250666, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.1917582 , -0.73807176,  1.50602843, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.23512413, -1.19581604,  0.79175618, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.09977416, -0.53462985,  0.8631834 , ...,  0.        ,
         0.        ,  1.        ]], shape=(344, 9))