In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from termcolor import colored

warnings.filterwarnings('ignore')

In [2]:
# Load the insurance dataset from the working directory; every later section
# starts from a copy of this frame, so `df` itself is never modified.
df = pd.read_csv('insurance.csv')
# Preview the first rows.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Distinct categories present in the 'smoker' column.
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [4]:
# Distinct categories present in the 'region' column.
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [5]:
# Distinct categories present in the 'sex' column.
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [6]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# 1. Label Encoder

In [7]:
# Shared LabelEncoder instance; the label-encoding cells below call
# fit_transform on it once per column, re-fitting it each time.
enc = LabelEncoder()

#### 1.1 Using fundamental approach

In [8]:
# Work on a copy so the raw frame `df` stays untouched for the later sections.
df1 = df.copy()

# LabelEncoder expects a 1-D array of labels, so each column is passed as a
# Series (df1['sex']) rather than as a one-column DataFrame (df1[['sex']]).
# The 2-D form only works because scikit-learn flattens it and emits a
# DataConversionWarning, which the global warnings filter hides.
# Every fit_transform call re-fits the shared encoder on that column.
df1['sex'] = enc.fit_transform(df1['sex'])
df1['smoker'] = enc.fit_transform(df1['smoker'])
df1['region'] = enc.fit_transform(df1['region'])

print(colored('Label Encoded DataFrame Using fundamental approach:\n',color='blue',attrs = ['bold','dark']))
print(colored(df1.head(),color= 'light_yellow'))

[2m[1m[34mLabel Encoded DataFrame Using fundamental approach:
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520[0m


#### 1.2 Label Encoding using 'for loop'

In [9]:
# Same label encoding as the explicit version, driven by a list of columns.
df2 = df.copy()
columns_to_encode = ['sex','smoker','region']

for col in columns_to_encode:
    # Pass the column as a 1-D Series: LabelEncoder is defined for 1-D label
    # arrays, and a one-column DataFrame triggers a DataConversionWarning.
    df2[col] = enc.fit_transform(df2[col])

print(colored(f"Label Encoded DataFrame Using 'for loop':\n",color='blue',attrs = ['bold','dark']))
print(colored(df2.head(),color= 'light_yellow'))

[2m[1m[34mLabel Encoded DataFrame Using 'for loop':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520[0m


#### 1.3 Label Encoding using lambda function

In [10]:
# Label-encode all categorical columns in one DataFrame.apply call.
df3 = df.copy()
columns_to_encode = ['sex', 'smoker', 'region']

# apply hands each selected column (a Series) to the encoder in turn;
# the encoder is re-fitted for every column.
encoded_block = df3[columns_to_encode].apply(lambda column: enc.fit_transform(column))
df3[columns_to_encode] = encoded_block

title = "Label Encoded DataFrame Using 'Lambda function':\n"
print(colored(title, color='blue', attrs=['bold', 'dark']))
print(colored(df3.head(), color='light_yellow'))

[2m[1m[34mLabel Encoded DataFrame Using 'Lambda function':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520[0m


#### 1.4 Label Encoding by Label Mapping 

In [11]:
# Three equivalent ways of applying a hand-written category -> integer mapping.
# Each technique writes into its own copy of the raw frame.
df_mapping = df.copy()
df_replace = df.copy()
df_comprehension = df.copy()

# Per-column lookup tables: category label -> integer code.
label_mapping = {
    'sex': {'female': 1, 'male': 4},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4},
}

columns_to_encode = ['sex', 'smoker', 'region']

for col in columns_to_encode:
    codes = label_mapping[col]

    # 1) Series.map: look every value up in the dict
    df_mapping[col] = df_mapping[col].map(codes)

    # 2) Series.replace: substitute the values that appear as dict keys
    df_replace[col] = df_replace[col].replace(codes)

    # 3) list comprehension: plain dict indexing, value by value
    df_comprehension[col] = [codes[value] for value in df_comprehension[col]]


print(colored("Label Encoded DataFrame using 'Map function':\n", color='blue', attrs=['bold']))
print(colored(df_mapping.head(), color='light_yellow'))
print(colored("\nLabel Encoded DataFrame using 'Replace method':\n", color='blue', attrs=['bold']))
print(colored(df_replace.head(), color='light_yellow'))
print(colored("\nLabel Encoded DataFrame using 'list comprehension':\n", color='blue', attrs=['bold']))
print(colored(df_comprehension.head(), color='light_yellow'))

[1m[34mLabel Encoded DataFrame using 'Map function':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    1  27.900         0       0       1  16884.92400
1   18    4  33.770         1       1       2   1725.55230
2   28    4  33.000         3       1       2   4449.46200
3   33    4  22.705         0       1       3  21984.47061
4   32    4  28.880         0       1       3   3866.85520[0m
[1m[34m
Label Encoded DataFrame using 'Replace method':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    1  27.900         0       0       1  16884.92400
1   18    4  33.770         1       1       2   1725.55230
2   28    4  33.000         3       1       2   4449.46200
3   33    4  22.705         0       1       3  21984.47061
4   32    4  28.880         0       1       3   3866.85520[0m
[1m[34m
Label Encoded DataFrame using 'list comprehension':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    1  27.900 

# 2. One Hot Encoder

In [12]:
# Shared OneHotEncoder instance with default settings, used by the
# OneHotEncoder cells below (per-column loop and ColumnTransformer).
enc2 = OneHotEncoder()

#### 2.1 Using dummy variables

In [13]:
# One-hot encode the categorical columns with pandas dummy variables.
df4 = df.copy()

columns_to_encode = ['sex','smoker','region']

# Create the dummy variables. The column list is written once and reused, and
# dtype=int pins the indicators to 0/1: without it the dtype depends on the
# pandas version (bool, i.e. True/False, on pandas >= 2.0).
dummy = pd.get_dummies(
    df4[columns_to_encode],
    columns=columns_to_encode,
    prefix=columns_to_encode,
    dtype=int,
)

# Concatenate the dummy variables with the remaining (non-encoded) columns.
encoded_df4 = pd.concat([df4.drop(columns=columns_to_encode), dummy], axis=1)

encoded_df4.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


#### 2.2 Using OneHotEncoder() with 'For loop'

In [14]:
# One-hot encode each categorical column separately with OneHotEncoder.
df5 = df.copy()
columns_to_encode = ['sex','smoker','region']

# Collect the per-column frames in a list and concatenate once at the end,
# instead of growing a DataFrame with pd.concat on every iteration.
encoded_parts = []

for col in columns_to_encode:
    # OneHotEncoder works on 2-D input, hence the double brackets.
    onehot = enc2.fit_transform(df5[[col]])
    feature_names = enc2.get_feature_names_out([col])
    # The encoder returns a sparse matrix by default, so densify it. Reusing
    # df5's index keeps the rows aligned with df5 in the concat below even if
    # the frame does not carry a default 0..n-1 index.
    onehot_df = pd.DataFrame(onehot.toarray(), columns=feature_names, index=df5.index)
    encoded_parts.append(onehot_df)

# DataFrame holding all encoded columns side by side.
df_hotcoded = pd.concat(encoded_parts, axis=1)

encoded_df5 = pd.concat([df5.drop(columns=columns_to_encode), df_hotcoded], axis=1)
encoded_df5.head(4)

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,18,33.77,1,1725.5523,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,28,33.0,3,4449.462,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [15]:
# Join the non-encoded columns of df5 with the one-hot columns in df_hotcoded.
numeric_part = df5.drop(columns=['sex', 'smoker', 'region'])
encoded_df5_new = pd.concat([numeric_part, df_hotcoded], axis=1)
encoded_df5_new.head(4)

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,18,33.77,1,1725.5523,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,28,33.0,3,4449.462,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


#### 2.3 Using OneHotEncoder() with 'Column Transformer'

In [16]:
# One-hot encode the categorical columns in a single step with ColumnTransformer.
# (ColumnTransformer is imported in the import cell at the top of the notebook.)
df6 = df.copy()
columns_to_encode = ['sex','smoker','region']

# Create a ColumnTransformer:
#   - 'hot' applies the one-hot encoder to the categorical columns
#   - remainder='passthrough' keeps the other columns unchanged
#   - sparse_threshold=0 always returns a dense array, so building the
#     DataFrame below does not depend on how sparse the data happens to be
col_transformer = ColumnTransformer(
    [('hot', enc2, columns_to_encode)],
    remainder='passthrough',
    sparse_threshold=0,
)

transformed_data = col_transformer.fit_transform(df6)

# Output names carry a '<step>__' prefix: 'hot__...' and 'remainder__...'.
feature_names = col_transformer.get_feature_names_out()

# Keep df6's index so the encoded rows stay aligned with the source frame.
encoded_df6 = pd.DataFrame(transformed_data, columns=feature_names, index=df6.index)
encoded_df6.head(4)

Unnamed: 0,hot__sex_female,hot__sex_male,hot__smoker_no,hot__smoker_yes,hot__region_northeast,hot__region_northwest,hot__region_southeast,hot__region_southwest,remainder__age,remainder__bmi,remainder__children,remainder__charges
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,19.0,27.9,0.0,16884.924
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,18.0,33.77,1.0,1725.5523
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,28.0,33.0,3.0,4449.462
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,33.0,22.705,0.0,21984.47061


In [17]:
# Rebuild the DataFrame above with plain column names
# (without the 'hot__' / 'remainder__' prefixes).

# Names produced by the fitted one-hot step for the encoded columns
onehot_step = col_transformer.named_transformers_['hot']
feature_names2 = onehot_step.get_feature_names_out(columns_to_encode)

# The passed-through columns keep their original names and follow the encoded ones
passthrough_names = df6.columns.drop(columns_to_encode)
all_column_names = [*feature_names2, *passthrough_names]

encoded_df6_new = pd.DataFrame(transformed_data, columns=all_column_names)
encoded_df6_new.head(4)

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children,charges
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,19.0,27.9,0.0,16884.924
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,18.0,33.77,1.0,1725.5523
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,28.0,33.0,3.0,4449.462
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,33.0,22.705,0.0,21984.47061


# 3. Ordinal Encoder

In [18]:
# Shared OrdinalEncoder instance with default settings; the cell below
# re-fits it on one column at a time.
ordinal = OrdinalEncoder()

In [19]:
# Ordinal-encode the categorical columns one at a time.
df7 = df.copy()
columns_to_encode = ['sex', 'smoker', 'region']

# using 'for loop': the shared encoder is re-fitted on each column in turn
for col in columns_to_encode:
    single_column = df7[[col]]  # the encoder is given a one-column DataFrame
    df7[col] = ordinal.fit_transform(single_column)


heading = "Ordinal Encoded DataFrame Using 'for loop':\n"
print(colored(heading, color='blue', attrs=['bold', 'dark']))
print(colored(df7.head(), color='light_yellow'))

[2m[1m[34mOrdinal Encoded DataFrame Using 'for loop':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19  0.0  27.900         0     1.0     3.0  16884.92400
1   18  1.0  33.770         1     0.0     2.0   1725.55230
2   28  1.0  33.000         3     0.0     2.0   4449.46200
3   33  1.0  22.705         0     0.0     1.0  21984.47061
4   32  1.0  28.880         0     0.0     1.0   3866.85520[0m


## Applying three different encoders to three different columns

In [20]:
# Apply a different encoding technique to each categorical column.
df8 = df.copy()

# Target: sex -> LabelEncoder, smoker -> one-hot (dummy variables), region -> OrdinalEncoder

# Label encoding: LabelEncoder expects 1-D labels, so pass the Series rather
# than a one-column DataFrame (which triggers a DataConversionWarning).
df8['sex'] = LabelEncoder().fit_transform(df8['sex'])

# Ordinal encoding: OrdinalEncoder works on 2-D input, hence the double brackets.
df8['region'] = OrdinalEncoder().fit_transform(df8[['region']])

# One-hot encoding. The prefix is spelled 'smoker' (it was 'somker', which
# produced misspelled column names); dtype=int keeps 0/1 indicators on every
# pandas version.
dummy_var = pd.get_dummies(df8[['smoker']], prefix='smoker', dtype=int)

# Replace the original 'smoker' column with its dummy columns.
encoder_df8 = pd.concat([df8.drop('smoker',axis = 1),dummy_var],axis =1)

print(colored('Hybrid Encoded DataFrame:\n', color = 'blue', attrs = ['bold','dark']))
print(colored(encoder_df8.head(),color = 'light_yellow'))

[2m[1m[34mHibrid Encoded DataFrame:
[0m
[93m   age  sex     bmi  children  region      charges  somker_no  somker_yes
0   19    0  27.900         0     3.0  16884.92400          0           1
1   18    1  33.770         1     2.0   1725.55230          1           0
2   28    1  33.000         3     2.0   4449.46200          1           0
3   33    1  22.705         0     1.0  21984.47061          1           0
4   32    1  28.880         0     1.0   3866.85520          1           0[0m
