In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from termcolor import colored

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the insurance dataset; every later section encodes a fresh copy of this frame.
df = pd.read_csv('insurance.csv')
# Preview the first rows.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
# Distinct values found in the 'smoker' column.
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [9]:
# Distinct values found in the 'sex' column.
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [11]:
# Distinct values found in the 'region' column.
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [15]:
# Number of missing values in each column.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Assignment
**1. Label Encoder**
**2. Ordinal Encoder**
**3. One Hot Encoder**

# 1. Label Encoder

In [31]:
# Shared LabelEncoder instance used by the cells in this section.
# Each later fit_transform call refits it on the column passed in.
le = LabelEncoder()

**1.1 Fundamental approach**

In [39]:
# Encode a copy so the raw frame `df` stays untouched.
df1 = df.copy()

# LabelEncoder takes a 1-D array of labels, so each column is passed as a
# Series (df1['sex']) rather than a one-column DataFrame (df1[['sex']]).
# The 2-D form only works through an implicit flatten that raises a
# DataConversionWarning, which the global warnings filter would hide.
# Bracket assignment is used instead of attribute assignment (df1.sex = ...),
# which only works when the column already exists.
df1['sex'] = le.fit_transform(df1['sex'])
df1['region'] = le.fit_transform(df1['region'])
df1['smoker'] = le.fit_transform(df1['smoker'])

print(colored('LE using Fundamental Approach:\n', color='blue', attrs=['bold', 'dark']))
print(colored(df1.head(), color='light_yellow'))

[2m[1m[34mLE using Fundamental Approach:
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520[0m


**1.2 LE using 'for loop'**

In [44]:
# Categorical columns to encode, and a fresh copy of the raw frame to hold the result.
columns_to_encode = ['sex', 'smoker', 'region']
df2 = df.copy()

In [56]:
# Encode each categorical column of the working copy in turn.
# The column is read from df2 (the frame being modified) and passed as a 1-D
# Series, which is the input shape LabelEncoder expects; a one-column
# DataFrame would only work through an implicit flatten plus a warning.
for col in columns_to_encode:
    df2[col] = le.fit_transform(df2[col])

print(colored(f"LE using'for loop':\n", color='blue', attrs=['bold', 'dark']))
print(colored(df2.head(), color='light_yellow'))

[2m[1m[34mLE using'for loop':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520[0m


**1.3 LE using lambda Function**

In [59]:
# Categorical columns to encode, and a fresh copy of the raw frame for the lambda variant.
columns_to_encode = ['sex', 'smoker', 'region']
df3 = df.copy()

In [61]:
# Apply the shared encoder to the selected columns via DataFrame.apply;
# the lambda hands each column to le.fit_transform and the result replaces it.
df3[columns_to_encode] = df3[columns_to_encode].apply(
    lambda column: le.fit_transform(column)
)

print(colored('LE using lambda Function:\n', color='blue', attrs=['bold', 'dark']))
print(colored(df3.head(), color='light_yellow'))

[2m[1m[34mLE using lambda Function:
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520[0m


**1.4 Label Encoding by Label Mapping**

In [94]:
# Hand-written integer code for every category of every categorical column.
label_mapping = {
    'sex': {'female': 1, 'male': 2},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4},
}

columns_to_encode = ['sex', 'smoker', 'region']

# One independent copy of the raw frame per technique.
df_mapping = df.copy()
df_replace = df.copy()
df_comprehension = df.copy()

# The three techniques act on separate frames, so a single pass over the
# columns can apply all of them.
for col in columns_to_encode:
    # map function
    df_mapping[col] = df_mapping[col].map(label_mapping[col])
    # replace method
    df_replace[col] = df_replace[col].replace(label_mapping[col])
    # list comprehension
    df_comprehension[col] = [label_mapping[col][value] for value in df_comprehension[col]]


In [102]:
# Show the frame encoded with Series.map.
print(colored("LE using 'Map Function':\n", color='blue', attrs=['bold', 'dark']))
print(colored(df_mapping.head(), color='light_yellow'))

[2m[1m[34mLE using 'Map Function':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    1  27.900         0       0       1  16884.92400
1   18    2  33.770         1       1       2   1725.55230
2   28    2  33.000         3       1       2   4449.46200
3   33    2  22.705         0       1       3  21984.47061
4   32    2  28.880         0       1       3   3866.85520[0m


In [100]:
# Show the frame encoded with Series.replace.
print(colored("LE using 'replace method':\n", color='blue', attrs=['bold', 'dark']))
print(colored(df_replace.head(), color='light_yellow'))

[2m[1m[34mLE using 'replace method':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    1  27.900         0       0       1  16884.92400
1   18    2  33.770         1       1       2   1725.55230
2   28    2  33.000         3       1       2   4449.46200
3   33    2  22.705         0       1       3  21984.47061
4   32    2  28.880         0       1       3   3866.85520[0m


In [98]:
# Show the frame encoded with a list comprehension.
print(colored("LE using 'list comprehension':\n", color='blue', attrs=['bold', 'dark']))
print(colored(df_comprehension.head(), color='light_yellow'))

[2m[1m[34mLE using 'list comprehension':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19    1  27.900         0       0       1  16884.92400
1   18    2  33.770         1       1       2   1725.55230
2   28    2  33.000         3       1       2   4449.46200
3   33    2  22.705         0       1       3  21984.47061
4   32    2  28.880         0       1       3   3866.85520[0m


# 2. Ordinal Encoding

In [112]:
# Shared OrdinalEncoder instance for the ordinal-encoding cell.
ordinal = OrdinalEncoder()

In [118]:
# Encode a copy so the raw frame `df` stays untouched.
df7 = df.copy()
columns_to_encode = ['sex', 'smoker', 'region']

# using for loop
# OrdinalEncoder works on 2-D input, hence the double brackets. The column is
# read from the working copy df7, and the single-column result is flattened
# with ravel() so a plain 1-D array is assigned to the column.
# The encoder is refit on every iteration.
for col in columns_to_encode:
    df7[col] = ordinal.fit_transform(df7[[col]]).ravel()

print(colored(f"Ordinal Encoded using 'for loop':\n", color='blue', attrs=['bold', 'dark']))
print(colored(df7.head(), color='light_yellow'))

[2m[1m[34mOrdinal Encoded using 'for loop':
[0m
[93m   age  sex     bmi  children  smoker  region      charges
0   19  0.0  27.900         0     1.0     3.0  16884.92400
1   18  1.0  33.770         1     0.0     2.0   1725.55230
2   28  1.0  33.000         3     0.0     2.0   4449.46200
3   33  1.0  22.705         0     0.0     1.0  21984.47061
4   32  1.0  28.880         0     0.0     1.0   3866.85520[0m


# Applying three different encoders to three different columns separately

In [132]:
# Encode a copy so the raw frame `df` stays untouched.
df8 = df.copy()

# Label Encoding
# LabelEncoder takes 1-D labels, so pass the Series rather than a one-column
# DataFrame (the 2-D form relies on an implicit flatten that emits a warning).
df8['sex'] = LabelEncoder().fit_transform(df8['sex'])

# Ordinal Encoding
# OrdinalEncoder takes 2-D input; flatten its single-column result before
# assigning it back to the column.
df8['region'] = OrdinalEncoder().fit_transform(df8[['region']]).ravel()

# OneHot Encoding
# get_dummies creates one indicator column per 'smoker' category.
dummy_var = pd.get_dummies(df8[['smoker']], prefix='smoker')

# Swap the original 'smoker' column for its indicator columns.
encoder_df8 = pd.concat([df8.drop('smoker', axis=1), dummy_var], axis=1)

print(colored('Hydrid Encoded DataFrame:\n', color='blue', attrs=['bold', 'dark']))
print(colored(encoder_df8.head(), color='light_yellow'))

[2m[1m[34mHydrid Encoded DataFrame:
[0m
[93m   age  sex     bmi  children  region      charges  smoker_no  smoker_yes
0   19    0  27.900         0     3.0  16884.92400      False        True
1   18    1  33.770         1     2.0   1725.55230       True       False
2   28    1  33.000         3     2.0   4449.46200       True       False
3   33    1  22.705         0     1.0  21984.47061       True       False
4   32    1  28.880         0     1.0   3866.85520       True       False[0m
