In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Read the insurance dataset from the working directory and preview
# the first five rows to confirm the columns parsed as expected.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# 'charges' is the regression target; every other column is a predictor.
y = df['charges']
x = df.drop('charges', axis=1)

# Hold out 20% of the rows for evaluation. The fixed random_state makes
# the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Sanity check: (rows, columns) of the training predictors after the split.
x_train.shape

(1070, 6)

In [13]:
# Keep only the columns that need no encoding by dropping the three
# string-valued ones (sex, smoker, region).
x_train_num = x_train.drop(['sex', 'smoker', 'region'], axis=1)

In [7]:
# Encode 'region' as integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# OrdinalEncoder / OneHotEncoder are the intended tools for feature columns.
# LabelEncoder is kept here so `lb` stays the same kind of object.
lb = LabelEncoder()

# Learn the region -> code mapping from the training split only.
# LabelEncoder returns a 1-D array, so reshape to a column for concatenation.
x_train_region = lb.fit_transform(x_train['region']).reshape(-1, 1)

# Reuse the mapping learned on the training data. Calling fit_transform here
# would refit the encoder on the test split, which can yield different codes
# if the set of regions differs and overwrites the fitted state of `lb`.
# Reshaped to a column so it has the same layout as x_train_region.
x_test_region = lb.transform(x_test['region']).reshape(-1, 1)

x_train_region.shape

(1070, 1)

In [9]:
# Encode 'sex' with an explicit category order: female -> 0, male -> 1.
ode_sex = OrdinalEncoder(categories=[['female', 'male']])
x_train_sex = ode_sex.fit_transform(x_train[['sex']])
# Apply the already-fitted encoder to the *test* rows. The previous code
# called fit_transform on x_train again, so x_test_sex silently contained
# the training rows instead of the test rows.
x_test_sex = ode_sex.transform(x_test[['sex']])

# Encode 'smoker' with an explicit category order: yes -> 0, no -> 1.
ode_smoker = OrdinalEncoder(categories=[['yes', 'no']])
# OrdinalEncoder already returns a 2-D (n_samples, 1) array, so no reshape
# is needed.
x_train_smoker = ode_smoker.fit_transform(x_train[['smoker']])
# Same fix as above: transform the test split with the fitted encoder.
x_test_smoker = ode_smoker.transform(x_test[['smoker']])

In [10]:
# Both encoded arrays should be single-column 2-D arrays with one row per
# training sample, ready to be concatenated column-wise.
x_train_sex.shape, x_train_smoker.shape

((1070, 1), (1070, 1))

In [14]:
# Assemble the final training matrix column-wise: the three encoded
# categorical columns first, followed by the untouched numeric columns.
x_train_transformed = np.concatenate(
    [
        x_train_region,
        x_train_sex,
        x_train_smoker,
        x_train_num,
    ],
    axis=1,
)
x_train_transformed.shape

(1070, 6)

## Encoding the same columns with `ColumnTransformer`

In [19]:
# Show the raw (un-encoded) training predictors that the transformer below
# will be fitted on.
x_train

Unnamed: 0,age,sex,bmi,children,smoker,region
560,46,female,19.950,2,no,northwest
1285,47,female,24.320,0,no,northeast
1142,52,female,24.860,0,no,southeast
969,39,female,34.320,5,no,southeast
486,54,female,21.470,3,no,northwest
...,...,...,...,...,...,...
1095,18,female,31.350,4,no,northeast
1130,39,female,23.870,5,no,southeast
1294,58,male,25.175,0,no,northeast
860,37,female,47.600,2,yes,southwest


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Encode the three categorical columns in a single step. Each entry is
# (name, encoder, columns). The output places the encoded columns first, in
# the order listed here, followed by the remaining columns unchanged
# (remainder='passthrough').
transformer = ColumnTransformer(transformers=[
    # Explicit region order: southwest -> 0 ... northeast -> 3.
    ('tnf1', OrdinalEncoder(categories=[['southwest', 'southeast', 'northwest', 'northeast']]), ['region']),
    # Same category order as the standalone `ode_sex` encoder defined earlier
    # in this notebook (female -> 0, male -> 1), so both approaches assign
    # identical codes to the 'sex' column.
    ('tnf2', OrdinalEncoder(categories=[['female', 'male']]), ['sex']),
    # yes -> 0, no -> 1, matching the standalone `ode_smoker` encoder.
    ('tnf3', OrdinalEncoder(categories=[['yes', 'no']]), ['smoker'])
], remainder='passthrough')

# Fit on the training split only ...
encoded_data = transformer.fit_transform(x_train)
# ... and reuse the fitted transformer for the held-out split, so both splits
# share exactly the same encoding.
encoded_test = transformer.transform(x_test)

In [22]:
# Ordinal encoding replaces each categorical column with a single numeric
# column, so the column count is unchanged from x_train.
encoded_data.shape


(1070, 6)