In [1]:
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv("../data/insurance.csv")

In [3]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [4]:
X=data.drop(labels=["expenses"],axis=1)

In [5]:
y=data[["expenses"]]

In [6]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest
1334,18,female,31.9,0,no,northeast
1335,18,female,36.9,0,no,southeast
1336,21,female,25.8,0,no,southwest


In [7]:
y

Unnamed: 0,expenses
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86
...,...
1333,10600.55
1334,2205.98
1335,1629.83
1336,2007.95


In [8]:
categorical_cols=X.select_dtypes(include='object').columns

In [9]:
categorical_cols

Index(['sex', 'smoker', 'region'], dtype='object')

In [10]:
numirical_cols=X.select_dtypes(exclude='object').columns

In [11]:
numirical_cols

Index(['age', 'bmi', 'children'], dtype='object')

In [12]:
X.select_dtypes(include='object')

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest
...,...,...,...
1333,male,no,northwest
1334,female,no,northeast
1335,female,no,southeast
1336,female,no,southwest


In [18]:
sex_map={"male":1,"female":0}
smoker_map={"yes":1,"no":0}
region_map={"southeast":1,"southwest":2,"northwest":3,"northeast":4}

In [19]:
data["sex"]=data["sex"].map(sex_map)
data["smoker"]=data["smoker"].map(smoker_map)
data["region"]=data["region"].map(region_map)

In [20]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,2,16884.92
1,18,1,33.8,1,0,1,1725.55
2,28,1,33.0,3,0,1,4449.46
3,33,1,22.7,0,0,3,21984.47
4,32,1,28.9,0,0,3,3866.86


In [21]:
from sklearn.impute import SimpleImputer ## Handling the Missing values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # ordinal Encoding

# Piplines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [22]:
num_pipeline=Pipeline(

    steps=[

        ('imputer',SimpleImputer()),
        ('scaler',StandardScaler())
    ]
)

In [23]:
cat_pipeline=Pipeline(

    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('OrdinalEncoder',OrdinalEncoder(categories=[sex_categories,smoker_categories,region_categories]))
    ]
)

In [24]:
preprocessor=ColumnTransformer(

    [
        ('num_pipeline',num_pipeline,numirical_cols),
        ('cat_pipeline',cat_pipeline,categorical_cols)
    ]
)

In [25]:
## Train Test Split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [26]:
preprocessor.fit_transform(X_train)

array([[-1.26120616, -0.76009275, -0.92504639,  0.        ,  1.        ,
         3.        ],
       [-0.97833575,  0.49564145,  2.43745232,  0.        ,  1.        ,
         0.        ],
       [-0.62474772,  2.20800627,  0.75620297,  0.        ,  1.        ,
         0.        ],
       ...,
       [-0.69546533,  0.60979911, -0.92504639,  0.        ,  0.        ,
         1.        ],
       [ 0.78960436,  2.51786276, -0.08442171,  1.        ,  1.        ,
         0.        ],
       [ 1.56749801,  0.85442265, -0.92504639,  0.        ,  0.        ,
         0.        ]])

In [27]:
preprocessor.transform(X_test)

array([[ 0.78960436,  0.26732614, -0.08442171,  0.        ,  0.        ,
         3.        ],
       [-0.62474772,  0.12055202, -0.08442171,  0.        ,  1.        ,
         1.        ],
       [-0.2711597 , -0.58070215, -0.08442171,  0.        ,  1.        ,
         1.        ],
       ...,
       [-1.40264137, -1.62442927, -0.92504639,  0.        ,  1.        ,
         1.        ],
       [-0.05900689, -1.73858693,  0.75620297,  1.        ,  1.        ,
         3.        ],
       [ 1.21390999,  1.11535443,  0.75620297,  1.        ,  1.        ,
         0.        ]])