In [37]:
# importing the dependencies
import numpy as np
import pandas as pd
from seaborn import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression


In [38]:
# Load the seaborn "tips" dataset, discard the 'tip' and 'sex' columns, and
# keep a reproducible 5-row sample so the example stays small.
df = (
    load_dataset('tips')
    .drop(columns=['tip', 'sex'])
    .sample(n=5, random_state=42)
)

In [39]:
# Inject missing values by position: rows 1, 2 and 4 in columns 2 and 4
# (shown as 'day' and 'size' in the preview below).
rows_to_blank = [1, 2, 4]
cols_to_blank = [2, 4]
df.iloc[rows_to_blank, cols_to_blank] = np.nan

In [40]:
# Preview the sampled frame to confirm where the injected NaNs landed.
df.head()

Unnamed: 0,total_bill,smoker,day,time,size
24,19.82,No,Sat,Dinner,2.0
6,8.77,No,,Dinner,
153,24.55,No,,Dinner,
211,25.89,Yes,Sat,Dinner,4.0
198,13.0,Yes,,Lunch,


In [41]:
# Partition the data: the features are every column except 'total_bill' and
# 'size'; the target is 'total_bill'. 20% of the rows are held out for testing.
features = df.drop(columns=['total_bill', 'size'])
target = df['total_bill']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

In [42]:
# Fit the preprocessing pipeline on the training data only, so that the
# categories learned by the encoder come exclusively from X_train.
pipe = Pipeline([
    # Replace every missing entry with the literal string 'missing', which the
    # encoder then treats as an ordinary category.
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # Dense one-hot output; categories unseen during fit encode as all zeros.
    # `sparse_output` supersedes the `sparse` keyword, which was deprecated in
    # scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipe.fit(X_train)

In [43]:
# Inspect the training features before the pipeline transform (raw values
# here; the transformed version is displayed in the following cell).
X_train

Unnamed: 0,smoker,day,time
198,Yes,,Lunch
153,No,,Dinner
24,No,Sat,Dinner
211,Yes,Sat,Dinner


In [44]:
# Training features after imputation and one-hot encoding, with columns
# labelled by the encoder's generated feature names.
pd.DataFrame(
    data=pipe.transform(X_train),
    columns=pipe.named_steps['encoder'].get_feature_names_out(X_train.columns),
)

Unnamed: 0,smoker_No,smoker_Yes,day_Sat,day_missing,time_Dinner,time_Lunch
0,0.0,1.0,0.0,1.0,0.0,1.0
1,1.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0


In [45]:
# Inspect the held-out test features before the pipeline transform (raw values
# here; the transformed version is displayed in the following cell).
X_test

Unnamed: 0,smoker,day,time
6,No,,Dinner


In [46]:
# Test features pushed through the already-fitted pipeline. The column labels
# are derived from X_train's column names, matching what the encoder was fit on.
pd.DataFrame(
    data=pipe.transform(X_test),
    columns=pipe.named_steps['encoder'].get_feature_names_out(X_train.columns),
)

Unnamed: 0,smoker_No,smoker_Yes,day_Sat,day_missing,time_Dinner,time_Lunch
0,1.0,0.0,0.0,1.0,1.0,0.0
