In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder

import category_encoders as ce

# __Applying Several Preprocessing Methods at Once for Modeling, Part 1a: Ridge__

__Table of Contents__
1. data
1. preprocessing
1. data splitting
1. data transform
1. model fitting and evaluation
1. model properties

In [2]:
#Import library

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

> __1. DATA__

In [3]:
# Load seaborn's "tips" example dataset and preview its first rows.

tips = sns.load_dataset(name='tips')
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


> __2. PREPROCESSING__

Preprocessing scheme:
1. one hot encoding : sex, smoker, time
1. binary encoding : day
1. robust scaler : total_bill
1. no treatment : size

In [4]:
# Build all encoding/scaling steps at once in a single ColumnTransformer.

transformer = ColumnTransformer(
    transformers=[
        # drop='first' omits the first level of each one-hot encoded feature
        ('onehot', OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
        ('binary', ce.BinaryEncoder(), ['day']),
        ('robust', RobustScaler(), ['total_bill']),
    ],
    # columns not listed above (here: size) are passed through untouched
    remainder='passthrough',
)

> __3. DATA SPLITTING__

__Train : Test = 75 : 25__

In [5]:
# Separate the dependent variable (target) from the independent variables (features).

y = tips['tip']
X = tips.drop(columns='tip')

In [6]:
# Split the data into training and test sets.
# train_test_split is already imported in the imports cell near the top of the
# notebook, so it is not imported again here.
# test_size is scikit-learn's default (0.25), written out so the ratio is visible;
# random_state pins the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=10
)

> __4. DATA TRANSFORM__ 

Here we carry out the preprocessing step. As discussed previously, we call `.fit` only on the training set and `.transform` on both the training set and the test set.

In [7]:
# Preview the contents of X_train before any transformation.

X_train.head(n=5)

Unnamed: 0,total_bill,sex,smoker,day,time,size
58,11.24,Male,Yes,Sat,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
68,20.23,Male,No,Sat,Dinner,2
184,40.55,Male,Yes,Sun,Dinner,2


In [8]:
# 1. Apply the preprocessing to X_train and X_test.
# The transformer is fitted on the training split only; the test split is then
# transformed with that already-fitted transformer.

X_train_preprocessed = transformer.fit_transform(X_train)
X_test_preprocessed = transformer.transform(X_test)

  elif pd.api.types.is_categorical(cols):


In [9]:
# 2. Wrap the transformed train/test outputs in DataFrames for easier inspection.

X_trp_df = pd.DataFrame(data=X_train_preprocessed)
X_tsp_df = pd.DataFrame(data=X_test_preprocessed)
X_trp_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,1.0,0.0,0.0,0.0,1.0,-0.566396,2.0
1,1.0,0.0,0.0,0.0,1.0,0.0,-0.647696,3.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.31617,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.245709,2.0
4,1.0,1.0,0.0,0.0,1.0,0.0,2.081301,2.0


In [10]:
# Show the fitted transformers and the columns each one was applied to.

transformer.transformers_

[('onehot', OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
 ('binary', BinaryEncoder(), ['day']),
 ('robust', RobustScaler(), ['total_bill']),
 ('remainder', 'passthrough', [5])]

In [11]:
# Inspect the column names produced by the one-hot encoder.
# get_feature_names() was removed in scikit-learn 1.2; use it when it exists so
# older environments behave as before, otherwise fall back to its replacement,
# get_feature_names_out().

onehot_encoder = transformer.named_transformers_['onehot']
onehot_names = (
    onehot_encoder.get_feature_names()
    if hasattr(onehot_encoder, 'get_feature_names')
    else onehot_encoder.get_feature_names_out()
)
onehot_names

array(['x0_Male', 'x1_Yes', 'x2_Lunch'], dtype=object)

In [12]:
# Inspect the column names produced by the binary encoder.
# Use get_feature_names() when the installed encoder provides it, otherwise fall
# back to get_feature_names_out(), so the cell does not depend on one specific
# category_encoders release.

binary_encoder = transformer.named_transformers_['binary']
binary_names = (
    binary_encoder.get_feature_names()
    if hasattr(binary_encoder, 'get_feature_names')
    else binary_encoder.get_feature_names_out()
)
binary_names

['day_0', 'day_1', 'day_2']

In [13]:
# 3. Give the transformed columns readable names.

def _encoded_feature_names(encoder):
    """Return the output column names of a fitted encoder as a plain list.

    ``get_feature_names()`` is used when the encoder still has it, so column
    labels stay the same in older environments; scikit-learn removed that
    method in 1.2, in which case ``get_feature_names_out()`` is used instead.
    The result is always converted to ``list`` so it can be concatenated
    regardless of whether the encoder returns a list or an array.
    """
    name_getter = getattr(encoder, 'get_feature_names', None)
    if name_getter is None:
        name_getter = encoder.get_feature_names_out
    return list(name_getter())


# The order must follow the ColumnTransformer's steps:
# onehot, binary, robust (total_bill), then the passthrough remainder (size).
features = (
    _encoded_feature_names(transformer.named_transformers_['onehot'])
    + _encoded_feature_names(transformer.named_transformers_['binary'])
    + ['total_bill scaled', 'size']
)

X_trp_df.columns = features
X_tsp_df.columns = features
X_trp_df.head()

Unnamed: 0,x0_Male,x1_Yes,x2_Lunch,day_0,day_1,day_2,total_bill scaled,size
0,1.0,1.0,0.0,0.0,0.0,1.0,-0.566396,2.0
1,1.0,0.0,0.0,0.0,1.0,0.0,-0.647696,3.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.31617,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.245709,2.0
4,1.0,1.0,0.0,0.0,1.0,0.0,2.081301,2.0


In [14]:
# Preview the transformed test features.
X_tsp_df.head(n=5)

Unnamed: 0,x0_Male,x1_Yes,x2_Lunch,day_0,day_1,day_2,total_bill scaled,size
0,0.0,0.0,0.0,0.0,1.0,0.0,-0.117435,3.0
1,1.0,1.0,0.0,0.0,0.0,1.0,0.251129,2.0
2,1.0,1.0,0.0,0.0,0.0,1.0,-0.334237,2.0
3,1.0,1.0,0.0,0.0,0.0,1.0,0.070461,4.0
4,1.0,1.0,0.0,0.0,0.0,1.0,-0.225836,2.0


> __5. MODEL FITTING & EVALUATION__

In [15]:
# 1. Fit a ridge regression (default settings) on the transformed training features.

ridge = Ridge()
ridge.fit(X=X_trp_df, y=y_train)

Ridge()

In [16]:
# 2. Predict on the transformed test features.
# Any input given to predict must have the same format as X_tsp_df.

y_pred = ridge.predict(X=X_tsp_df)

In [17]:
# 3. Model performance on the test set (MSE and RMSE).
# Over-/under-fitting is not examined here.

mse = mean_squared_error(y_test, y_pred)
print('mse', mse)
print('rmse', np.sqrt(mse))

mse 1.0577456219830774
rmse 1.0284676086212328


> __6. MODEL PROPERTIES*__

In [18]:
# Optional: inspect the fitted model's coefficients, one row per feature.

coef_table = pd.DataFrame(
    data=dict(feature=X_trp_df.columns, coef=ridge.coef_)
)
coef_table

Unnamed: 0,feature,coef
0,x0_Male,-0.219943
1,x1_Yes,-0.071079
2,x2_Lunch,0.116938
3,day_0,0.031693
4,day_1,-0.00246
5,day_2,-0.061386
6,total_bill scaled,1.086144
7,size,0.17289
