## Preprocessing dan Feature Engineering

## Import Package

In [2]:
#Import library untuk data preparation dan visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import warnings for ignore the warnings
import warnings 
warnings.filterwarnings("ignore")

# import pickle and json file for columns and model file
import pickle
import json
import joblib

#Import package untuk balancing dataset
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

## Read Data

In [3]:
#Read data X_train dan y_train hasil dari data preparation
X_train = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\X_train.pkl")
y_train = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\y_train.pkl")

#Read data X_valid dan y_valid hasil dari data preparation
X_valid = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\X_valid.pkl")
y_valid = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\y_valid.pkl")

#Read data X_test dan y_test hasil dari data preparation
X_test = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\X_test.pkl")
y_test = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\y_test.pkl")

In [4]:
#Menyatukan data X_train dan y_train untuk dilakukan preprocessing dan Feature Engineering menjadi satu set
dataset = pd.concat([X_train, y_train], axis = 1)

#Menyatukan data X_valid dan y_valid untuk dilakukan preprocessing dan Feature Engineering menjadi satu set
valid_set = pd.concat([X_valid, y_valid], axis = 1)

#Menyatukan data X_test dan y_test untuk dilakukan preprocessing dan Feature Engineering menjadi satu set
test_set = pd.concat([X_test, y_test], axis = 1)

In [5]:
dataset

Unnamed: 0,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Selling_Price
5,2018,9.83,2071,Diesel,Dealer,Manual,0,9.25
258,2015,13.60,25000,Petrol,Dealer,Manual,0,8.40
278,2016,8.40,4000,Petrol,Dealer,Manual,0,6.00
260,2016,13.60,29223,Petrol,Dealer,Manual,0,9.15
7,2015,8.61,33429,Diesel,Dealer,Manual,0,6.50
...,...,...,...,...,...,...,...,...
106,2014,3.45,16500,Petrol,Individual,Manual,1,1.35
83,2015,13.46,38000,Diesel,Dealer,Manual,0,12.50
17,2016,10.79,43000,Diesel,Dealer,Manual,0,7.75
230,2013,9.40,45000,Diesel,Dealer,Manual,0,6.15


## Handling Missing Value

In [6]:
from sklearn.impute import SimpleImputer

def imputeData(data, numerical_columns_mean, numerical_columns_median, categorical_columns):
    """
    Fungsi untuk melakukan imputasi data numerik dan kategorikal
    :param data: <pandas dataframe> sample data input
    :param numerical_columns_mean: <list> list kolom numerik data yang akan diimputasi dengan mean
    :param numerical_columns_median: <list> list kolom numerik data yang akan diimputasi dengan median
    :param categorical_columns: <list> list kolom kategorikal data
    :return numerical_data_imputed: <pandas dataframe> data numerik imputed
    :return categorical_data_imputed: <pandas dataframe> data kategorikal imputed
    """
    # Imputasi kolom numerik dengan mean
    numerical_data_mean = data[numerical_columns_mean]
    imputer_numerical_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer_numerical_mean.fit(numerical_data_mean)
    imputed_data_mean = imputer_numerical_mean.transform(numerical_data_mean)
    numerical_data_imputed_mean = pd.DataFrame(imputed_data_mean, columns=numerical_columns_mean, index=numerical_data_mean.index)

    # Imputasi kolom numerik dengan median
    numerical_data_median = data[numerical_columns_median]
    imputer_numerical_median = SimpleImputer(missing_values=np.nan, strategy='median')
    imputer_numerical_median.fit(numerical_data_median)
    imputed_data_median = imputer_numerical_median.transform(numerical_data_median)
    numerical_data_imputed_median = pd.DataFrame(imputed_data_median, columns=numerical_columns_median, index=numerical_data_median.index)

    # Gabungkan kedua data numerik yang telah diimputasi
    numerical_data_imputed = pd.concat([numerical_data_imputed_mean, numerical_data_imputed_median], axis=1)

    # Seleksi data kategorikal
    categorical_data = data[categorical_columns]

    # Imputasi dengan menggunakan modus
    mode = categorical_data.mode().iloc[0]

    # Lakukan imputasi untuk data kategorikal
    categorical_data_imputed = categorical_data.fillna(mode)

    # Gabungkan data numerik dan kategorikal yang telah diimputasi
    data_imputed = pd.concat([numerical_data_imputed, categorical_data_imputed], axis=1)

    return data_imputed

## Parameter Imputasi

In [7]:
# Buat kolom numerik
numerical_column = ["Kms_Driven", "Present_Price", "Owner"]
numerical_column_mean = ["Year"]
set_numerik = numerical_column + numerical_column_mean
dataset_column = list(X_train.columns)
categorical_column = list(set(dataset_column).difference(set(set_numerik)))

# Imputasi tabel dataset (train)

In [8]:
X_train_impute = imputeData(data = X_train, 
                            numerical_columns_mean = numerical_column_mean, 
                            numerical_columns_median = numerical_column, 
                            categorical_columns = categorical_column)

In [9]:
X_train_impute

Unnamed: 0,Year,Kms_Driven,Present_Price,Owner,Seller_Type,Transmission,Fuel_Type
5,2018.0,2071.0,9.83,0.0,Dealer,Manual,Diesel
258,2015.0,25000.0,13.60,0.0,Dealer,Manual,Petrol
278,2016.0,4000.0,8.40,0.0,Dealer,Manual,Petrol
260,2016.0,29223.0,13.60,0.0,Dealer,Manual,Petrol
7,2015.0,33429.0,8.61,0.0,Dealer,Manual,Diesel
...,...,...,...,...,...,...,...
106,2014.0,16500.0,3.45,1.0,Individual,Manual,Petrol
83,2015.0,38000.0,13.46,0.0,Dealer,Manual,Diesel
17,2016.0,43000.0,10.79,0.0,Dealer,Manual,Diesel
230,2013.0,45000.0,9.40,0.0,Dealer,Manual,Diesel


## Imputasi tabel valid_set

In [10]:
X_valid_impute = imputeData(data = X_valid, 
                            numerical_columns_mean = numerical_column_mean, 
                            numerical_columns_median = numerical_column, 
                            categorical_columns = categorical_column)

In [11]:
X_valid_impute

Unnamed: 0,Year,Kms_Driven,Present_Price,Owner,Seller_Type,Transmission,Fuel_Type
20,2016.0,25000.0,3.95,0.0,Dealer,Manual,Petrol
105,2015.0,21700.0,2.37,0.0,Individual,Manual,Petrol
174,2015.0,38600.0,0.72,0.0,Individual,Manual,Petrol
294,2014.0,33019.0,6.8,0.0,Dealer,Manual,Petrol
293,2010.0,38000.0,9.9,0.0,Dealer,Manual,Petrol
160,2011.0,24000.0,0.95,0.0,Individual,Manual,Petrol
19,2010.0,41442.0,7.98,0.0,Dealer,Manual,Petrol
102,2017.0,4000.0,1.78,0.0,Individual,Manual,Petrol
241,2015.0,35866.0,7.13,1.0,Dealer,Manual,Petrol
210,2012.0,35775.0,4.6,0.0,Dealer,Manual,Petrol


## Imputasi tabel test_set

In [12]:
X_test_impute = imputeData(data = X_test, 
                           numerical_columns_mean = numerical_column_mean, 
                           numerical_columns_median = numerical_column, 
                           categorical_columns = categorical_column)

In [13]:
X_test_impute

Unnamed: 0,Year,Kms_Driven,Present_Price,Owner,Seller_Type,Transmission,Fuel_Type
256,2016.0,49562.0,13.6,0.0,Dealer,Manual,Petrol
26,2013.0,55138.0,5.87,0.0,Dealer,Manual,Petrol
215,2012.0,36100.0,9.4,0.0,Dealer,Manual,Petrol
150,2011.0,6000.0,0.826,0.0,Individual,Manual,Petrol
148,2010.0,45000.0,0.94,0.0,Individual,Manual,Petrol
268,2017.0,19000.0,5.8,0.0,Dealer,Manual,Petrol
82,2017.0,15000.0,25.39,0.0,Dealer,Automatic,Diesel
195,2015.0,35000.0,0.32,0.0,Individual,Manual,Petrol
24,2013.0,56879.0,4.41,0.0,Dealer,Manual,Petrol
218,2014.0,45078.0,9.4,0.0,Dealer,Manual,Petrol


## Get Dummies

In [14]:
def get_dummies(train_df, input_df):
    # Menggabungkan data train dan input menjadi satu DataFrame
    combined_df = pd.concat([train_df, input_df])
    
    # Mengubah variabel kategorikal menjadi variabel dummy
    dummies_df = pd.get_dummies(combined_df, columns=train_df.select_dtypes(include='object').columns)
    
    # Memisahkan kembali data train dan input
    train_dummies = dummies_df[:train_df.shape[0]]
    input_dummies = dummies_df[train_df.shape[0]:]
    
    return train_dummies, input_dummies

In [15]:
dataset_ohe, valid_set = get_dummies(X_train_impute, X_valid_impute)

In [16]:
dataset, test_set = get_dummies(X_train_impute, X_test_impute)

In [17]:
train_set_ori = dataset[sorted(dataset.columns)]
test_set_ori = test_set[sorted(test_set.columns)]
valid_set_ori = valid_set[sorted(valid_set.columns)]

In [18]:
dataset

Unnamed: 0,Year,Kms_Driven,Present_Price,Owner,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol
5,2018.0,2071.0,9.83,0.0,1,0,0,1,0,1,0
258,2015.0,25000.0,13.60,0.0,1,0,0,1,0,0,1
278,2016.0,4000.0,8.40,0.0,1,0,0,1,0,0,1
260,2016.0,29223.0,13.60,0.0,1,0,0,1,0,0,1
7,2015.0,33429.0,8.61,0.0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
106,2014.0,16500.0,3.45,1.0,0,1,0,1,0,0,1
83,2015.0,38000.0,13.46,0.0,1,0,0,1,0,1,0
17,2016.0,43000.0,10.79,0.0,1,0,0,1,0,1,0
230,2013.0,45000.0,9.40,0.0,1,0,0,1,0,1,0


In [19]:
test_set

Unnamed: 0,Year,Kms_Driven,Present_Price,Owner,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol
256,2016.0,49562.0,13.6,0.0,1,0,0,1,0,0,1
26,2013.0,55138.0,5.87,0.0,1,0,0,1,0,0,1
215,2012.0,36100.0,9.4,0.0,1,0,0,1,0,0,1
150,2011.0,6000.0,0.826,0.0,0,1,0,1,0,0,1
148,2010.0,45000.0,0.94,0.0,0,1,0,1,0,0,1
268,2017.0,19000.0,5.8,0.0,1,0,0,1,0,0,1
82,2017.0,15000.0,25.39,0.0,1,0,1,0,0,1,0
195,2015.0,35000.0,0.32,0.0,0,1,0,1,0,0,1
24,2013.0,56879.0,4.41,0.0,1,0,0,1,0,0,1
218,2014.0,45078.0,9.4,0.0,1,0,0,1,0,0,1


In [20]:
valid_set

Unnamed: 0,Year,Kms_Driven,Present_Price,Owner,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol
20,2016.0,25000.0,3.95,0.0,1,0,0,1,0,0,1
105,2015.0,21700.0,2.37,0.0,0,1,0,1,0,0,1
174,2015.0,38600.0,0.72,0.0,0,1,0,1,0,0,1
294,2014.0,33019.0,6.8,0.0,1,0,0,1,0,0,1
293,2010.0,38000.0,9.9,0.0,1,0,0,1,0,0,1
160,2011.0,24000.0,0.95,0.0,0,1,0,1,0,0,1
19,2010.0,41442.0,7.98,0.0,1,0,0,1,0,0,1
102,2017.0,4000.0,1.78,0.0,0,1,0,1,0,0,1
241,2015.0,35866.0,7.13,1.0,1,0,0,1,0,0,1
210,2012.0,35775.0,4.6,0.0,1,0,0,1,0,0,1


## Standar Scaler

In [21]:
from sklearn.preprocessing import StandardScaler
import pickle
import os

In [22]:
columns_to_scale = ["Year","Kms_Driven","Present_Price"]

def fit_scaler(train_data):
    scaler = StandardScaler()
    scaler.fit(train_data.loc[:, columns_to_scale])
    # save scaler
    with open('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\scaler_1.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    return scaler

def load_scaler(folder_path):
    # load scaler
    file_path = os.path.join(folder_path, 'scaler_1.pkl')
    with open(file_path, 'rb') as f:
        scaler = pickle.load(f)
    return scaler

def transform_data(data, scaler):
    scaled_data = scaler.transform(data.loc[:, columns_to_scale])
    data.loc[:, columns_to_scale] = scaled_data
    return data

In [23]:
scaler = fit_scaler(dataset)

In [24]:
scaling = load_scaler('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\')

In [25]:
X_train_scaler = transform_data(dataset, scaling)
X_valid_scaler = transform_data(valid_set, scaling)
X_test_scaler= transform_data(test_set, scaling)

In [26]:
X_train_scaler = X_train_scaler[sorted(X_train_scaler.columns)]
X_valid_scaler = X_valid_scaler[sorted(X_valid_scaler.columns)]
X_test_scaler = X_test_scaler[sorted(X_test_scaler.columns)]

In [27]:
X_train_scaler

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Kms_Driven,Owner,Present_Price,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Year
5,0,1,0,-0.836543,0.0,0.215036,1,0,0,1,1.480186
258,0,0,1,-0.281227,0.0,0.620082,1,0,0,1,0.471987
278,0,0,1,-0.789824,0.0,0.061398,1,0,0,1,0.808053
260,0,0,1,-0.178951,0.0,0.620082,1,0,0,1,0.808053
7,0,1,0,-0.077086,0.0,0.083960,1,0,0,1,0.471987
...,...,...,...,...,...,...,...,...,...,...,...
106,0,0,1,-0.487088,1.0,-0.470426,0,1,0,1,0.135920
83,0,1,0,0.033619,0.0,0.605040,1,0,0,1,0.471987
17,0,1,0,0.154713,0.0,0.318178,1,0,0,1,0.808053
230,0,1,0,0.203151,0.0,0.168837,1,0,0,1,-0.200146


In [28]:
X_valid_scaler

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Kms_Driven,Owner,Present_Price,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Year
20,0,0,1,-0.281227,0.0,-0.416707,1,0,0,1,0.808053
105,0,0,1,-0.36115,0.0,-0.58646,0,1,0,1,0.471987
174,0,0,1,0.04815,0.0,-0.763735,0,1,0,1,0.471987
294,0,0,1,-0.087016,0.0,-0.110505,1,0,0,1,0.13592
293,0,0,1,0.033619,0.0,0.222557,1,0,0,1,-1.208346
160,0,0,1,-0.305446,0.0,-0.739024,0,1,0,1,-0.872279
19,0,0,1,0.11698,0.0,0.016273,1,0,0,1,-1.208346
102,0,0,1,-0.789824,0.0,-0.64985,0,1,0,1,1.14412
241,0,0,1,-0.018065,1.0,-0.07505,1,0,0,1,0.471987
210,0,0,1,-0.020269,0.0,-0.346871,1,0,0,1,-0.536213


In [29]:
X_test_scaler

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Kms_Driven,Owner,Present_Price,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Year
256,0,0,1,0.313638,0.0,0.620082,1,0,0,1,0.808053
26,0,0,1,0.448682,0.0,-0.210423,1,0,0,1,-0.200146
215,0,0,1,-0.012397,0.0,0.168837,1,0,0,1,-0.536213
150,0,0,1,-0.741387,0.0,-0.752347,0,1,0,1,-0.872279
148,0,0,1,0.203151,0.0,-0.740098,0,1,0,1,-1.208346
268,0,0,1,-0.426541,0.0,-0.217944,1,0,0,1,1.14412
82,0,1,0,-0.523416,0.0,1.88679,1,0,1,0,1.14412
195,0,0,1,-0.039038,0.0,-0.806711,0,1,0,1,0.471987
24,0,0,1,0.490847,0.0,-0.367284,1,0,0,1,-0.200146
218,0,0,1,0.20504,0.0,0.168837,1,0,0,1,0.13592


## MinMaxScaler

In [30]:
from sklearn.preprocessing import MinMaxScaler
import pickle
import os

columns_to_scale =  ["Year","Kms_Driven","Present_Price"]

def fit_scaler(train_data):
    scaler = MinMaxScaler()
    scaler.fit(train_data.loc[:, columns_to_scale])
    # save scaler
    with open('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\minmax_scaler_1.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    return scaler

def load_scaler(folder_path):
    # load scaler
    file_path = os.path.join(folder_path, 'minmax_scaler_1.pkl')
    with open(file_path, 'rb') as f:
        scaler = pickle.load(f)
    return scaler

def transform_data(data, scaler):
    scaled_data = scaler.transform(data.loc[:, columns_to_scale])
    data.loc[:, columns_to_scale] = scaled_data
    return data

In [31]:
minmax_scaler = fit_scaler(dataset)
minmax_load = load_scaler('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\')

In [32]:
X_train_minmax = transform_data(dataset, minmax_load)
X_valid_minmax = transform_data(valid_set, minmax_load)
X_test_minmax = transform_data(test_set, minmax_load)

In [33]:
X_train_minmax = X_train_minmax[sorted(X_train_minmax.columns)]
X_valid_minmax = X_valid_minmax[sorted(X_valid_minmax.columns)]
X_test_minmax = X_test_minmax[sorted(X_test_minmax.columns)]

In [34]:
X_train_minmax

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Kms_Driven,Owner,Present_Price,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Year
5,0,1,0,0.003145,0.0,0.101596,1,0,0,1,1.000000
258,0,0,1,0.049049,0.0,0.142516,1,0,0,1,0.800000
278,0,0,1,0.007007,0.0,0.086074,1,0,0,1,0.866667
260,0,0,1,0.057504,0.0,0.142516,1,0,0,1,0.866667
7,0,1,0,0.065924,0.0,0.088353,1,0,0,1,0.800000
...,...,...,...,...,...,...,...,...,...,...,...
106,0,0,1,0.032032,1.0,0.032346,0,1,0,1,0.733333
83,0,1,0,0.075075,0.0,0.140996,1,0,0,1,0.800000
17,0,1,0,0.085085,0.0,0.112016,1,0,0,1,0.866667
230,0,1,0,0.089089,0.0,0.096928,1,0,0,1,0.666667


In [35]:
X_valid_minmax

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Kms_Driven,Owner,Present_Price,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Year
20,0,0,1,0.049049,0.0,0.037773,1,0,0,1,0.866667
105,0,0,1,0.042442,0.0,0.020623,0,1,0,1,0.8
174,0,0,1,0.076276,0.0,0.002714,0,1,0,1,0.8
294,0,0,1,0.065103,0.0,0.068707,1,0,0,1,0.733333
293,0,0,1,0.075075,0.0,0.102355,1,0,0,1,0.466667
160,0,0,1,0.047047,0.0,0.00521,0,1,0,1,0.533333
19,0,0,1,0.081966,0.0,0.081515,1,0,0,1,0.466667
102,0,0,1,0.007007,0.0,0.014219,0,1,0,1,0.933333
241,0,0,1,0.070803,1.0,0.072289,1,0,0,1,0.8
210,0,0,1,0.070621,0.0,0.044828,1,0,0,1,0.6


In [36]:
X_test_minmax

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Kms_Driven,Owner,Present_Price,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Year
256,0,0,1,0.098222,0.0,0.142516,1,0,0,1,0.866667
26,0,0,1,0.109385,0.0,0.058613,1,0,0,1,0.666667
215,0,0,1,0.071271,0.0,0.096928,1,0,0,1,0.6
150,0,0,1,0.011011,0.0,0.003864,0,1,0,1,0.533333
148,0,0,1,0.089089,0.0,0.005101,0,1,0,1,0.466667
268,0,0,1,0.037037,0.0,0.057853,1,0,0,1,0.933333
82,0,1,0,0.029029,0.0,0.270487,1,0,1,0,0.933333
195,0,0,1,0.069069,0.0,-0.001628,0,1,0,1,0.8
24,0,0,1,0.112871,0.0,0.042766,1,0,0,1,0.666667
218,0,0,1,0.089245,0.0,0.096928,1,0,0,1,0.733333


## RobustScaler

In [37]:
from sklearn.preprocessing import RobustScaler
import pickle
import os

columns_to_scale =  ["Year","Kms_Driven","Present_Price"]

def fit_scaler(train_data):
    scaler = RobustScaler()
    scaler.fit(train_data.loc[:, columns_to_scale])
    # save scaler
    with open('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\robust_scaler_1.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    return scaler

def load_scaler(folder_path):
    # load scaler
    file_path = os.path.join(folder_path, 'robust_scaler_1.pkl')
    with open(file_path, 'rb') as f:
        scaler = pickle.load(f)
    return scaler

def transform_data(data, scaler):
    scaled_data = scaler.transform(data.loc[:, columns_to_scale])
    data.loc[:, columns_to_scale] = scaled_data
    return data

In [38]:
robust_scaler = fit_scaler(dataset)
robust_load = load_scaler('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\')

In [39]:
X_train_robust = transform_data(dataset, robust_load)
X_valid_robust = transform_data(valid_set, robust_load)
X_test_robust = transform_data(test_set, robust_load)

In [40]:
X_train_robust = X_train_robust[sorted(X_train_robust.columns)]
X_valid_robust = X_valid_robust[sorted(X_valid_robust.columns)]
X_test_robust = X_test_robust[sorted(X_test_robust.columns)]

In [41]:
X_train_robust

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Kms_Driven,Owner,Present_Price,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Year
5,0,1,0,-0.859354,0.0,0.434483,1,0,0,1,1.00
258,0,0,1,-0.153846,0.0,0.867816,1,0,0,1,0.25
278,0,0,1,-0.800000,0.0,0.270115,1,0,0,1,0.50
260,0,0,1,-0.023908,0.0,0.867816,1,0,0,1,0.50
7,0,1,0,0.105508,0.0,0.294253,1,0,0,1,0.25
...,...,...,...,...,...,...,...,...,...,...,...
106,0,0,1,-0.415385,1.0,-0.298851,0,1,0,1,0.00
83,0,1,0,0.246154,0.0,0.851724,1,0,0,1,0.25
17,0,1,0,0.400000,0.0,0.544828,1,0,0,1,0.50
230,0,1,0,0.461538,0.0,0.385057,1,0,0,1,-0.25


In [42]:
X_valid_robust

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Kms_Driven,Owner,Present_Price,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Year
20,0,0,1,-0.153846,0.0,-0.241379,1,0,0,1,0.5
105,0,0,1,-0.255385,0.0,-0.422989,0,1,0,1,0.25
174,0,0,1,0.264615,0.0,-0.612644,0,1,0,1,0.25
294,0,0,1,0.092892,0.0,0.086207,1,0,0,1,0.0
293,0,0,1,0.246154,0.0,0.442529,1,0,0,1,-1.0
160,0,0,1,-0.184615,0.0,-0.586207,0,1,0,1,-0.75
19,0,0,1,0.352062,0.0,0.221839,1,0,0,1,-1.0
102,0,0,1,-0.8,0.0,-0.490805,0,1,0,1,0.75
241,0,0,1,0.180492,1.0,0.124138,1,0,0,1,0.25
210,0,0,1,0.177692,0.0,-0.166667,1,0,0,1,-0.5


In [43]:
X_test_robust

Unnamed: 0,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Kms_Driven,Owner,Present_Price,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual,Year
256,0,0,1,0.601908,0.0,0.867816,1,0,0,1,0.5
26,0,0,1,0.773477,0.0,-0.02069,1,0,0,1,-0.25
215,0,0,1,0.187692,0.0,0.385057,1,0,0,1,-0.5
150,0,0,1,-0.738462,0.0,-0.60046,0,1,0,1,-0.75
148,0,0,1,0.461538,0.0,-0.587356,0,1,0,1,-1.0
268,0,0,1,-0.338462,0.0,-0.028736,1,0,0,1,0.75
82,0,1,0,-0.461538,0.0,2.222989,1,0,1,0,0.75
195,0,0,1,0.153846,0.0,-0.658621,0,1,0,1,0.25
24,0,0,1,0.827046,0.0,-0.188506,1,0,0,1,-0.25
218,0,0,1,0.463938,0.0,0.385057,1,0,0,1,0.0


## Save Data

In [45]:
# Original Data
joblib.dump(train_set_ori, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\train_set_ori.pkl")
joblib.dump(valid_set_ori, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\valid_set_ori.pkl")
joblib.dump(test_set_ori, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\test_set_ori.pkl")
joblib.dump(y_train, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\y_train.pkl")
joblib.dump(y_valid, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\y_valid.pkl")
joblib.dump(y_test, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\y_test.pkl")

# StandartScaler
joblib.dump(X_train_scaler, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_train_scaler.pkl")
joblib.dump(X_valid_scaler, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_valid_scaler.pkl")
joblib.dump(X_test_scaler, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_test_scaler.pkl")

# MinMaxScaler
joblib.dump(X_train_minmax, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_train_minmax.pkl")
joblib.dump(X_valid_minmax, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_valid_minmax.pkl")
joblib.dump(X_test_minmax, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_test_minmax.pkl")

# RobustScaler
joblib.dump(X_train_robust, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_train_robust.pkl")
joblib.dump(X_valid_robust, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_valid_robust.pkl")
joblib.dump(X_test_robust, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_test_robust.pkl")

['C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_test_robust.pkl']