## Preprocessing dan Feature Engineering

## Import Package

In [151]:
#Import library untuk data preparation dan visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import warnings for ignore the warnings
import warnings 
warnings.filterwarnings("ignore")

# import pickle and json file for columns and model file
import pickle
import json
import joblib

#Import package untuk balancing dataset
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

## Read Data

In [152]:
#Read data X_train dan y_train hasil dari data preparation
X_train = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\X_train.pkl")
y_train = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\y_train.pkl")

#Read data X_valid dan y_valid hasil dari data preparation
X_valid = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\X_valid.pkl")
y_valid = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\y_valid.pkl")

#Read data X_test dan y_test hasil dari data preparation
X_test = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\X_test.pkl")
y_test = joblib.load("C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\2 - processed\\y_test.pkl")

In [153]:
#Menyatukan data X_train dan y_train untuk dilakukan preprocessing dan Feature Engineering menjadi satu set
dataset = pd.concat([X_train, y_train], axis = 1)

#Menyatukan data X_valid dan y_valid untuk dilakukan preprocessing dan Feature Engineering menjadi satu set
valid_set = pd.concat([X_valid, y_valid], axis = 1)

#Menyatukan data X_test dan y_test untuk dilakukan preprocessing dan Feature Engineering menjadi satu set
test_set = pd.concat([X_test, y_test], axis = 1)

In [154]:
dataset

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,selling_price
791,2016,12000,Petrol,Individual,Manual,First Owner,350000
2704,2014,50000,Diesel,Individual,Manual,First Owner,1800000
1266,2012,130000,Diesel,Individual,Manual,Second Owner,190000
239,2012,100000,Diesel,Individual,Manual,Second Owner,220000
2929,2002,70000,Petrol,Individual,Manual,Second Owner,80000
...,...,...,...,...,...,...,...
1593,2016,40000,Diesel,Individual,Manual,First Owner,1000000
4060,2014,100000,Diesel,Individual,Manual,Second Owner,600000
1346,2010,100000,Diesel,Individual,Manual,Second Owner,200000
3454,2017,110000,Diesel,Individual,Automatic,First Owner,710000


## Handling Missing Value

In [155]:
from sklearn.impute import SimpleImputer

def imputeData(data, numerical_columns_mean, numerical_columns_median, categorical_columns):
    """
    Fungsi untuk melakukan imputasi data numerik dan kategorikal
    :param data: <pandas dataframe> sample data input
    :param numerical_columns_mean: <list> list kolom numerik data yang akan diimputasi dengan mean
    :param numerical_columns_median: <list> list kolom numerik data yang akan diimputasi dengan median
    :param categorical_columns: <list> list kolom kategorikal data
    :return numerical_data_imputed: <pandas dataframe> data numerik imputed
    :return categorical_data_imputed: <pandas dataframe> data kategorikal imputed
    """
    # Imputasi kolom numerik dengan mean
    numerical_data_mean = data[numerical_columns_mean]
    imputer_numerical_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer_numerical_mean.fit(numerical_data_mean)
    imputed_data_mean = imputer_numerical_mean.transform(numerical_data_mean)
    numerical_data_imputed_mean = pd.DataFrame(imputed_data_mean, columns=numerical_columns_mean, index=numerical_data_mean.index)

    # Imputasi kolom numerik dengan median
    numerical_data_median = data[numerical_columns_median]
    imputer_numerical_median = SimpleImputer(missing_values=np.nan, strategy='median')
    imputer_numerical_median.fit(numerical_data_median)
    imputed_data_median = imputer_numerical_median.transform(numerical_data_median)
    numerical_data_imputed_median = pd.DataFrame(imputed_data_median, columns=numerical_columns_median, index=numerical_data_median.index)

    # Gabungkan kedua data numerik yang telah diimputasi
    numerical_data_imputed = pd.concat([numerical_data_imputed_mean, numerical_data_imputed_median], axis=1)

    # Seleksi data kategorikal
    categorical_data = data[categorical_columns]

    # Imputasi dengan menggunakan modus
    mode = categorical_data.mode().iloc[0]

    # Lakukan imputasi untuk data kategorikal
    categorical_data_imputed = categorical_data.fillna(mode)

    # Gabungkan data numerik dan kategorikal yang telah diimputasi
    data_imputed = pd.concat([numerical_data_imputed, categorical_data_imputed], axis=1)

    return data_imputed

## Parameter Imputasi

In [156]:
# Buat kolom numerik
numerical_column = ["km_driven"]
numerical_column_mean = ["year"]
set_numerik = numerical_column + numerical_column_mean
dataset_column = list(X_train.columns)
categorical_column = list(set(dataset_column).difference(set(set_numerik)))

# Imputasi tabel dataset (train)

In [157]:
X_train_impute = imputeData(data = X_train, 
                            numerical_columns_mean = numerical_column_mean, 
                            numerical_columns_median = numerical_column, 
                            categorical_columns = categorical_column)

In [158]:
X_train_impute

Unnamed: 0,year,km_driven,seller_type,owner,fuel,transmission
791,2016.0,12000.0,Individual,First Owner,Petrol,Manual
2704,2014.0,50000.0,Individual,First Owner,Diesel,Manual
1266,2012.0,130000.0,Individual,Second Owner,Diesel,Manual
239,2012.0,100000.0,Individual,Second Owner,Diesel,Manual
2929,2002.0,70000.0,Individual,Second Owner,Petrol,Manual
...,...,...,...,...,...,...
1593,2016.0,40000.0,Individual,First Owner,Diesel,Manual
4060,2014.0,100000.0,Individual,Second Owner,Diesel,Manual
1346,2010.0,100000.0,Individual,Second Owner,Diesel,Manual
3454,2017.0,110000.0,Individual,First Owner,Diesel,Automatic


## Imputasi tabel valid_set

In [159]:
X_valid_impute = imputeData(data = X_valid, 
                            numerical_columns_mean = numerical_column_mean, 
                            numerical_columns_median = numerical_column, 
                            categorical_columns = categorical_column)

In [160]:
X_valid_impute

Unnamed: 0,year,km_driven,seller_type,owner,fuel,transmission
3040,2006.0,75000.0,Individual,Second Owner,Diesel,Manual
748,2013.0,63654.0,Dealer,First Owner,Diesel,Manual
2593,2008.0,154000.0,Individual,Third Owner,Diesel,Manual
2638,2007.0,80000.0,Individual,Second Owner,Petrol,Manual
1325,2018.0,50000.0,Individual,First Owner,Diesel,Manual
...,...,...,...,...,...,...
1751,2016.0,31367.0,Dealer,First Owner,Diesel,Manual
1093,2017.0,73000.0,Dealer,First Owner,Diesel,Automatic
3266,2013.0,20000.0,Dealer,First Owner,Diesel,Manual
1123,2009.0,68000.0,Individual,Second Owner,Petrol,Manual


## Imputasi tabel test_set

In [161]:
X_test_impute = imputeData(data = X_test, 
                           numerical_columns_mean = numerical_column_mean, 
                           numerical_columns_median = numerical_column, 
                           categorical_columns = categorical_column)

In [162]:
X_test_impute

Unnamed: 0,year,km_driven,seller_type,owner,fuel,transmission
204,2017.0,40000.0,Individual,First Owner,Diesel,Automatic
3456,2017.0,20000.0,Individual,First Owner,Petrol,Manual
2389,2009.0,62000.0,Dealer,Second Owner,Petrol,Manual
3008,2017.0,67000.0,Individual,First Owner,Diesel,Manual
2305,2019.0,3700.0,Individual,First Owner,Petrol,Manual
...,...,...,...,...,...,...
814,2018.0,5000.0,Individual,First Owner,Petrol,Manual
1516,2020.0,25000.0,Dealer,First Owner,Diesel,Manual
1967,2018.0,38217.0,Individual,First Owner,Diesel,Manual
1651,2015.0,60000.0,Individual,First Owner,Petrol,Manual


## Get Dummies

In [163]:
def get_dummies(train_df, input_df):
    # Menggabungkan data train dan input menjadi satu DataFrame
    combined_df = pd.concat([train_df, input_df])
    
    # Mengubah variabel kategorikal menjadi variabel dummy
    dummies_df = pd.get_dummies(combined_df, columns=train_df.select_dtypes(include='object').columns)
    
    # Memisahkan kembali data train dan input
    train_dummies = dummies_df[:train_df.shape[0]]
    input_dummies = dummies_df[train_df.shape[0]:]
    
    return train_dummies, input_dummies

In [164]:
dataset_ohe, valid_set = get_dummies(X_train_impute, X_valid_impute)

In [165]:
dataset, test_set = get_dummies(X_train_impute, X_test_impute)

In [166]:
train_set_ori = dataset[sorted(dataset.columns)]
test_set_ori = test_set[sorted(test_set.columns)]
valid_set_ori = valid_set[sorted(valid_set.columns)]

In [167]:
dataset

Unnamed: 0,year,km_driven,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,transmission_Automatic,transmission_Manual
791,2016.0,12000.0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1
2704,2014.0,50000.0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1
1266,2012.0,130000.0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1
239,2012.0,100000.0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1
2929,2002.0,70000.0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,2016.0,40000.0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1
4060,2014.0,100000.0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1
1346,2010.0,100000.0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1
3454,2017.0,110000.0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0


In [168]:
test_set

Unnamed: 0,year,km_driven,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,transmission_Automatic,transmission_Manual
204,2017.0,40000.0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0
3456,2017.0,20000.0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1
2389,2009.0,62000.0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1
3008,2017.0,67000.0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1
2305,2019.0,3700.0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814,2018.0,5000.0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1
1516,2020.0,25000.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1
1967,2018.0,38217.0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1
1651,2015.0,60000.0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1


In [169]:
valid_set

Unnamed: 0,year,km_driven,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,transmission_Automatic,transmission_Manual
3040,2006.0,75000.0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1
748,2013.0,63654.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1
2593,2008.0,154000.0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1
2638,2007.0,80000.0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1
1325,2018.0,50000.0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1751,2016.0,31367.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1
1093,2017.0,73000.0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
3266,2013.0,20000.0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1
1123,2009.0,68000.0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1


## Standar Scaler

In [170]:
from sklearn.preprocessing import StandardScaler
import pickle
import os

In [171]:
columns_to_scale = ["km_driven","year"]

def fit_scaler(train_data):
    scaler = StandardScaler()
    scaler.fit(train_data.loc[:, columns_to_scale])
    # save scaler
    with open('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    return scaler

def load_scaler(folder_path):
    # load scaler
    file_path = os.path.join(folder_path, 'scaler.pkl')
    with open(file_path, 'rb') as f:
        scaler = pickle.load(f)
    return scaler

def transform_data(data, scaler):
    scaled_data = scaler.transform(data.loc[:, columns_to_scale])
    data.loc[:, columns_to_scale] = scaled_data
    return data

In [172]:
scaler = fit_scaler(dataset)

In [173]:
scaling = load_scaler('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\')

In [174]:
X_train_scaler = transform_data(dataset, scaling)
X_valid_scaler = transform_data(valid_set, scaling)
X_test_scaler= transform_data(test_set, scaling)

In [175]:
X_train_scaler = X_train_scaler[sorted(X_train_scaler.columns)]
X_valid_scaler = X_valid_scaler[sorted(X_valid_scaler.columns)]
X_test_scaler = X_test_scaler[sorted(X_test_scaler.columns)]

In [176]:
X_train_scaler

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,km_driven,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,year
791,0,0,0,0,1,-1.139061,1,0,0,0,0,0,1,0,0,1,0.684492
2704,0,1,0,0,0,-0.344971,1,0,0,0,0,0,1,0,0,1,0.208637
1266,0,1,0,0,0,1.326799,0,0,1,0,0,0,1,0,0,1,-0.267218
239,0,1,0,0,0,0.699885,0,0,1,0,0,0,1,0,0,1,-0.267218
2929,0,0,0,0,1,0.072972,0,0,1,0,0,0,1,0,0,1,-2.646492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,0,1,0,0,0,-0.553942,1,0,0,0,0,0,1,0,0,1,0.684492
4060,0,1,0,0,0,0.699885,0,0,1,0,0,0,1,0,0,1,0.208637
1346,0,1,0,0,0,0.699885,0,0,1,0,0,0,1,0,0,1,-0.743073
3454,0,1,0,0,0,0.908857,1,0,0,0,0,0,1,0,1,0,0.922419


In [177]:
X_valid_scaler

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,km_driven,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,year
3040,0,1,0,0,0,0.177457,0,0,1,0,0,0,1,0,0,1,-1.694782
748,0,1,0,0,0,-0.059641,1,0,0,0,0,1,0,0,0,1,-0.029291
2593,0,1,0,0,0,1.828330,0,0,0,0,1,0,1,0,0,1,-1.218928
2638,0,0,0,0,1,0.281943,0,0,1,0,0,0,1,0,0,1,-1.456855
1325,0,1,0,0,0,-0.344971,1,0,0,0,0,0,1,0,0,1,1.160346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1751,0,1,0,0,0,-0.734347,1,0,0,0,0,1,0,0,0,1,0.684492
1093,0,1,0,0,0,0.135663,1,0,0,0,0,1,0,0,1,0,0.922419
3266,0,1,0,0,0,-0.971884,1,0,0,0,0,1,0,0,0,1,-0.029291
1123,0,0,0,0,1,0.031178,0,0,1,0,0,0,1,0,0,1,-0.981000


In [178]:
X_test_scaler

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,km_driven,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,year
204,0,1,0,0,0,-0.553942,1,0,0,0,0,0,1,0,1,0,0.922419
3456,0,0,0,0,1,-0.971884,1,0,0,0,0,0,1,0,0,1,0.922419
2389,0,0,0,0,1,-0.094205,0,0,1,0,0,1,0,0,0,1,-0.981000
3008,0,1,0,0,0,0.010280,1,0,0,0,0,0,1,0,0,1,0.922419
2305,0,0,0,0,1,-1.312507,1,0,0,0,0,0,1,0,0,1,1.398274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814,0,0,0,0,1,-1.285341,1,0,0,0,0,0,1,0,0,1,1.160346
1516,0,1,0,0,0,-0.867399,1,0,0,0,0,1,0,0,0,1,1.636201
1967,0,1,0,0,0,-0.591201,1,0,0,0,0,0,1,0,0,1,1.160346
1651,0,0,0,0,1,-0.135999,1,0,0,0,0,0,1,0,0,1,0.446564


## MinMaxScaler

In [179]:
from sklearn.preprocessing import MinMaxScaler
import pickle
import os

columns_to_scale = ["km_driven"]

def fit_scaler(train_data):
    scaler = MinMaxScaler()
    scaler.fit(train_data.loc[:, columns_to_scale])
    # save scaler
    with open('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\minmax_scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    return scaler

def load_scaler(folder_path):
    # load scaler
    file_path = os.path.join(folder_path, 'minmax_scaler.pkl')
    with open(file_path, 'rb') as f:
        scaler = pickle.load(f)
    return scaler

def transform_data(data, scaler):
    scaled_data = scaler.transform(data.loc[:, columns_to_scale])
    data.loc[:, columns_to_scale] = scaled_data
    return data

In [180]:
minmax_scaler = fit_scaler(dataset)
minmax_load = load_scaler('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\')

In [181]:
X_train_minmax = transform_data(dataset, minmax_load)
X_valid_minmax = transform_data(valid_set, minmax_load)
X_test_minmax = transform_data(test_set, minmax_load)

In [182]:
X_train_minmax = X_train_minmax[sorted(X_train_minmax.columns)]
X_valid_minmax = X_valid_minmax[sorted(X_valid_minmax.columns)]
X_test_minmax = X_test_minmax[sorted(X_test_minmax.columns)]

In [183]:
X_train_minmax

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,km_driven,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,year
791,0,0,0,0,1,0.014876,1,0,0,0,0,0,1,0,0,1,0.684492
2704,0,1,0,0,0,0.061988,1,0,0,0,0,0,1,0,0,1,0.208637
1266,0,1,0,0,0,0.161170,0,0,1,0,0,0,1,0,0,1,-0.267218
239,0,1,0,0,0,0.123976,0,0,1,0,0,0,1,0,0,1,-0.267218
2929,0,0,0,0,1,0.086783,0,0,1,0,0,0,1,0,0,1,-2.646492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,0,1,0,0,0,0.049590,1,0,0,0,0,0,1,0,0,1,0.684492
4060,0,1,0,0,0,0.123976,0,0,1,0,0,0,1,0,0,1,0.208637
1346,0,1,0,0,0,0.123976,0,0,1,0,0,0,1,0,0,1,-0.743073
3454,0,1,0,0,0,0.136374,1,0,0,0,0,0,1,0,1,0,0.922419


In [184]:
X_valid_minmax

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,km_driven,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,year
3040,0,1,0,0,0,0.092982,0,0,1,0,0,0,1,0,0,1,-1.694782
748,0,1,0,0,0,0.078915,1,0,0,0,0,1,0,0,0,1,-0.029291
2593,0,1,0,0,0,0.190924,0,0,0,0,1,0,1,0,0,1,-1.218928
2638,0,0,0,0,1,0.099181,0,0,1,0,0,0,1,0,0,1,-1.456855
1325,0,1,0,0,0,0.061988,1,0,0,0,0,0,1,0,0,1,1.160346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1751,0,1,0,0,0,0.038887,1,0,0,0,0,1,0,0,0,1,0.684492
1093,0,1,0,0,0,0.090502,1,0,0,0,0,1,0,0,1,0,0.922419
3266,0,1,0,0,0,0.024794,1,0,0,0,0,1,0,0,0,1,-0.029291
1123,0,0,0,0,1,0.084303,0,0,1,0,0,0,1,0,0,1,-0.981000


In [185]:
X_test_minmax

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,km_driven,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,year
204,0,1,0,0,0,0.049590,1,0,0,0,0,0,1,0,1,0,0.922419
3456,0,0,0,0,1,0.024794,1,0,0,0,0,0,1,0,0,1,0.922419
2389,0,0,0,0,1,0.076865,0,0,1,0,0,1,0,0,0,1,-0.981000
3008,0,1,0,0,0,0.083064,1,0,0,0,0,0,1,0,0,1,0.922419
2305,0,0,0,0,1,0.004586,1,0,0,0,0,0,1,0,0,1,1.398274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814,0,0,0,0,1,0.006198,1,0,0,0,0,0,1,0,0,1,1.160346
1516,0,1,0,0,0,0.030993,1,0,0,0,0,1,0,0,0,1,1.636201
1967,0,1,0,0,0,0.047379,1,0,0,0,0,0,1,0,0,1,1.160346
1651,0,0,0,0,1,0.074385,1,0,0,0,0,0,1,0,0,1,0.446564


## RobustScaler

In [186]:
from sklearn.preprocessing import RobustScaler
import pickle
import os

columns_to_scale = ["km_driven"]

def fit_scaler(train_data):
    scaler = RobustScaler()
    scaler.fit(train_data.loc[:, columns_to_scale])
    # save scaler
    with open('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\robust_scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    return scaler

def load_scaler(folder_path):
    # load scaler
    file_path = os.path.join(folder_path, 'robust_scaler.pkl')
    with open(file_path, 'rb') as f:
        scaler = pickle.load(f)
    return scaler

def transform_data(data, scaler):
    scaled_data = scaler.transform(data.loc[:, columns_to_scale])
    data.loc[:, columns_to_scale] = scaled_data
    return data

In [187]:
robust_scaler = fit_scaler(dataset)
robust_load = load_scaler('C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\model\\5 - Model Final\\')

In [188]:
X_train_robust = transform_data(dataset, robust_load)
X_valid_robust = transform_data(valid_set, robust_load)
X_test_robust = transform_data(test_set, robust_load)

In [189]:
X_train_robust = X_train_robust[sorted(X_train_robust.columns)]
X_valid_robust = X_valid_robust[sorted(X_valid_robust.columns)]
X_test_robust = X_test_robust[sorted(X_test_robust.columns)]

In [190]:
X_train_robust

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,km_driven,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,year
791,0,0,0,0,1,-0.872727,1,0,0,0,0,0,1,0,0,1,0.684492
2704,0,1,0,0,0,-0.181818,1,0,0,0,0,0,1,0,0,1,0.208637
1266,0,1,0,0,0,1.272727,0,0,1,0,0,0,1,0,0,1,-0.267218
239,0,1,0,0,0,0.727273,0,0,1,0,0,0,1,0,0,1,-0.267218
2929,0,0,0,0,1,0.181818,0,0,1,0,0,0,1,0,0,1,-2.646492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,0,1,0,0,0,-0.363636,1,0,0,0,0,0,1,0,0,1,0.684492
4060,0,1,0,0,0,0.727273,0,0,1,0,0,0,1,0,0,1,0.208637
1346,0,1,0,0,0,0.727273,0,0,1,0,0,0,1,0,0,1,-0.743073
3454,0,1,0,0,0,0.909091,1,0,0,0,0,0,1,0,1,0,0.922419


In [191]:
X_valid_robust

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,km_driven,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,year
3040,0,1,0,0,0,0.272727,0,0,1,0,0,0,1,0,0,1,-1.694782
748,0,1,0,0,0,0.066436,1,0,0,0,0,1,0,0,0,1,-0.029291
2593,0,1,0,0,0,1.709091,0,0,0,0,1,0,1,0,0,1,-1.218928
2638,0,0,0,0,1,0.363636,0,0,1,0,0,0,1,0,0,1,-1.456855
1325,0,1,0,0,0,-0.181818,1,0,0,0,0,0,1,0,0,1,1.160346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1751,0,1,0,0,0,-0.520600,1,0,0,0,0,1,0,0,0,1,0.684492
1093,0,1,0,0,0,0.236364,1,0,0,0,0,1,0,0,1,0,0.922419
3266,0,1,0,0,0,-0.727273,1,0,0,0,0,1,0,0,0,1,-0.029291
1123,0,0,0,0,1,0.145455,0,0,1,0,0,0,1,0,0,1,-0.981000


In [192]:
X_test_robust

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,km_driven,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,year
204,0,1,0,0,0,-0.363636,1,0,0,0,0,0,1,0,1,0,0.922419
3456,0,0,0,0,1,-0.727273,1,0,0,0,0,0,1,0,0,1,0.922419
2389,0,0,0,0,1,0.036364,0,0,1,0,0,1,0,0,0,1,-0.981000
3008,0,1,0,0,0,0.127273,1,0,0,0,0,0,1,0,0,1,0.922419
2305,0,0,0,0,1,-1.023636,1,0,0,0,0,0,1,0,0,1,1.398274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814,0,0,0,0,1,-1.000000,1,0,0,0,0,0,1,0,0,1,1.160346
1516,0,1,0,0,0,-0.636364,1,0,0,0,0,1,0,0,0,1,1.636201
1967,0,1,0,0,0,-0.396055,1,0,0,0,0,0,1,0,0,1,1.160346
1651,0,0,0,0,1,0.000000,1,0,0,0,0,0,1,0,0,1,0.446564


## Save Data

In [193]:
# Original Data
joblib.dump(train_set_ori, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\train_set_ori.pkl")
joblib.dump(valid_set_ori, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\valid_set_ori.pkl")
joblib.dump(test_set_ori, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\test_set_ori.pkl")
joblib.dump(y_train, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\y_train.pkl")
joblib.dump(y_valid, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\y_valid.pkl")
joblib.dump(y_test, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\y_test.pkl")

# StandartScaler
joblib.dump(X_train_scaler, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_train_scaler.pkl")
joblib.dump(X_valid_scaler, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_valid_scaler.pkl")
joblib.dump(X_test_scaler, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_test_scaler.pkl")

# MinMaxScaler
joblib.dump(X_train_minmax, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_train_minmax.pkl")
joblib.dump(X_valid_minmax, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_valid_minmax.pkl")
joblib.dump(X_test_minmax, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_test_minmax.pkl")

# RobustScaler
joblib.dump(X_train_robust, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_train_robust.pkl")
joblib.dump(X_valid_robust, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_valid_robust.pkl")
joblib.dump(X_test_robust, "C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_test_robust.pkl")

['C:\\Users\\hp\\Portofolio Data Science\\4 - Vehicle Price Prediction\\dataset\\3 - final\\X_test_robust.pkl']