# Library / Packages

In [1]:
# basic
import pandas as pd
import numpy as np
from scipy.stats import mstats

# data preparation
from sklearn.impute import SimpleImputer 
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 

# data modeling
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# data scoring
from sklearn.metrics import mean_squared_error, r2_score

# data tuning

# visualization
import matplotlib.pyplot as plt

# Format

In [2]:
def lab_round(x, pos): 
    if abs(x) >= 1e9: 
        return f'{x/1e9}B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6}M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3}K'
    
    else:
        return f'{x}'
    
def val_round(x):
    if abs(x) >= 1e9:
        return f'{x/1e9:.2f} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6:.2f} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3:.2f} K'
    
    else:
        return f'{x:.2f}'

In [3]:
# === Custom Transformer untuk Menghapus Outlier ===
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5):
        self.factor = factor
        self.bounds = {}

    def fit(self, X, y=None):
        # Hitung batas IQR untuk setiap fitur numerik
        Q1 = X.quantile(0.25)
        Q3 = X.quantile(0.75)
        IQR = Q3 - Q1
        self.bounds = {
            "lower": Q1 - self.factor * IQR,
            "upper": Q3 + self.factor * IQR,
        }
        return self

    def transform(self, X, y=None):
        mask = ~((X < self.bounds["lower"]) | (X > self.bounds["upper"])).any(axis=1)
        return X[mask], y[mask] if y is not None else None

# Read Dataset

In [4]:
# Memuat data train dan test
train_df = pd.read_csv('../dataset/train.csv')
test_df = pd.read_csv('../dataset/test.csv')

In [5]:
# show all column
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Train Dataset

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [7]:
train_df = train_df.drop('Id', axis = 1)
train_df.tail()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125
1459,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,1Fam,1Story,5,6,1965,1965,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,CBlock,TA,TA,No,BLQ,830,LwQ,290,136,1256,GasA,Gd,Y,SBrkr,1256,0,0,1256,1,0,1,1,3,1,TA,6,Typ,0,,Attchd,1965.0,Fin,1,276,TA,TA,Y,736,68,0,0,0,0,,,,0,6,2008,WD,Normal,147500


In [8]:
print(f'Total General Duplicated: {train_df.duplicated().sum()}')

Total General Duplicated: 0


In [9]:
# Menampilkan total null pada setiap kolom
null_columns = train_df.isnull().sum()[train_df.isnull().sum() > 0]
print(null_columns)

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [10]:
# Menampilkan informasi hanya kolom yang memiliki nilai null
null_columns = train_df.columns[train_df.isnull().sum() > 0]

# Menggunakan .info() untuk menampilkan tipe data dari kolom yang memiliki nilai null
train_df[null_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotFrontage   1201 non-null   float64
 1   Alley         91 non-null     object 
 2   MasVnrType    588 non-null    object 
 3   MasVnrArea    1452 non-null   float64
 4   BsmtQual      1423 non-null   object 
 5   BsmtCond      1423 non-null   object 
 6   BsmtExposure  1422 non-null   object 
 7   BsmtFinType1  1423 non-null   object 
 8   BsmtFinType2  1422 non-null   object 
 9   Electrical    1459 non-null   object 
 10  FireplaceQu   770 non-null    object 
 11  GarageType    1379 non-null   object 
 12  GarageYrBlt   1379 non-null   float64
 13  GarageFinish  1379 non-null   object 
 14  GarageQual    1379 non-null   object 
 15  GarageCond    1379 non-null   object 
 16  PoolQC        7 non-null      object 
 17  Fence         281 non-null    object 
 18  MiscFeature   54 non-null   

In [11]:
# Mengelompokkan kolom yang memiliki nilai null ke dalam float_col dan str_col
numeric_col = []
obj_col = []

for col in null_columns:
    if train_df[col].dtype in ['int', 'float']:
        numeric_col.append(col)
        
    elif train_df[col].dtype == 'object':
        obj_col.append(col)

print("Numeric Columns with Null Values:", numeric_col)
print("String Columns with Null Values:", obj_col)

Numeric Columns with Null Values: ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
String Columns with Null Values: ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [12]:
# mempertahankan original columns
original_cols = train_df.columns

In [13]:
# Pipeline untuk numerik: imputasi nilai null dengan median
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy = "mean"))
])

# Pipeline untuk kategori: imputasi nilai null dengan modus
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy = "most_frequent"))
])

# ColumnTransformer untuk menggabungkan proses imputasi
preprocessor_stage1 = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numeric_col),
        ("cat", categorical_pipeline, obj_col),
    ], remainder = "passthrough")

In [14]:
# before
print(len(train_df))

# Transform data menggunakan fit_transform pada tahap 1
train_df = preprocessor_stage1.fit_transform(train_df)

# implement original column
train_df = pd.DataFrame(train_df, columns = original_cols)

# after
print(len(train_df))

1460
1460


In [15]:
train_df.tail()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,62.0,0.0,1999.0,Grvl,BrkFace,Gd,TA,No,Unf,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Gd,MnPrv,Shed,60,RL,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,TA,TA,PConc,0,0,953,953,GasA,Ex,Y,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,2,460,Y,0,40,0,0,0,0,0,8,2007,WD,Normal,175000
1456,85.0,119.0,1978.0,Grvl,Stone,Gd,TA,No,ALQ,Rec,SBrkr,TA,Attchd,Unf,TA,TA,Gd,MnPrv,Shed,20,RL,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,TA,TA,CBlock,790,163,589,1542,GasA,TA,Y,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,2,500,Y,349,0,0,0,0,0,0,2,2010,WD,Normal,210000
1457,66.0,0.0,1941.0,Grvl,BrkFace,TA,Gd,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,Gd,GdPrv,Shed,70,RL,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,Ex,Gd,Stone,275,0,877,1152,GasA,Ex,Y,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,1,252,Y,0,60,0,0,0,0,2500,5,2010,WD,Normal,266500
1458,68.0,0.0,1950.0,Grvl,BrkFace,TA,TA,Mn,GLQ,Rec,FuseA,Gd,Attchd,Unf,TA,TA,Gd,MnPrv,Shed,20,RL,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,TA,TA,CBlock,49,1029,0,1078,GasA,Gd,Y,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,1,240,Y,366,0,112,0,0,0,0,4,2010,WD,Normal,142125
1459,75.0,0.0,1965.0,Grvl,BrkFace,TA,TA,No,BLQ,LwQ,SBrkr,Gd,Attchd,Fin,TA,TA,Gd,MnPrv,Shed,20,RL,9937,Pave,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,1Fam,1Story,5,6,1965,1965,Gable,CompShg,HdBoard,HdBoard,Gd,TA,CBlock,830,290,136,1256,GasA,Gd,Y,1256,0,0,1256,1,0,1,1,3,1,TA,6,Typ,0,1,276,Y,736,68,0,0,0,0,0,6,2008,WD,Normal,147500


In [16]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSSubClass     1460 non-null   object
 1   MSZoning       1460 non-null   object
 2   LotFrontage    1460 non-null   object
 3   LotArea        1460 non-null   object
 4   Street         1460 non-null   object
 5   Alley          1460 non-null   object
 6   LotShape       1460 non-null   object
 7   LandContour    1460 non-null   object
 8   Utilities      1460 non-null   object
 9   LotConfig      1460 non-null   object
 10  LandSlope      1460 non-null   object
 11  Neighborhood   1460 non-null   object
 12  Condition1     1460 non-null   object
 13  Condition2     1460 non-null   object
 14  BldgType       1460 non-null   object
 15  HouseStyle     1460 non-null   object
 16  OverallQual    1460 non-null   object
 17  OverallCond    1460 non-null   object
 18  YearBuilt      1460 non-null

In [19]:
numeric_col

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [18]:
train_df[numeric_col].astype('float64')

ValueError: could not convert string to float: 'AllPub'

In [None]:
train_df.info()

In [None]:
# Pastikan train_df adalah DataFrame
if not isinstance(train_df, pd.DataFrame):
    train_df = pd.DataFrame(train_df)

# Menampilkan total null pada setiap kolom
null_columns = train_df.isnull().sum()[train_df.isnull().sum() > 0]
print(null_columns)

## Test Dataset

In [None]:
test_df.info()

In [None]:
test_df = test_df.drop('Id', axis = 1)
test_df.tail()

In [None]:
print(f'Total General Duplicated: {test_df.duplicated().sum()}')

In [None]:
# Menampilkan total null pada setiap kolom
null_columns = test_df.isnull().sum()[test_df.isnull().sum() > 0]
print(null_columns)

In [None]:
# Menampilkan informasi hanya kolom yang memiliki nilai null
null_columns = test_df.columns[test_df.isnull().sum() > 0]

# Menggunakan .info() untuk menampilkan tipe data dari kolom yang memiliki nilai null
test_df[null_columns].info()

In [None]:
# Mengelompokkan kolom yang memiliki nilai null ke dalam float_col dan str_col
numeric_col = []
obj_col = []

for col in null_columns:
    if test_df[col].dtype in ['int', 'float']:
        numeric_col.append(col)
        
    elif test_df[col].dtype == 'object':
        obj_col.append(col)

print("Numeric Columns with Null Values:", numeric_col)
print("String Columns with Null Values:", obj_col)

In [None]:
# mempertahankan original columns
original_cols = test_df.columns

In [None]:
# Pipeline untuk numerik: imputasi nilai null dengan median
numerical_pipeline = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "mean"))
])

# Pipeline untuk kategori: imputasi nilai null dengan modus
categorical_pipeline = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent"))
])

# ColumnTransformer untuk menggabungkan proses imputasi
preprocessor_stage1 = ColumnTransformer(
    transformers = [
        ("num", numerical_pipeline, numeric_col),
        ("cat", categorical_pipeline, obj_col),
    ], remainder = "passthrough")

In [None]:
# before
print(len(test_df))

# Transform data menggunakan fit_transform pada tahap 1
test_df = preprocessor_stage1.fit_transform(test_df)

# Pastikan header kolom tetap sama
test_df = pd.DataFrame(test_df, columns = original_cols)

# after
print(len(test_df))

In [None]:
test_df.tail()

In [None]:
# Pastikan test_df adalah DataFrame
if not isinstance(test_df, pd.DataFrame):
    test_df = pd.DataFrame(test_df)

# Menampilkan total null pada setiap kolom
null_columns = test_df.isnull().sum()[test_df.isnull().sum() > 0]
print(null_columns)

# Preparation

In [None]:
# Memisahkan kolom target dari data
target_col = 'SalePrice'

# Memastikan kolom target ada di dalam DataFrame sebelum mencoba memisahkannya
if target_col in train_df.columns:
    X_train = train_df.drop(columns = [target_col])
    y_train = train_df[target_col]

else:
    X_train = train_df  # Tidak memisahkan kolom target jika tidak ada
    y_train = None  # Set y_train ke None jika kolom target tidak ditemukan

if target_col in test_df.columns:
    X_test = test_df.drop(columns = [target_col])
    
else:
    X_test = test_df  # Tidak memisahkan kolom target jika tidak ada

In [None]:
# ENCODING

# Daftar kolom untuk label encoding (kolom ordinal)
encoding_set = {'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 
                'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 
                'FireplaceQu', 'GarageQual', 'GarageCond'}

# Inisialisasi list untuk menyimpan kolom yang telah dikelompokkan
X_train_ordinal_encoding_cols = []
X_train_one_hot_encoding_cols = []
X_train_numeric_cols = []

# Mengelompokkan kolom berdasarkan tipe data
for col in X_train.columns:
    if X_train[col].dtype in ['int', 'float']:
        X_train_numeric_cols.append(col)

    elif test_df[col].dtype == 'object':
        if col in encoding_set:
            X_train_ordinal_encoding_cols.append(col)

        else:
            X_train_one_hot_encoding_cols.append(col)

# Menampilkan hasil
print("Ordinal Encoding Columns:", X_train_ordinal_encoding_cols)
print("One-Hot Encoding Columns:", X_train_one_hot_encoding_cols)
print("Numeric Columns:", X_train_numeric_cols)

In [None]:
# Identifikasi kolom untuk setiap jenis encoding
numeric_cols = X_train_numeric_cols
ordinal_encoding_cols = X_train_ordinal_encoding_cols
categorical_cols = X_train_one_hot_encoding_cols

In [None]:
numeric_cols

In [None]:
# Irisan pada kedua dataset
ordinal_encoding_cols = list(set(ordinal_encoding_cols) & set(X_test.columns))
one_hot_encoding_cols = list(set(categorical_cols) & set(X_test.columns))
numeric_cols = list(set(numeric_cols) & set(X_test.columns))

In [None]:
ordinal_encoding_cols

In [None]:
sample = 

In [None]:
# Definisikan pipeline untuk setiap tipe fitur
numerical_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', StandardScaler())
])

ordinal_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1))
])

categorical_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
])

In [None]:
# Standarisasi fitur numerik dan one-hot encoding fitur kategorikal
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
ordinal_transformer = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)

preprocessor_stage2 = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numeric_cols), 
        ("cat", categorical_transformer, one_hot_encoding_cols), 
        ("ord", ordinal_transformer, ordinal_encoding_cols)
    ], remainder = "drop")

# Modeling

In [None]:
# Membuat pipeline yang menggabungkan preprocessing dengan model
model_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor_stage2),
    ('regressor', LinearRegression())
])

In [None]:
# Definisikan parameter grid untuk GridSearchCV dengan beberapa model
param_grid = [
    {'regressor': [LinearRegression()]},
    {
        'regressor': [Ridge()],
        'regressor__alpha': [0.1, 1.0, 100.0, 1000.0, 10000.0], 
        'regressor__max_iter': [50000, 100000, 200000], 
        'regressor__tol': [1e-3, 1e-4, 1e-6] 
    },
    {
        'regressor': [Lasso()],
        'regressor__alpha': [0.1, 1.0, 100.0, 1000.0, 10000.0],
        'regressor__max_iter': [50000, 100000, 200000],
        'regressor__tol': [1e-3, 1e-4, 1e-6]
    },
]

In [None]:
X_train.head()

In [None]:
# Memeriksa apakah X_train tersedia dan sesuai dengan y_train
if 'X_train' in locals() and 'y_train' in locals():
    if y_train is not None and X_train.shape[0] == y_train.shape[0]:
        # Melakukan Grid Search dengan validasi silang
        grid_search = GridSearchCV(
            estimator=model_pipeline,
            param_grid=param_grid,
            cv=5,  # Jumlah lipatan untuk validasi silang
            scoring='neg_mean_squared_error',  # Metode evaluasi
            error_score=np.nan,  # Tangani kesalahan evaluasi
            verbose=1  # Tampilkan informasi selama proses
        )
        
        # Melatih model
        grid_search.fit(X_train, y_train)
        print("Grid search selesai. Model terbaik:", grid_search.best_params_)
    else:
        raise ValueError("Ukuran X_train dan y_train tidak cocok atau y_train tidak tersedia.")
else:
    raise NameError("X_train atau y_train belum didefinisikan.")

In [None]:
# Model terbaik dari Grid Search
best_model = grid_search.best_estimator_

# Prediksi harga rumah pada data testing menggunakan model terbaik
y_pred = best_model.predict(X_test)

In [None]:
# Menampilkan prediksi
print("Predicted prices:", y_pred)
print(f'Best parameters: {grid_search.best_params_}')