# Library / Packages

In [1]:
# basic
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde

# data preparation
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 

# data modeling
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# data scoring
from sklearn.metrics import classification_report

# data tuning   
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

# visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# Format

In [2]:
def lab_round(x, pos): 
    if abs(x) >= 1e9: 
        return f'{x/1e9} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3} K'
    
    else:
        return f'{x}'
    
def val_round(x):
    if abs(x) >= 1e9:
        return f'{x/1e9:.2f} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6:.2f} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3:.2f} K'
    
    else:
        return f'{x:.2f}'

In [3]:
def filter_outliers_iqr(df, columns = None, threshold = 1.5):
    # Jika tidak ada kolom yang ditentukan, gunakan semua kolom numerik
    if columns is None:
        columns = df.select_dtypes(include = ["number"]).columns.tolist()
    
    # Salin DataFrame untuk memastikan tidak ada modifikasi langsung
    df_filtered = df.copy()
    
    for column in columns:
        # Hitung Q1, Q3, dan IQR
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        
        # Hitung batas bawah dan atas
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        
        # Hapus baris dengan outlier
        df_filtered = df_filtered[(df_filtered[column] >= lower_bound) & (df_filtered[column] <= upper_bound)]
    
    return df_filtered

In [4]:
# Fungsi untuk konversi tipe data
def convert_object_columns_to_numeric(df):
    for col in df.select_dtypes(include = ['object']).columns:  
        try:
            # Cek apakah semua nilai bisa dikonversi ke float
            df[col] = pd.to_numeric(df[col], errors='raise')
            
            # Jika bisa, ubah ke int jika semua nilai adalah bilangan bulat
            if all(df[col] % 1 == 0):  # Cek apakah semua nilai adalah bilangan bulat
                df[col] = df[col].astype(int)

        except ValueError:
            pass  # Jika ada nilai non-angka, biarkan tetap object
        
    return df

# Dataset

In [5]:
# Memuat data train dan test
train_df = pd.read_csv('../dataset/train.csv')
test_df = pd.read_csv('../dataset/test.csv')

In [6]:
# show all column
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Train Dataset

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [8]:
# drop column
train_df = train_df.drop('Id', axis = 1)

# convert object if all numeric
train_df = convert_object_columns_to_numeric(train_df)

# check duplicate general data
print(f'Total General Duplicated: {train_df.duplicated().sum()} \n')
train_df.info()

Total General Duplicated: 0 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 no

In [9]:
# Menambahkan underscore di antara huruf kapital
train_df.columns = train_df.columns.str.replace(r'([a-z])([A-Z])', r'\1_\2', regex = True)

# change into lowercase
train_df.columns = train_df.columns.str.lower()

# Memeriksa nama kolom setelah perubahan
list(train_df.columns)

['mssub_class',
 'mszoning',
 'lot_frontage',
 'lot_area',
 'street',
 'alley',
 'lot_shape',
 'land_contour',
 'utilities',
 'lot_config',
 'land_slope',
 'neighborhood',
 'condition1',
 'condition2',
 'bldg_type',
 'house_style',
 'overall_qual',
 'overall_cond',
 'year_built',
 'year_remod_add',
 'roof_style',
 'roof_matl',
 'exterior1st',
 'exterior2nd',
 'mas_vnr_type',
 'mas_vnr_area',
 'exter_qual',
 'exter_cond',
 'foundation',
 'bsmt_qual',
 'bsmt_cond',
 'bsmt_exposure',
 'bsmt_fin_type1',
 'bsmt_fin_sf1',
 'bsmt_fin_type2',
 'bsmt_fin_sf2',
 'bsmt_unf_sf',
 'total_bsmt_sf',
 'heating',
 'heating_qc',
 'central_air',
 'electrical',
 '1st_flr_sf',
 '2nd_flr_sf',
 'low_qual_fin_sf',
 'gr_liv_area',
 'bsmt_full_bath',
 'bsmt_half_bath',
 'full_bath',
 'half_bath',
 'bedroom_abv_gr',
 'kitchen_abv_gr',
 'kitchen_qual',
 'tot_rms_abv_grd',
 'functional',
 'fireplaces',
 'fireplace_qu',
 'garage_type',
 'garage_yr_blt',
 'garage_finish',
 'garage_cars',
 'garage_area',
 'garage_qua

In [10]:
train_df.head()

Unnamed: 0,mssub_class,mszoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition1,condition2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod_add,roof_style,roof_matl,exterior1st,exterior2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmt_fin_type1,bsmt_fin_sf1,bsmt_fin_type2,bsmt_fin_sf2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abv_gr,kitchen_abv_gr,kitchen_qual,tot_rms_abv_grd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,sale_price
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [11]:
# null column
null_numeric = []
null_obj = []

# 
null_columns = train_df.columns[train_df.isnull().sum() > 0]

for col in null_columns:
    if train_df[col].dtype in ['int', 'float']:
        null_numeric.append(col)
        
    elif train_df[col].dtype == 'object':
        null_obj.append(col)

# 
print("Null Numeric:", null_numeric)
print("Null String:", null_obj)

Null Numeric: ['lot_frontage', 'mas_vnr_area', 'garage_yr_blt']
Null String: ['alley', 'mas_vnr_type', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmt_fin_type1', 'bsmt_fin_type2', 'electrical', 'fireplace_qu', 'garage_type', 'garage_finish', 'garage_qual', 'garage_cond', 'pool_qc', 'fence', 'misc_feature']


In [12]:
# 
num_cols = []
obj_cols = []

for col in train_df:
    if train_df[col].dtype in ['int', 'float']:
        num_cols.append(col)
        
    elif train_df[col].dtype == 'object':
        obj_cols.append(col)

# 
print("Numeric Cols:", num_cols)
print("String Cols:", obj_cols)

Numeric Cols: ['lot_frontage', 'mas_vnr_area', 'garage_yr_blt']
String Cols: ['mszoning', 'street', 'alley', 'lot_shape', 'land_contour', 'utilities', 'lot_config', 'land_slope', 'neighborhood', 'condition1', 'condition2', 'bldg_type', 'house_style', 'roof_style', 'roof_matl', 'exterior1st', 'exterior2nd', 'mas_vnr_type', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmt_fin_type1', 'bsmt_fin_type2', 'heating', 'heating_qc', 'central_air', 'electrical', 'kitchen_qual', 'functional', 'fireplace_qu', 'garage_type', 'garage_finish', 'garage_qual', 'garage_cond', 'paved_drive', 'pool_qc', 'fence', 'misc_feature', 'sale_type', 'sale_condition']


In [13]:
# Original columns
train_original = train_df.columns

# Numeric Pipeline
numerical_pipeline = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "mean"))
])

# String Pipeline
categorical_pipeline = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent"))
])

# ColumnTransformer untuk menggabungkan proses imputasi
prep_stage_1 = ColumnTransformer(
    transformers = [
        ("num", numerical_pipeline, num_cols), 
        ("cat", categorical_pipeline, obj_cols), 
    ], 
    remainder = "passthrough", 
    verbose_feature_names_out = True)

In [14]:
# Transform data menggunakan fit_transform pada tahap 1
train_df = prep_stage_1.fit_transform(train_df)

# Columns After: ubah kembali ke DataFrame dengan kolom dari prep_stage_1
train_df = pd.DataFrame(train_df, columns = prep_stage_1.get_feature_names_out())

# Hilangkan prefix (misalnya, "num__", "cat__", "out__")
clean_columns = [col.split("__", 1)[-1] for col in train_df.columns]
train_df.columns = clean_columns

In [15]:
# Menampilkan total null pada setiap kolom
null_columns = train_df.isnull().sum()[train_df.isnull().sum() > 0]
print(f'Total null columns: {null_columns} \n')
train_df.info()

Total null columns: Series([], dtype: int64) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   lot_frontage     1460 non-null   object
 1   mas_vnr_area     1460 non-null   object
 2   garage_yr_blt    1460 non-null   object
 3   mszoning         1460 non-null   object
 4   street           1460 non-null   object
 5   alley            1460 non-null   object
 6   lot_shape        1460 non-null   object
 7   land_contour     1460 non-null   object
 8   utilities        1460 non-null   object
 9   lot_config       1460 non-null   object
 10  land_slope       1460 non-null   object
 11  neighborhood     1460 non-null   object
 12  condition1       1460 non-null   object
 13  condition2       1460 non-null   object
 14  bldg_type        1460 non-null   object
 15  house_style      1460 non-null   object
 16  roof_style       1460 non-null 

In [16]:
# change object after transform
train_df = convert_object_columns_to_numeric(train_df)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   lot_frontage     1460 non-null   float64
 1   mas_vnr_area     1460 non-null   float64
 2   garage_yr_blt    1460 non-null   float64
 3   mszoning         1460 non-null   object 
 4   street           1460 non-null   object 
 5   alley            1460 non-null   object 
 6   lot_shape        1460 non-null   object 
 7   land_contour     1460 non-null   object 
 8   utilities        1460 non-null   object 
 9   lot_config       1460 non-null   object 
 10  land_slope       1460 non-null   object 
 11  neighborhood     1460 non-null   object 
 12  condition1       1460 non-null   object 
 13  condition2       1460 non-null   object 
 14  bldg_type        1460 non-null   object 
 15  house_style      1460 non-null   object 
 16  roof_style       1460 non-null   object 
 17  roof_matl     

In [17]:
# Cetak jumlah baris sebelum filter
print(f"Total Rows Before Filtering: {len(train_df)}")

# Pilih kolom numerik
num_cols = train_df.select_dtypes(include = ["number"]).columns

# Terapkan filter pada kolom numerik
train_df = filter_outliers_iqr(train_df, columns = num_cols)

# Cetak jumlah baris setelah filter
print(f"Total Rows After Filtering: {len(train_df)}")

Total Rows Before Filtering: 1460
Total Rows After Filtering: 557


In [18]:
# Daftar kolom untuk label encoding (kolom ordinal)
encoding_set = {'overall_qual', 'overall_cond', 'exter_qual', 'exter_cond', 
                'bsmt_qual', 'Bsmt_cond', 'heating_qc', 'kitchen_qual', 
                'fireplace_qu', 'garage_qual', 'garage_cond'}

# Inisialisasi list untuk menyimpan kolom yang telah dikelompokkan
train_ordinal_cols = []
train_one_hot_cols = []
train_numeric_cols = []

# Mengelompokkan kolom berdasarkan tipe data
for col in train_df.columns:
    if train_df[col].dtype in ['int', 'float']:
        train_numeric_cols.append(col)

    elif train_df[col].dtype == 'object':
        if col in encoding_set:
            train_ordinal_cols.append(col)

        else:
            train_one_hot_cols.append(col)

# Menampilkan hasil
print("Ordinal Encoding Columns:", train_ordinal_cols)
print("One-Hot Encoding Columns:", train_one_hot_cols)
print("Numeric Columns:", train_numeric_cols)

Ordinal Encoding Columns: ['exter_qual', 'exter_cond', 'bsmt_qual', 'heating_qc', 'kitchen_qual', 'fireplace_qu', 'garage_qual', 'garage_cond']
One-Hot Encoding Columns: ['mszoning', 'street', 'alley', 'lot_shape', 'land_contour', 'utilities', 'lot_config', 'land_slope', 'neighborhood', 'condition1', 'condition2', 'bldg_type', 'house_style', 'roof_style', 'roof_matl', 'exterior1st', 'exterior2nd', 'mas_vnr_type', 'foundation', 'bsmt_cond', 'bsmt_exposure', 'bsmt_fin_type1', 'bsmt_fin_type2', 'heating', 'central_air', 'electrical', 'functional', 'garage_type', 'garage_finish', 'paved_drive', 'pool_qc', 'fence', 'misc_feature', 'sale_type', 'sale_condition']
Numeric Columns: ['lot_frontage', 'mas_vnr_area', 'garage_yr_blt', 'mssub_class', 'lot_area', 'overall_qual', 'overall_cond', 'year_built', 'year_remod_add', 'bsmt_fin_sf1', 'bsmt_fin_sf2', 'bsmt_unf_sf', 'total_bsmt_sf', '1st_flr_sf', '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath', 'bsmt_half_bath', 'full_bath', 'h

## Test Dataset

In [19]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [20]:
# drop column
test_df = test_df.drop('Id', axis = 1)

# convert object if all numeric
test_df = convert_object_columns_to_numeric(test_df)

# check duplicate general data
print(f'Total General Duplicated: {test_df.duplicated().sum()} \n')
test_df.info()

Total General Duplicated: 0 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1455 non-null   object 
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          107 non-null    object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1457 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 no

In [21]:
# Menambahkan underscore di antara huruf kapital
test_df.columns = test_df.columns.str.replace(r'([a-z])([A-Z])', r'\1_\2', regex = True)

# change into lowercase
test_df.columns = test_df.columns.str.lower()

# Memeriksa nama kolom setelah perubahan
list(test_df.columns)

['mssub_class',
 'mszoning',
 'lot_frontage',
 'lot_area',
 'street',
 'alley',
 'lot_shape',
 'land_contour',
 'utilities',
 'lot_config',
 'land_slope',
 'neighborhood',
 'condition1',
 'condition2',
 'bldg_type',
 'house_style',
 'overall_qual',
 'overall_cond',
 'year_built',
 'year_remod_add',
 'roof_style',
 'roof_matl',
 'exterior1st',
 'exterior2nd',
 'mas_vnr_type',
 'mas_vnr_area',
 'exter_qual',
 'exter_cond',
 'foundation',
 'bsmt_qual',
 'bsmt_cond',
 'bsmt_exposure',
 'bsmt_fin_type1',
 'bsmt_fin_sf1',
 'bsmt_fin_type2',
 'bsmt_fin_sf2',
 'bsmt_unf_sf',
 'total_bsmt_sf',
 'heating',
 'heating_qc',
 'central_air',
 'electrical',
 '1st_flr_sf',
 '2nd_flr_sf',
 'low_qual_fin_sf',
 'gr_liv_area',
 'bsmt_full_bath',
 'bsmt_half_bath',
 'full_bath',
 'half_bath',
 'bedroom_abv_gr',
 'kitchen_abv_gr',
 'kitchen_qual',
 'tot_rms_abv_grd',
 'functional',
 'fireplaces',
 'fireplace_qu',
 'garage_type',
 'garage_yr_blt',
 'garage_finish',
 'garage_cars',
 'garage_area',
 'garage_qua

In [22]:
# null column
null_numeric = []
null_obj = []

# 
null_columns = test_df.columns[test_df.isnull().sum() > 0]

for col in null_columns:
    if test_df[col].dtype in ['int', 'float']:
        null_numeric.append(col)
        
    elif test_df[col].dtype == 'object':
        null_obj.append(col)

# 
print("Null Numeric:", null_numeric)
print("Null String:", null_obj)

Null Numeric: ['lot_frontage', 'mas_vnr_area', 'bsmt_fin_sf1', 'bsmt_fin_sf2', 'bsmt_unf_sf', 'total_bsmt_sf', 'bsmt_full_bath', 'bsmt_half_bath', 'garage_yr_blt', 'garage_cars', 'garage_area']
Null String: ['mszoning', 'alley', 'utilities', 'exterior1st', 'exterior2nd', 'mas_vnr_type', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmt_fin_type1', 'bsmt_fin_type2', 'kitchen_qual', 'functional', 'fireplace_qu', 'garage_type', 'garage_finish', 'garage_qual', 'garage_cond', 'pool_qc', 'fence', 'misc_feature', 'sale_type']


In [23]:
# 
num_cols = []
obj_cols = []

for col in test_df:
    if test_df[col].dtype in ['int', 'float']:
        num_cols.append(col)
        
    elif test_df[col].dtype == 'object':
        obj_cols.append(col)

# 
print("Numeric Cols:", num_cols)
print("String Cols:", obj_cols)

Numeric Cols: ['lot_frontage', 'mas_vnr_area', 'bsmt_fin_sf1', 'bsmt_fin_sf2', 'bsmt_unf_sf', 'total_bsmt_sf', 'bsmt_full_bath', 'bsmt_half_bath', 'garage_yr_blt', 'garage_cars', 'garage_area']
String Cols: ['mszoning', 'street', 'alley', 'lot_shape', 'land_contour', 'utilities', 'lot_config', 'land_slope', 'neighborhood', 'condition1', 'condition2', 'bldg_type', 'house_style', 'roof_style', 'roof_matl', 'exterior1st', 'exterior2nd', 'mas_vnr_type', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmt_fin_type1', 'bsmt_fin_type2', 'heating', 'heating_qc', 'central_air', 'electrical', 'kitchen_qual', 'functional', 'fireplace_qu', 'garage_type', 'garage_finish', 'garage_qual', 'garage_cond', 'paved_drive', 'pool_qc', 'fence', 'misc_feature', 'sale_type', 'sale_condition']


In [24]:
# Original columns
test_original = test_df.columns

# Numeric Pipeline
numerical_pipeline = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "mean"))
])

# String Pipeline
categorical_pipeline = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent"))
])

# ColumnTransformer untuk menggabungkan proses imputasi
prep_stage_1 = ColumnTransformer(
    transformers = [
        ("num", numerical_pipeline, num_cols), 
        ("cat", categorical_pipeline, obj_cols), 
    ], 
    remainder = "passthrough", 
    verbose_feature_names_out = True)

In [25]:
# Transform data menggunakan fit_transform pada tahap 1
test_df = prep_stage_1.fit_transform(test_df)

# Columns After: ubah kembali ke DataFrame dengan kolom dari prep_stage_1
test_df = pd.DataFrame(test_df, columns = prep_stage_1.get_feature_names_out())

# Hilangkan prefix (misalnya, "num__", "cat__", "out__")
clean_columns = [col.split("__", 1)[-1] for col in test_df.columns]
test_df.columns = clean_columns

In [26]:
# Menampilkan total null pada setiap kolom
null_columns = test_df.isnull().sum()[test_df.isnull().sum() > 0]
print(f'Total null columns: {null_columns} \n')
test_df.info()

Total null columns: Series([], dtype: int64) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   lot_frontage     1459 non-null   object
 1   mas_vnr_area     1459 non-null   object
 2   bsmt_fin_sf1     1459 non-null   object
 3   bsmt_fin_sf2     1459 non-null   object
 4   bsmt_unf_sf      1459 non-null   object
 5   total_bsmt_sf    1459 non-null   object
 6   bsmt_full_bath   1459 non-null   object
 7   bsmt_half_bath   1459 non-null   object
 8   garage_yr_blt    1459 non-null   object
 9   garage_cars      1459 non-null   object
 10  garage_area      1459 non-null   object
 11  mszoning         1459 non-null   object
 12  street           1459 non-null   object
 13  alley            1459 non-null   object
 14  lot_shape        1459 non-null   object
 15  land_contour     1459 non-null   object
 16  utilities        1459 non-null 

In [27]:
# change object after transform
test_df = convert_object_columns_to_numeric(test_df)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   lot_frontage     1459 non-null   float64
 1   mas_vnr_area     1459 non-null   float64
 2   bsmt_fin_sf1     1459 non-null   float64
 3   bsmt_fin_sf2     1459 non-null   float64
 4   bsmt_unf_sf      1459 non-null   float64
 5   total_bsmt_sf    1459 non-null   float64
 6   bsmt_full_bath   1459 non-null   float64
 7   bsmt_half_bath   1459 non-null   float64
 8   garage_yr_blt    1459 non-null   float64
 9   garage_cars      1459 non-null   float64
 10  garage_area      1459 non-null   float64
 11  mszoning         1459 non-null   object 
 12  street           1459 non-null   object 
 13  alley            1459 non-null   object 
 14  lot_shape        1459 non-null   object 
 15  land_contour     1459 non-null   object 
 16  utilities        1459 non-null   object 
 17  lot_config    

In [28]:
# Cetak jumlah baris sebelum filter
print(f"Total Rows Before Filtering: {len(test_df)}")

# Pilih kolom numerik
num_cols = test_df.select_dtypes(include = ["number"]).columns

# Terapkan filter pada kolom numerik
test_df = filter_outliers_iqr(test_df, columns = num_cols)

# Cetak jumlah baris setelah filter
print(f"Total Rows After Filtering: {len(test_df)}")

Total Rows Before Filtering: 1459
Total Rows After Filtering: 543


In [29]:
# Daftar kolom untuk label encoding (kolom ordinal)
encoding_set = {'overall_qual', 'overall_cond', 'exter_qual', 'exter_cond', 
                'bsmt_qual', 'Bsmt_cond', 'heating_qc', 'kitchen_qual', 
                'fireplace_qu', 'garage_qual', 'garage_cond'}

# Inisialisasi list untuk menyimpan kolom yang telah dikelompokkan
test_ordinal_cols = []
test_one_hot_cols = []
test_numeric_cols = []

# Mengelompokkan kolom berdasarkan tipe data
for col in test_df.columns:
    if test_df[col].dtype in ['int', 'float']:
        test_numeric_cols.append(col)

    elif test_df[col].dtype == 'object':
        if col in encoding_set:
            test_ordinal_cols.append(col)

        else:
            test_one_hot_cols.append(col)

# Menampilkan hasil
print("Ordinal Encoding Columns:", test_ordinal_cols)
print("One-Hot Encoding Columns:", test_one_hot_cols)
print("Numeric Columns:", test_numeric_cols)

Ordinal Encoding Columns: ['exter_qual', 'exter_cond', 'bsmt_qual', 'heating_qc', 'kitchen_qual', 'fireplace_qu', 'garage_qual', 'garage_cond']
One-Hot Encoding Columns: ['mszoning', 'street', 'alley', 'lot_shape', 'land_contour', 'utilities', 'lot_config', 'land_slope', 'neighborhood', 'condition1', 'condition2', 'bldg_type', 'house_style', 'roof_style', 'roof_matl', 'exterior1st', 'exterior2nd', 'mas_vnr_type', 'foundation', 'bsmt_cond', 'bsmt_exposure', 'bsmt_fin_type1', 'bsmt_fin_type2', 'heating', 'central_air', 'electrical', 'functional', 'garage_type', 'garage_finish', 'paved_drive', 'pool_qc', 'fence', 'misc_feature', 'sale_type', 'sale_condition']
Numeric Columns: ['lot_frontage', 'mas_vnr_area', 'bsmt_fin_sf1', 'bsmt_fin_sf2', 'bsmt_unf_sf', 'total_bsmt_sf', 'bsmt_full_bath', 'bsmt_half_bath', 'garage_yr_blt', 'garage_cars', 'garage_area', 'mssub_class', 'lot_area', 'overall_qual', 'overall_cond', 'year_built', 'year_remod_add', '1st_flr_sf', '2nd_flr_sf', 'low_qual_fin_sf', 

## Split Set

In [30]:
# Identifikasi kolom-kolom yang ada di train dan test
ordinal_encoding_cols = list(set(train_ordinal_cols) & set(test_ordinal_cols))
one_hot_encoding_cols = list(set(train_one_hot_cols) & set(test_one_hot_cols))
numeric_cols = list(set(train_numeric_cols) & set(test_numeric_cols))

# 
print(f'ordinal cols: {ordinal_encoding_cols}')
print(f'one-hot cols: {one_hot_encoding_cols}')
print(f'numeric cols: {numeric_cols}')

ordinal cols: ['garage_cond', 'fireplace_qu', 'heating_qc', 'exter_cond', 'garage_qual', 'exter_qual', 'bsmt_qual', 'kitchen_qual']
one-hot cols: ['sale_type', 'lot_config', 'roof_matl', 'alley', 'pool_qc', 'paved_drive', 'bsmt_cond', 'roof_style', 'exterior1st', 'land_contour', 'foundation', 'utilities', 'mas_vnr_type', 'lot_shape', 'bsmt_fin_type2', 'bsmt_fin_type1', 'central_air', 'street', 'condition1', 'condition2', 'land_slope', 'garage_type', 'heating', 'garage_finish', 'misc_feature', 'house_style', 'bsmt_exposure', 'bldg_type', 'fence', 'electrical', 'neighborhood', 'exterior2nd', 'mszoning', 'sale_condition', 'functional']
numeric cols: ['open_porch_sf', 'misc_val', 'garage_yr_blt', '2nd_flr_sf', 'lot_frontage', 'year_built', 'overall_qual', '3ssn_porch', 'pool_area', 'full_bath', 'half_bath', 'overall_cond', 'yr_sold', 'total_bsmt_sf', 'bedroom_abv_gr', '1st_flr_sf', 'bsmt_full_bath', 'garage_area', 'wood_deck_sf', 'kitchen_abv_gr', 'bsmt_fin_sf2', 'lot_area', 'low_qual_fin_

In [31]:
# transform
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
ordinal_transformer = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)

prep_stage_2 = ColumnTransformer(
    transformers = [
        ("num", numerical_transformer, numeric_cols), 
        ("cat", categorical_transformer, one_hot_encoding_cols), 
        ("ord", ordinal_transformer, ordinal_encoding_cols)
    ], remainder = "passthrough")

### Train Data

In [32]:
# Transform data
transformed_data = prep_stage_2.fit_transform(train_df)

# Membuat DataFrame dengan kolom hasil transformasi
# ====================================================
# Mendapatkan nama kolom baru untuk OneHotEncoder
categorical_feature_names = prep_stage_2.named_transformers_["cat"].get_feature_names_out(one_hot_encoding_cols)

# Gabungkan semua nama kolom
all_columns = (
    numeric_cols +
    list(categorical_feature_names) +
    ordinal_encoding_cols +
    list(train_df.columns.difference(numeric_cols + one_hot_encoding_cols + ordinal_encoding_cols))
)

# Membuat DataFrame dengan nama kolom yang sesuai
train_df = pd.DataFrame(transformed_data, columns = all_columns)

# Menampilkan total null pada setiap kolom
null_columns = train_df.isnull().sum()[train_df.isnull().sum() > 0]
print(f'Train Stage 2 Check: {null_columns}')
train_df.head(3)

Train Stage 2 Check: Series([], dtype: int64)


Unnamed: 0,open_porch_sf,misc_val,garage_yr_blt,2nd_flr_sf,lot_frontage,year_built,overall_qual,3ssn_porch,pool_area,full_bath,half_bath,overall_cond,yr_sold,total_bsmt_sf,bedroom_abv_gr,1st_flr_sf,bsmt_full_bath,garage_area,wood_deck_sf,kitchen_abv_gr,bsmt_fin_sf2,lot_area,low_qual_fin_sf,screen_porch,garage_cars,mssub_class,mas_vnr_area,bsmt_fin_sf1,bsmt_unf_sf,enclosed_porch,year_remod_add,gr_liv_area,tot_rms_abv_grd,bsmt_half_bath,mo_sold,fireplaces,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD,lot_config_Corner,lot_config_CulDSac,lot_config_FR2,lot_config_FR3,lot_config_Inside,roof_matl_CompShg,alley_Grvl,alley_Pave,pool_qc_Gd,paved_drive_N,paved_drive_P,paved_drive_Y,bsmt_cond_Fa,bsmt_cond_Gd,bsmt_cond_TA,roof_style_Gable,roof_style_Gambrel,roof_style_Hip,exterior1st_AsbShng,exterior1st_BrkFace,exterior1st_CemntBd,exterior1st_HdBoard,exterior1st_MetalSd,exterior1st_Plywood,exterior1st_Stucco,exterior1st_VinylSd,exterior1st_Wd Sdng,exterior1st_WdShing,land_contour_Bnk,land_contour_HLS,land_contour_Low,land_contour_Lvl,foundation_BrkTil,foundation_CBlock,foundation_PConc,foundation_Wood,utilities_AllPub,mas_vnr_type_BrkCmn,mas_vnr_type_BrkFace,mas_vnr_type_Stone,lot_shape_IR1,lot_shape_IR2,lot_shape_IR3,lot_shape_Reg,bsmt_fin_type2_Unf,bsmt_fin_type1_ALQ,bsmt_fin_type1_BLQ,bsmt_fin_type1_GLQ,bsmt_fin_type1_LwQ,bsmt_fin_type1_Rec,bsmt_fin_type1_Unf,central_air_N,central_air_Y,street_Grvl,street_Pave,condition1_Artery,condition1_Feedr,condition1_Norm,condition1_PosN,condition1_RRAe,condition1_RRAn,condition1_RRNe,condition2_Norm,land_slope_Gtl,land_slope_Mod,garage_type_Attchd,garage_type_Basment,garage_type_BuiltIn,garage_type_Detchd,heating_GasA,heating_GasW,heating_Grav,garage_finish_Fin,garage_finish_RFn,garage_finish_Unf,misc_feature_Shed,house_style_1.5Fin,house_style_1.5Unf,house_style_1Story,house_style_2Story,house_style_SFoyer,house_style_SLvl,bsmt_exposure_Av,bsmt_exposure_Gd,bsmt_exposure_Mn,bsmt_exposure_No,bldg_type_1Fam,bldg_type_Duplex,bldg_type_Twnhs,bldg_type_TwnhsE,fence_GdPrv,fence_GdWo,fence_MnPrv,fence_MnWw,electrical_FuseA,electrical_FuseF,electrical_SBrkr,neighborhood_Blmngtn,neighborhood_BrkSide,neighborhood_ClearCr,neighborhood_CollgCr,neighborhood_Crawfor,neighborhood_Edwards,neighborhood_Gilbert,neighborhood_IDOTRR,neighborhood_MeadowV,neighborhood_Mitchel,neighborhood_NAmes,neighborhood_NPkVill,neighborhood_NWAmes,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker,exterior2nd_AsbShng,exterior2nd_BrkFace,exterior2nd_CmentBd,exterior2nd_HdBoard,exterior2nd_ImStucc,exterior2nd_MetalSd,exterior2nd_Plywood,exterior2nd_Stone,exterior2nd_Stucco,exterior2nd_VinylSd,exterior2nd_Wd Sdng,exterior2nd_Wd Shng,mszoning_FV,mszoning_RH,mszoning_RL,mszoning_RM,sale_condition_Abnorml,sale_condition_AdjLand,sale_condition_Family,sale_condition_Normal,sale_condition_Partial,functional_Maj1,functional_Maj2,functional_Min1,functional_Min2,functional_Typ,garage_cond,fireplace_qu,heating_qc,exter_cond,garage_qual,exter_qual,bsmt_qual,kitchen_qual,sale_price
0,0.439408,0.0,0.774046,1.314747,-0.28003,0.784193,0.61452,0.0,0.0,0.787362,1.266019,-0.523677,0.161764,-0.697482,0.294737,-0.901932,1.157781,0.383002,-0.923836,0.0,0.0,-0.284244,0.0,0.0,0.273711,0.356986,1.177146,0.639427,-1.103434,0.0,0.704506,0.725374,1.291657,0.0,-1.633929,-0.944263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0,208500.0
1,0.017501,0.0,0.683272,1.343668,-0.052365,0.704296,0.61452,0.0,0.0,0.787362,1.266019,-0.523677,0.161764,-0.481947,0.294737,-0.680593,1.157781,0.704787,-0.923836,0.0,0.0,0.744924,0.0,0.0,0.273711,0.356986,0.849429,0.101006,-0.43437,0.0,0.653431,0.918998,-0.235844,0.0,1.006224,0.785851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0,0.0,2.0,2.0,2.0,2.0,2.0,223500.0
2,0.950138,0.0,0.637886,1.794358,1.161849,0.664347,1.419903,0.0,0.0,0.787362,1.266019,-0.523677,0.161764,0.275791,1.814814,0.09755,1.157781,1.927567,1.035252,0.0,0.0,1.851281,0.0,0.0,1.76839,0.356986,2.66151,0.514611,-0.302442,0.0,0.551281,1.96864,2.055408,0.0,2.137718,0.785851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0,0.0,2.0,2.0,2.0,2.0,2.0,250000.0


In [33]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557 entries, 0 to 556
Columns: 200 entries, open_porch_sf to sale_price
dtypes: float64(200)
memory usage: 870.4 KB


### Test Data

In [34]:
# Transform data
transformed_data = prep_stage_2.fit_transform(test_df)

# Membuat DataFrame dengan kolom hasil transformasi
# ====================================================
# Mendapatkan nama kolom baru untuk OneHotEncoder
categorical_feature_names = prep_stage_2.named_transformers_["cat"].get_feature_names_out(one_hot_encoding_cols)

# Gabungkan semua nama kolom
all_columns = (
    numeric_cols +
    list(categorical_feature_names) +
    ordinal_encoding_cols +
    list(test_df.columns.difference(numeric_cols + one_hot_encoding_cols + ordinal_encoding_cols))
)

# Membuat DataFrame dengan nama kolom yang sesuai
test_df = pd.DataFrame(transformed_data, columns = all_columns)

# Menampilkan total null pada setiap kolom
null_columns = test_df.isnull().sum()[test_df.isnull().sum() > 0]
print(f'Test Stage 2 Check: {null_columns}')
test_df.tail(3)

Test Stage 2 Check: Series([], dtype: int64)


Unnamed: 0,open_porch_sf,misc_val,garage_yr_blt,2nd_flr_sf,lot_frontage,year_built,overall_qual,3ssn_porch,pool_area,full_bath,half_bath,overall_cond,yr_sold,total_bsmt_sf,bedroom_abv_gr,1st_flr_sf,bsmt_full_bath,garage_area,wood_deck_sf,kitchen_abv_gr,bsmt_fin_sf2,lot_area,low_qual_fin_sf,screen_porch,garage_cars,mssub_class,mas_vnr_area,bsmt_fin_sf1,bsmt_unf_sf,enclosed_porch,year_remod_add,gr_liv_area,tot_rms_abv_grd,bsmt_half_bath,mo_sold,fireplaces,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD,lot_config_Corner,lot_config_CulDSac,lot_config_FR2,lot_config_FR3,lot_config_Inside,roof_matl_CompShg,alley_Grvl,alley_Pave,pool_qc_Ex,paved_drive_N,paved_drive_P,paved_drive_Y,bsmt_cond_Fa,bsmt_cond_Gd,bsmt_cond_TA,roof_style_Gable,roof_style_Gambrel,roof_style_Hip,exterior1st_AsbShng,exterior1st_BrkFace,exterior1st_CemntBd,exterior1st_HdBoard,exterior1st_MetalSd,exterior1st_Plywood,exterior1st_Stucco,exterior1st_VinylSd,exterior1st_Wd Sdng,exterior1st_WdShing,land_contour_Bnk,land_contour_HLS,land_contour_Low,land_contour_Lvl,foundation_BrkTil,foundation_CBlock,foundation_PConc,utilities_AllPub,mas_vnr_type_BrkCmn,mas_vnr_type_BrkFace,mas_vnr_type_Stone,lot_shape_IR1,lot_shape_IR2,lot_shape_IR3,lot_shape_Reg,bsmt_fin_type2_Unf,bsmt_fin_type1_ALQ,bsmt_fin_type1_BLQ,bsmt_fin_type1_GLQ,bsmt_fin_type1_LwQ,bsmt_fin_type1_Rec,bsmt_fin_type1_Unf,central_air_N,central_air_Y,street_Grvl,street_Pave,condition1_Artery,condition1_Feedr,condition1_Norm,condition1_PosA,condition1_PosN,condition1_RRAe,condition1_RRAn,condition1_RRNe,condition1_RRNn,condition2_Artery,condition2_Feedr,condition2_Norm,condition2_PosN,land_slope_Gtl,land_slope_Mod,garage_type_2Types,garage_type_Attchd,garage_type_Basment,garage_type_BuiltIn,garage_type_CarPort,garage_type_Detchd,heating_GasA,heating_GasW,garage_finish_Fin,garage_finish_RFn,garage_finish_Unf,misc_feature_Shed,house_style_1.5Fin,house_style_1.5Unf,house_style_1Story,house_style_2Story,house_style_SFoyer,house_style_SLvl,bsmt_exposure_Av,bsmt_exposure_Gd,bsmt_exposure_Mn,bsmt_exposure_No,bldg_type_1Fam,bldg_type_Duplex,bldg_type_Twnhs,bldg_type_TwnhsE,fence_GdPrv,fence_GdWo,fence_MnPrv,fence_MnWw,electrical_FuseA,electrical_FuseF,electrical_SBrkr,neighborhood_Blmngtn,neighborhood_Blueste,neighborhood_BrkSide,neighborhood_ClearCr,neighborhood_CollgCr,neighborhood_Crawfor,neighborhood_Edwards,neighborhood_Gilbert,neighborhood_IDOTRR,neighborhood_MeadowV,neighborhood_Mitchel,neighborhood_NAmes,neighborhood_NWAmes,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker,exterior2nd_AsbShng,exterior2nd_BrkFace,exterior2nd_CmentBd,exterior2nd_HdBoard,exterior2nd_ImStucc,exterior2nd_MetalSd,exterior2nd_Plywood,exterior2nd_Stucco,exterior2nd_VinylSd,exterior2nd_Wd Sdng,exterior2nd_Wd Shng,mszoning_C (all),mszoning_FV,mszoning_RH,mszoning_RL,mszoning_RM,sale_condition_Abnorml,sale_condition_AdjLand,sale_condition_Family,sale_condition_Normal,sale_condition_Partial,functional_Min1,functional_Min2,functional_Mod,functional_Typ,garage_cond,fireplace_qu,heating_qc,exter_cond,garage_qual,exter_qual,bsmt_qual,kitchen_qual
540,-0.133651,0.0,0.822156,-0.812116,2.068077,0.812962,1.276933,0.0,0.0,0.709856,-0.841214,-0.531269,-1.373927,2.019005,0.303314,2.746259,1.202475,1.286168,0.744445,0.0,0.0,1.607988,0.0,0.0,1.674376,-0.95832,1.206406,2.247367,-0.632378,0.0,0.811425,1.354196,1.405159,0.0,1.91991,2.652834,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,4.0,2.0,0.0,3.0,3.0,2.0,0.0,2.0
541,1.739479,0.0,-0.841037,-0.812116,-0.773151,-0.632583,-1.024089,0.0,0.0,-1.266821,-0.841214,2.382659,-1.373927,-0.594636,0.303314,-0.801974,1.202475,-1.095524,-0.948804,0.0,0.0,0.316432,0.0,0.0,-1.282327,-0.95832,-0.713661,0.041956,-0.522448,0.0,0.659946,-1.532072,-1.088309,0.0,1.544366,-0.975602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,0.0,3.0,3.0,3.0,3.0,3.0
542,0.09478,0.0,0.282742,1.65118,0.455488,0.344136,0.509926,0.0,0.0,0.709856,1.171029,-0.531269,-1.373927,-0.19533,0.303314,-0.40337,-0.831618,0.928355,0.921646,0.0,0.0,0.112166,0.0,0.0,1.674376,0.33976,0.197886,0.81106,-0.974384,0.0,0.205507,1.460504,2.236315,0.0,1.91991,0.838616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,4.0,4.0,0.0,3.0,3.0,3.0,2.0,3.0


In [35]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543 entries, 0 to 542
Columns: 203 entries, open_porch_sf to kitchen_qual
dtypes: float64(203)
memory usage: 861.3 KB


### Data for modeling

In [36]:
# Memisahkan kolom target dari data
# Mencari kolom target di train_df
train_target_col = train_df.filter(like='mszoning_').columns

if len(train_target_col) > 0:
    train_target_col = train_target_col[0]  # Mengambil kolom pertama yang cocok
    
    # Pastikan kolom target ada di train_df
    if train_target_col in train_df.columns:
        X_train = train_df.drop(columns=[train_target_col])
        y_train = train_df[train_target_col]
    else:
        raise ValueError(f"Kolom target '{train_target_col}' tidak ditemukan di train_df.")
else:
    raise ValueError("Kolom dengan filter 'mszoning_' tidak ditemukan di train_df.")

# Mencari kolom target di test_df
test_target_col = test_df.filter(like='mszoning_').columns

if len(test_target_col) > 0:
    test_target_col = test_target_col[0]  # Mengambil kolom pertama yang cocok
    
    # Pastikan kolom target ada di test_df
    if test_target_col in test_df.columns:
        X_test = test_df.drop(columns=[test_target_col])
        y_test = test_df[test_target_col]
    else:
        raise ValueError(f"Kolom target '{test_target_col}' tidak ditemukan di test_df.")
else:
    raise ValueError("Kolom dengan filter 'mszoning_' tidak ditemukan di test_df.")

# Validasi tambahan
print("Shape X_train:", X_train.shape)
print("Shape y_train:", y_train.shape)
print("Shape X_test:", X_test.shape)
print("Shape y_test:", y_test.shape)

Shape X_train: (557, 199)
Shape y_train: (557,)
Shape X_test: (543, 202)
Shape y_test: (543,)


# Modeling

## Set Parameter

In [37]:
# Membuat pipeline model
pipelines = {
    'RandomForest': Pipeline(steps = [
        ('pca', PCA()),
        ('classifier', RandomForestClassifier(random_state = 42))
    ]),

    'LogisticRegression': Pipeline(steps = [
        ('pca', PCA()),
        ('classifier', LogisticRegression(max_iter = 1000, random_state = 42))
    ]),
    
    'SVC': Pipeline(steps = [
        ('pca', PCA()),
        ('classifier', SVC(random_state = 42))
    ]),
    
    'GradientBoosting': Pipeline(steps = [
        ('pca', PCA()),
        ('classifier', GradientBoostingClassifier(random_state = 42))
    ]),
    
    'XGBoost': Pipeline(steps = [
        ('pca', PCA()),
        ('classifier', XGBClassifier(use_label_encoder = False, eval_metric = 'mlogloss', random_state = 42))
    ])
}

In [38]:
# Parameter grid untuk GridSearchCV
param_grids = {
    'RandomForest': {
        'pca__n_components': [0.90, 0.95, 0.99],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__bootstrap': [True, False],
    },
    'LogisticRegression': [
        {
            'pca__n_components': [0.90, 0.95, 0.99],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.1, 1, 10, 100],
            'classifier__solver': ['liblinear'],  # Cocok untuk l1 dan l2
            'classifier__max_iter': [100, 500, 1000],
        },
        {
            'pca__n_components': [0.90, 0.95, 0.99],
            'classifier__penalty': ['elasticnet'],
            'classifier__C': [0.1, 1, 10, 100],
            'classifier__solver': ['saga'],  # Cocok untuk elasticnet
            'classifier__max_iter': [100, 500, 1000],
            'classifier__l1_ratio': [0.1, 0.5, 0.9],
        }
    ],
    'SVC': {
        'pca__n_components': [0.90, 0.95, 0.99],
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'classifier__degree': [2, 3, 4],
        'classifier__gamma': ['scale', 'auto'],
    },
    'GradientBoosting': {
        'pca__n_components': [0.90, 0.95, 0.99],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__subsample': [0.8, 0.9, 1.0],
    },
    'XGBoost': {
        'pca__n_components': [0.90, 0.95, 0.99],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7],
        'classifier__min_child_weight': [1, 3, 5],
        'classifier__gamma': [0, 0.1, 0.5],
        'classifier__subsample': [0.8, 0.9, 1.0],
        'classifier__colsample_bytree': [0.8, 0.9, 1.0],
    },
}

## Checking Null and Infinite

In [39]:
# Cek NaN dan Inf di X_train
print(f"Jumlah NaN di X_train: {pd.isna(X_train).sum().sum()}")
X_train_numeric = X_train.select_dtypes(include = ['number'])
print(f"Jumlah Inf di X_train: {(np.isinf(X_train_numeric).sum().sum())} \n")

# Cek NaN dan Inf di y_train
print(f"Jumlah NaN di y_train: {pd.isna(y_train).sum()}")
y_train_numeric = y_train
print(f"Jumlah Inf di y_train: {(np.isinf(y_train_numeric).sum().sum())}")

Jumlah NaN di X_train: 0
Jumlah Inf di X_train: 0 

Jumlah NaN di y_train: 0
Jumlah Inf di y_train: 0


In [40]:
# Cek NaN dan Inf di X_test
print(f"Jumlah NaN di X_test: {pd.isna(X_test).sum().sum()}")
X_test_numeric = X_test.select_dtypes(include = ['number'])
print(f"Jumlah Inf di X_test: {(np.isinf(X_test_numeric).sum().sum())} \n")

Jumlah NaN di X_test: 0
Jumlah Inf di X_test: 0 



## Comparing stage 2

In [41]:
# 
not_in_x_test = set(X_train.columns) - set(X_test.columns)
not_in_x_train = set(X_test.columns) - set(X_train.columns)

print(f"Kolom yang ada di X_train tapi tidak ada di X_test: {not_in_x_test}")
print(f"Kolom yang ada di X_test tapi tidak ada di X_train: {not_in_x_train}")

Kolom yang ada di X_train tapi tidak ada di X_test: {'heating_Grav', 'foundation_Wood', 'neighborhood_NPkVill', 'exterior2nd_Stone', 'functional_Maj2', 'functional_Maj1', 'sale_price', 'pool_qc_Gd'}
Kolom yang ada di X_test tapi tidak ada di X_train: {'pool_qc_Ex', 'neighborhood_Blueste', 'functional_Mod', 'condition2_PosN', 'condition1_PosA', 'garage_type_CarPort', 'condition1_RRNn', 'mszoning_FV', 'condition2_Feedr', 'condition2_Artery', 'garage_type_2Types'}


In [42]:
# 
X_train = X_train.drop(columns = not_in_x_test)
X_test = X_test.drop(columns = not_in_x_train)

# 
not_in_x_test = set(X_train.columns) - set(X_test.columns)
not_in_x_train = set(X_test.columns) - set(X_train.columns)

print(f"Kolom yang ada di X_train tapi tidak ada di X_test: {not_in_x_test}")
print(f"Kolom yang ada di X_test tapi tidak ada di X_train: {not_in_x_train}")

Kolom yang ada di X_train tapi tidak ada di X_test: set()
Kolom yang ada di X_test tapi tidak ada di X_train: set()


## Implement module

In [43]:
# Hyperparameter Tuning and Model Selection
best_models = {}
for model_name, pipeline in pipelines.items():
    print(f"\nTuning hyperparameters for {model_name}...")

    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv = 5, scoring = 'accuracy', n_jobs = -1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search
    
    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best Cross-Validation Score for {model_name}: {grid_search.best_score_}")


Tuning hyperparameters for RandomForest...
Best Parameters for RandomForest: {'classifier__bootstrap': False, 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200, 'pca__n_components': 0.99}
Best Cross-Validation Score for RandomForest: 0.9587194337194338

Tuning hyperparameters for LogisticRegression...
Best Parameters for LogisticRegression: {'classifier__C': 1, 'classifier__max_iter': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear', 'pca__n_components': 0.99}
Best Cross-Validation Score for LogisticRegression: 0.9963963963963964

Tuning hyperparameters for SVC...
Best Parameters for SVC: {'classifier__C': 0.1, 'classifier__degree': 2, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear', 'pca__n_components': 0.99}
Best Cross-Validation Score for SVC: 0.9963963963963964

Tuning hyperparameters for GradientBoosting...
Best Parameters for GradientBoosting: {'classifier__learning_

KeyboardInterrupt: 

## Evaluation and Tuning

In [None]:
# Model Evaluation on Test Data
for model_name, grid_search in best_models.items():
    print(f"\nEvaluating {model_name} on test data...")

    y_pred = grid_search.best_estimator_.predict(X_test)

    print(f"Classification Report for {model_name}:")
    print(classification_report(y_test, y_pred))