# Library / Packages

In [None]:
# basic
import pandas as pd
import numpy as np
from scipy.stats import mstats

# data preparation
from sklearn.impute import SimpleImputer 
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 

# data modeling
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# data scoring
from sklearn.metrics import mean_squared_error, r2_score

# data tuning

# visualization
import matplotlib.pyplot as plt

# Format

In [None]:
def lab_round(x, pos):
    """Return ``x`` as a short label with a B/M/K magnitude suffix.

    The value is divided by the largest threshold (1e9, 1e6, 1e3) that its
    absolute value reaches and the matching suffix is appended; the quotient
    is rendered with Python's default number formatting (no rounding).
    Values below 1e3 in magnitude are returned unscaled.

    ``pos`` is accepted but never used.
    NOTE(review): the ``(x, pos)`` signature presumably targets matplotlib's
    tick-formatter callback convention — confirm against the call site.
    """
    magnitude = abs(x)

    # Thresholds are checked from largest to smallest; first match wins.
    for scale, suffix in ((1e9, 'B'), (1e6, 'M'), (1e3, 'K')):
        if magnitude >= scale:
            return f'{x/scale}{suffix}'

    return f'{x}'
    
def val_round(x):
    """Return ``x`` with two decimals and a spaced B/M/K magnitude suffix.

    The value is divided by the largest threshold (1e9, 1e6, 1e3) that its
    absolute value reaches, formatted with ``.2f`` and followed by a space
    and the suffix. Values below 1e3 in magnitude are formatted with ``.2f``
    and no suffix.
    """
    size = abs(x)

    # Largest unit first so that e.g. 2.5e9 is reported in B, not M or K.
    for divisor, unit in ((1e9, 'B'), (1e6, 'M'), (1e3, 'K')):
        if size >= divisor:
            return f'{x/divisor:.2f} {unit}'

    return f'{x:.2f}'

In [None]:
# === Custom transformer that removes outlier rows ===
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Drop rows whose numeric values fall outside IQR-based fences.

    ``fit`` learns, per numeric column, the fences
    ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``. ``transform`` removes
    every row that has at least one value outside its column's fences.

    Parameters
    ----------
    factor : float, default 1.5
        Multiplier applied to the inter-quartile range when building fences.

    Attributes
    ----------
    bounds : dict
        Empty until ``fit`` is called; afterwards holds the ``"lower"`` and
        ``"upper"`` fences as pandas Series indexed by column name.

    Notes
    -----
    ``transform`` returns a ``(X, y)`` tuple rather than ``X`` alone.
    NOTE(review): sklearn ``Pipeline`` steps are expected to return only the
    transformed ``X``, so this class presumably has to be called directly —
    confirm before placing it inside a pipeline.
    """

    def __init__(self, factor=1.5):
        self.factor = factor
        # Populated by fit(); calling transform() first raises KeyError.
        self.bounds = {}

    def fit(self, X, y=None):
        """Compute the IQR fences from the numeric columns of DataFrame ``X``.

        Non-numeric columns are ignored so that mixed-dtype frames can be
        fitted; ``y`` is accepted for API compatibility and unused.
        Returns ``self``.
        """
        numeric = X.select_dtypes(include="number")
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        self.bounds = {"lower": q1 - self.factor * iqr,
                       "upper": q3 + self.factor * iqr}
        return self

    def transform(self, X, y=None):
        """Return ``(X_kept, y_kept)`` with outlier rows removed.

        Only columns that were seen as numeric during ``fit`` are compared
        against the fences; all other columns are carried through untouched.
        ``y_kept`` is ``None`` when ``y`` is not supplied. Missing values
        never compare as outside the fences, so they do not cause removal.
        """
        fitted_cols = X.columns.intersection(self.bounds["lower"].index)
        values = X[fitted_cols]
        outside = (values < self.bounds["lower"]) | (values > self.bounds["upper"])
        mask = ~outside.any(axis=1)
        return X[mask], (y[mask] if y is not None else None)

In [None]:
# Helper for dtype conversion
def convert_object_columns_to_numeric(df):
    """Convert object columns that hold only numeric values to a numeric dtype.

    Each object-dtype column is passed through ``pd.to_numeric``. Columns
    that convert cleanly are replaced by the numeric result, and are further
    cast to ``int`` when every value is a whole number. Columns that cannot
    be converted (non-numeric strings, or non-string objects) are left
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for col in df.select_dtypes(include = ['object']).columns:
        try:
            converted = pd.to_numeric(df[col], errors='raise')

        except (ValueError, TypeError):
            # ValueError: unparseable string; TypeError: non-string object.
            # Either way the column is not numeric, so keep it as object.
            continue

        # Missing values fail the whole-number test, so columns containing
        # NaN stay float and the integer cast below cannot hit them.
        if (converted % 1 == 0).all():
            converted = converted.astype(int)

        df[col] = converted

    return df

# Read Dataset

In [None]:
# Load the training and test splits from the dataset folder
train_df, test_df = map(pd.read_csv, ('../dataset/train.csv', '../dataset/test.csv'))

In [None]:
# Lift pandas' display limits so every column and every row is shown
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Train Dataset

In [None]:
# Print column dtypes and non-null counts of the training data as loaded
train_df.info()

In [None]:
# Drop the 'Id' column, then convert object columns that are fully numeric
train_df = convert_object_columns_to_numeric(train_df.drop(columns = 'Id'))

# Report how many rows are exact duplicates of an earlier row
print(f'Total General Duplicated: {train_df.duplicated().sum()}')

In [None]:
# Print dtypes and non-null counts again after dropping 'Id' and converting dtypes
train_df.info()

In [None]:
# Group the training columns that contain nulls into numeric and object columns
null_numeric = []
null_obj = []

# Columns with at least one missing value
null_columns = train_df.columns[train_df.isnull().any()]

for col in null_columns:
    # is_numeric_dtype matches every integer/float width, unlike comparing
    # against the 'int'/'float' aliases, whose size depends on the platform.
    if pd.api.types.is_numeric_dtype(train_df[col]):
        null_numeric.append(col)

    elif pd.api.types.is_object_dtype(train_df[col]):
        null_obj.append(col)

print("Null Numeric:", null_numeric)
print("Null String:", null_obj)

In [None]:
# Remember the training column labels (and their order) before imputation
original = train_df.columns

In [None]:
# Numeric columns: fill missing values with the column mean.
# NOTE(review): the previous comment said "median" while the code uses
# strategy "mean" — confirm which one is intended.
numerical_pipeline = Pipeline([("imputer", SimpleImputer(strategy = "mean"))])

# Categorical columns: fill missing values with the most frequent value (mode)
categorical_pipeline = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent"))])

# Combine both imputers; columns without nulls are passed through unchanged
preprocessor_stage1 = ColumnTransformer(
    [
        ("num", numerical_pipeline, null_numeric),
        ("cat", categorical_pipeline, null_obj),
    ],
    remainder = "passthrough",
)

In [None]:
# Stage 1: fit the imputers on the training data and apply them.
# ColumnTransformer does not keep the input column order: its output holds
# the "num" columns first, then the "cat" columns, then the passthrough
# remainder in their original order. Labelling the result with `original`
# directly would therefore attach column names to the wrong data.
original_dtypes = train_df.dtypes.to_dict()
imputed_cols = list(null_numeric) + list(null_obj)
output_cols = imputed_cols + [col for col in original if col not in imputed_cols]

train_df = preprocessor_stage1.fit_transform(train_df)

# Label with the real output layout, then restore the original column order
train_df = pd.DataFrame(train_df, columns = output_cols)[list(original)]

# The mixed-type output array is object-typed; cast back to the input dtypes
train_df = train_df.astype(original_dtypes)
train_df.tail()

In [None]:
# Make sure train_df is a DataFrame before inspecting it
train_df = train_df if isinstance(train_df, pd.DataFrame) else pd.DataFrame(train_df)

# Show the null count of every column that still has missing values
null_columns = train_df.isnull().sum().loc[lambda counts: counts > 0]
print(null_columns)

## Test Dataset

In [None]:
# Print column dtypes and non-null counts of the test data as loaded
test_df.info()

In [None]:
# Drop the 'Id' column, then convert object columns that are fully numeric
test_df = convert_object_columns_to_numeric(test_df.drop(columns = 'Id'))

# Report how many rows are exact duplicates of an earlier row
print(f'Total General Duplicated: {test_df.duplicated().sum()}')

In [None]:
# Print dtypes and non-null counts again after dropping 'Id' and converting dtypes
test_df.info()

In [None]:
# Group the test columns that contain nulls into numeric and object columns
null_numeric = []
null_obj = []

# Columns with at least one missing value
null_columns = test_df.columns[test_df.isnull().any()]

for col in null_columns:
    # is_numeric_dtype matches every integer/float width, unlike comparing
    # against the 'int'/'float' aliases, whose size depends on the platform.
    if pd.api.types.is_numeric_dtype(test_df[col]):
        null_numeric.append(col)

    elif pd.api.types.is_object_dtype(test_df[col]):
        null_obj.append(col)

print("Numeric Columns with Null Values:", null_numeric)
print("String Columns with Null Values:", null_obj)

In [None]:
# Remember the test column labels (and their order) before imputation
original = test_df.columns

In [None]:
# Numeric columns: fill missing values with the column mean.
# NOTE(review): the previous comment said "median" while the code uses
# strategy "mean" — confirm which one is intended.
numerical_pipeline = Pipeline([("imputer", SimpleImputer(strategy = "mean"))])

# Categorical columns: fill missing values with the most frequent value (mode)
categorical_pipeline = Pipeline([("imputer", SimpleImputer(strategy = "most_frequent"))])

# Combine both imputers; columns without nulls are passed through unchanged
preprocessor_stage1 = ColumnTransformer(
    [
        ("num", numerical_pipeline, null_numeric),
        ("cat", categorical_pipeline, null_obj),
    ],
    remainder = "passthrough",
)

In [None]:
# Stage 1: fit the imputers on the test data and apply them.
# ColumnTransformer does not keep the input column order: its output holds
# the "num" columns first, then the "cat" columns, then the passthrough
# remainder in their original order. Labelling the result with `original`
# directly would therefore attach column names to the wrong data.
original_dtypes = test_df.dtypes.to_dict()
imputed_cols = list(null_numeric) + list(null_obj)
output_cols = imputed_cols + [col for col in original if col not in imputed_cols]

test_df = preprocessor_stage1.fit_transform(test_df)

# Label with the real output layout, then restore the original column order
test_df = pd.DataFrame(test_df, columns = output_cols)[list(original)]

# The mixed-type output array is object-typed; casting back to the input
# dtypes restores the numeric columns, so no per-column coercion is needed.
test_df = test_df.astype(original_dtypes)

In [None]:
# Show the last rows of the imputed test data
test_df.tail()

In [None]:
# Make sure test_df is a DataFrame before inspecting it
test_df = test_df if isinstance(test_df, pd.DataFrame) else pd.DataFrame(test_df)

# Show the null count of every column that still has missing values
null_columns = test_df.isnull().sum().loc[lambda counts: counts > 0]
print(null_columns)

# Preparation

In [None]:
# Columns to be ordinal-encoded (quality/condition style features)
encoding_set = {'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond',
                'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual',
                'FireplaceQu', 'GarageQual', 'GarageCond'}

# Lists that receive the grouped column names
train_ordinal_encoding_cols = []
train_one_hot_encoding_cols = []
train_numeric_cols = []

# Group the training columns by dtype
for col in train_df.columns:
    # is_numeric_dtype matches every integer/float width, unlike comparing
    # against the 'int'/'float' aliases, whose size depends on the platform.
    if pd.api.types.is_numeric_dtype(train_df[col]):
        train_numeric_cols.append(col)

    elif pd.api.types.is_object_dtype(train_df[col]):
        # Object columns are ordinal-encoded only when listed in encoding_set
        if col in encoding_set:
            train_ordinal_encoding_cols.append(col)

        else:
            train_one_hot_encoding_cols.append(col)

# Show the result
print("Label Encoding Columns:", train_ordinal_encoding_cols)
print("One-Hot Encoding Columns:", train_one_hot_encoding_cols)
print("Numeric Columns:", train_numeric_cols)

In [None]:
# Separate the target column from the features
target_col = 'SalePrice'

# Training data: split only when the target column is actually present;
# otherwise keep the frame as-is and leave y_train as None.
if target_col not in train_df.columns:
    X_train, y_train = train_df, None

else:
    X_train, y_train = train_df.drop(columns = [target_col]), train_df[target_col]

# Test data: drop the target column if it happens to be present
X_test = test_df.drop(columns = [target_col]) if target_col in test_df.columns else test_df

In [None]:
# Working names for the column groups used by each kind of encoding
numeric_cols, ordinal_encoding_cols, one_hot_encoding_cols = (
    train_numeric_cols,
    train_ordinal_encoding_cols,
    train_one_hot_encoding_cols,
)

In [None]:
# Keep only the columns that exist in both X_train and X_test.
# Filtering the lists (instead of converting set intersections back to
# lists) keeps the original column order, so the feature layout is the same
# on every run rather than depending on set iteration order.
shared_cols = set(X_train.columns) & set(X_test.columns)
ordinal_encoding_cols = [col for col in ordinal_encoding_cols if col in shared_cols]
one_hot_encoding_cols = [col for col in one_hot_encoding_cols if col in shared_cols]
numeric_cols = [col for col in numeric_cols if col in shared_cols]

In [None]:
# One pipeline per feature type: impute first, then scale or encode

# Numeric: mean imputation followed by standardisation
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', StandardScaler()),
])

# Ordinal: mode imputation followed by integer codes; categories unseen
# during fit are encoded as -1.
# NOTE(review): no explicit `categories` are given, so the integer codes
# presumably follow the encoder's default category ordering rather than the
# quality scale — confirm.
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)),
])

# Nominal: mode imputation followed by dense one-hot encoding; categories
# unseen during fit are ignored.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)),
])

In [None]:
# Stage 2: standardise numeric features, one-hot encode nominal features and
# ordinal-encode the quality-style features.
# The imputing pipelines built in the previous cell (numerical_pipeline,
# categorical_pipeline, ordinal_pipeline) wrap the same scaler/encoders with
# an imputation step. Using them here, instead of bare scaler/encoder
# instances, means a missing value reaching the model pipeline is imputed
# rather than passed on to the regressor.
numerical_transformer = numerical_pipeline
categorical_transformer = categorical_pipeline
ordinal_transformer = ordinal_pipeline

preprocessor_stage2 = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numeric_cols),
        ("cat", categorical_transformer, one_hot_encoding_cols),
        ("ord", ordinal_transformer, ordinal_encoding_cols)
    ], remainder = "passthrough")

# Modeling

In [None]:
# Chain the stage-2 preprocessing with a regressor; the 'regressor' step is a
# placeholder that the grid search replaces with each candidate model.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor_stage2),
    ('regressor', LinearRegression()),
])

In [None]:
# Parameter grid for GridSearchCV covering several candidate models.
# Ridge and Lasso are searched over the same alpha / max_iter / tol values,
# so those settings are written once and merged into both entries.
_regularized_search = {
    'regressor__alpha': [0.1, 1.0, 100.0, 1000.0, 10000.0],
    'regressor__max_iter': [50000, 100000, 200000],
    'regressor__tol': [1e-3, 1e-4, 1e-6],
}

param_grid = [
    {'regressor': [LinearRegression()]},
    {'regressor': [Ridge()], **_regularized_search},
    {'regressor': [Lasso()], **_regularized_search},
]

In [None]:
# # Convert every column of X_train to numeric, turning unconvertible values into NaN
# X_train = X_train.apply(pd.to_numeric, errors='coerce')

# Count NaN and inf values in X_train and y_train.
# np.isinf is only defined for numeric data and raises TypeError on object
# columns, so the inf check is restricted to the numeric columns.
numeric_values = X_train.select_dtypes(include = 'number').to_numpy(dtype = float)

print("Jumlah NaN di X_train:", pd.isna(X_train).sum().sum())
print("Jumlah inf di X_train:", np.isinf(numeric_values).sum())
print("Jumlah NaN di y_train:", pd.isna(y_train).sum())

In [None]:
# Per-column count of missing values in the training features
X_train.isnull().sum()

In [None]:
# Set up the grid search: 5-fold CV scored by negative mean squared error;
# a fit that errors is scored as NaN.
grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    error_score = np.nan,
    verbose = 1,
)

# Refuse to fit when the target is missing or its length differs from X_train
if y_train is None or X_train.shape[0] != y_train.shape[0]:
    raise ValueError("Ukuran X_train dan y_train tidak cocok atau y_train tidak tersedia.")

grid_search.fit(X_train, y_train)

In [None]:
# Best pipeline found by the grid search
best_model = grid_search.best_estimator_

# Predict house prices for the test data with the best model
y_pred = best_model.predict(X_test)

In [None]:
# Show the predictions and the winning hyper-parameters
print("Predicted prices:", y_pred)
print(f'Best parameters: {grid_search.best_params_}')

In [None]:
# # Compute the error (Mean Squared Error) -> requires the true values in y_test
# mse = mean_squared_error(y_test, y_pred)
# print(f'Mean Squared Error: {mse}')
# print(f'Best parameters: {grid_search.best_params_}')

In [None]:
# # Visualization: scatter plot of actual vs. predicted values
# plt.figure(figsize=(10, 6))
# plt.scatter(y_test, y_pred, alpha=0.5)
# plt.title("Actual vs Predicted Prices")
# plt.xlabel("Actual Prices")
# plt.ylabel("Predicted Prices")
# plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
# plt.show()