In [None]:
# Import library yang diperlukan
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Winsorization

In [None]:
import pandas as pd
from scipy.stats.mstats import winsorize

# Sample DataFrame: each column ends with two extreme values.
data = {
    'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 100, 250],
    'B': [10, 12, 14, 16, 18, 20, 22, 24, 26, 300, 700],
}
df = pd.DataFrame(data)

# Winsorize every int64/float64 column, clipping 10% at each tail.
# With 11 rows that is a single value at the bottom and a single value at the
# top of each column, so the second-largest value is left untouched.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_columns:
    clipped = winsorize(df[col], limits=[0.1, 0.1])
    df[col] = clipped

print("DataFrame setelah Winsorization:")
print(df)

DataFrame setelah Winsorization:
   A   B
0  2  12
1  2  12
2  3  14
3  4  16
4  5  18
5  6  20
6  7  22
7  8  24
8  9  26
9  9  26


# Base Estimator

## Transformer Kustom

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class AddFeatureTransformer(BaseEstimator, TransformerMixin):
    """Append one column holding the scaled row-wise sum of the input features.

    Parameters
    ----------
    multiplier : int or float, default=1
        Factor applied to every row sum before it is appended.
    """

    def __init__(self, multiplier=1):
        self.multiplier = multiplier

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X):
        """Return ``X`` with an extra trailing column ``multiplier * sum(row)``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            NumPy array, nested list or pandas DataFrame.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features + 1)
        """
        # Coerce first: np.sum on a pandas DataFrame returns a Series, which
        # has no .reshape, so DataFrame input used to raise AttributeError.
        features = np.asarray(X)
        # keepdims=True yields the (n_samples, 1) column directly.
        row_totals = features.sum(axis=1, keepdims=True) * self.multiplier
        return np.hstack((features, row_totals))


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Small 3x2 demo matrix.
X = np.array([[1, 2], [3, 4], [5, 6]])

# Chain the custom transformer with standardization: the appended column is
# created first and is then scaled together with the original features.
pipeline_steps = [
    ('add_feature', AddFeatureTransformer(multiplier=2)),
    ('scaler', StandardScaler()),
]
pipeline = Pipeline(pipeline_steps)

X_transformed = pipeline.fit_transform(X)
print(X_transformed)


## Classifier Kustom

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class DummyClassifier(BaseEstimator, ClassifierMixin):
    """Baseline classifier that predicts the same label for every sample.

    Note: this class has the same name as ``sklearn.dummy.DummyClassifier``;
    importing that one later would shadow this definition.

    Parameters
    ----------
    constant_class : scalar, default=0
        Label returned for every sample by ``predict``.
    """

    def __init__(self, constant_class=0):
        self.constant_class = constant_class

    def fit(self, X, y):
        """Record the unique labels seen in ``y`` and return ``self``."""
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        """Return an array of length ``n_samples`` filled with ``constant_class``."""
        # Plain Python lists have no .shape; fall back to len() for them.
        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        return np.full((n_samples,), self.constant_class)

    def score(self, X, y, sample_weight=None):
        """Return the (optionally weighted) accuracy of the constant prediction.

        ``sample_weight`` mirrors the signature of ``ClassifierMixin.score`` so
        callers that pass weights keep working; with ``None`` the result is the
        plain mean of correct predictions.
        """
        hits = self.predict(X) == np.asarray(y)
        return np.average(hits, weights=sample_weight)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Synthetic dataset. A seeded generator keeps the features, labels and the
# reported accuracy identical across kernel restarts (the split below is
# already seeded with random_state=42).
rng = np.random.default_rng(42)
X = rng.random((100, 2))
y = rng.choice([0, 1], size=100)

# Hold out 30% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Constant-prediction baseline: always answers class 1.
dummy_clf = DummyClassifier(constant_class=1)
dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))


# Other: Preprocessing & Grid Search untuk Prediksi SalePrice

In [None]:
# Identify the columns of each kind that exist in both train and test.
def _shared_columns(train_cols, test_cols):
    """Return the columns present in both inputs, in train order, without duplicates.

    A plain set intersection has an arbitrary order for string names that can
    change between kernel restarts, which would shuffle the layout of the
    transformed data from run to run.
    """
    test_lookup = set(test_cols)
    return [col for col in dict.fromkeys(train_cols) if col in test_lookup]


ordinal_encoding_cols = _shared_columns(train_ordinal_cols, test_ordinal_cols)
one_hot_encoding_cols = _shared_columns(train_one_hot_cols, test_one_hot_cols)
numeric_cols = _shared_columns(train_numeric_cols, test_numeric_cols)

print(f'ordinal cols: {ordinal_encoding_cols}')
print(f'one-hot cols: {one_hot_encoding_cols}')
print(f'numeric cols: {numeric_cols}')

# One transformer per column family:
#   numeric -> standardized
#   one-hot -> dense indicator columns; categories unseen during fit are ignored
#   ordinal -> integer codes; categories unseen during fit are encoded as -1
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ordinal_transformer = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Route each column family to its transformer. Columns not listed in any
# family are passed through unchanged after the transformed ones.
stage_2_transformers = [
    ("num", numerical_transformer, numeric_cols),
    ("cat", categorical_transformer, one_hot_encoding_cols),
    ("ord", ordinal_transformer, ordinal_encoding_cols),
]
prep_stage_2 = ColumnTransformer(transformers=stage_2_transformers, remainder="passthrough")

# Fit the preprocessor on the training frame and transform it.
transformed_data = prep_stage_2.fit_transform(train_df)

# Expanded column names produced by the one-hot encoder. A transformer that
# was given no columns is not fitted by ColumnTransformer, so only query it
# when there is something to encode.
if one_hot_encoding_cols:
    categorical_feature_names = prep_stage_2.named_transformers_["cat"].get_feature_names_out(one_hot_encoding_cols)
else:
    categorical_feature_names = []

# Passthrough columns are emitted in their ORIGINAL frame order.
# Index.difference() returns them sorted, which can attach the wrong names to
# the passthrough values, so rebuild the list by walking train_df.columns.
handled_cols = set(numeric_cols) | set(one_hot_encoding_cols) | set(ordinal_encoding_cols)
passthrough_cols = [col for col in train_df.columns if col not in handled_cols]

# Column names in the same order as the ColumnTransformer output:
# num -> cat -> ord -> remainder.
all_columns = (
    list(numeric_cols) +
    list(categorical_feature_names) +
    list(ordinal_encoding_cols) +
    passthrough_cols
)

# Rebuild the training DataFrame from the transformed array.
train_df = pd.DataFrame(transformed_data, columns=all_columns)

# Report columns that still contain nulls (count computed once).
train_null_counts = train_df.isnull().sum()
null_columns = train_null_counts[train_null_counts > 0]
print(f'Train Stage 2 Check: {null_columns}')
# A bare expression in the middle of a cell shows nothing, so print the preview.
print(train_df.head(3))

# Apply the already-fitted preprocessor to the test frame.
# NOTE(review): this assumes test_df has every column train_df had at fit time
# (including passthrough ones such as the target) - confirm upstream.
transformed_data = prep_stage_2.transform(test_df)

# Rebuild the test DataFrame with the same column layout.
test_df = pd.DataFrame(transformed_data, columns=all_columns)

# Report columns that still contain nulls in the test frame.
test_null_counts = test_df.isnull().sum()
null_columns = test_null_counts[test_null_counts > 0]
print(f'Test Stage 2 Check: {null_columns}')
print(test_df.head(3))

# Separate the target column from the feature columns.
target_col = 'SalePrice'

# Training data: drop the target from the features when it is present;
# otherwise use the frame as-is and leave y_train unset.
has_train_target = target_col in train_df.columns
X_train = train_df.drop(columns=[target_col]) if has_train_target else train_df
y_train = train_df[target_col] if has_train_target else None

# Test data: only the features are needed, so drop the target if it exists.
X_test = test_df.drop(columns=[target_col]) if target_col in test_df.columns else test_df

# Pipeline with a single, swappable regressor step. Preprocessing is NOT part
# of this pipeline; it is applied to the frames before this point.
model_pipeline = Pipeline(steps=[
    ('regressor', LinearRegression())
])

# Hyper-parameter values shared by the regularized models.
# NOTE(review): depending on the solver Ridge selects, max_iter/tol may have no
# effect, which would make part of this grid redundant - confirm.
regularized_search_space = {
    'regressor__alpha': [0.1, 1.0, 100.0, 1000.0, 10000.0],
    'regressor__max_iter': [50000, 100000, 200000],
    'regressor__tol': [1e-3, 1e-4, 1e-6],
}

# One sub-grid per candidate model; the 'regressor' entry swaps the estimator.
param_grid = [
    {'regressor': [LinearRegression()]},
    {'regressor': [Ridge()], **regularized_search_space},
    {'regressor': [Lasso()], **regularized_search_space},
]

# 5-fold grid search scored by negative MSE; failed fits score NaN instead of raising.
grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    error_score=np.nan,
    verbose=1,
)

# Report missing values in the training inputs (only NaN is counted; there is
# no inf check here).
print("Jumlah NaN di X_train:", pd.isna(X_train).sum().sum())

size_error_message = "Ukuran X_train dan y_train tidak cocok atau y_train tidak tersedia."

# Check for a missing target before inspecting it: pd.isna(None) is a plain
# bool with no .sum(), so counting NaNs first would raise AttributeError and
# hide the intended ValueError.
if y_train is None:
    raise ValueError(size_error_message)
print("Jumlah NaN di y_train:", pd.isna(y_train).sum())

# Features and target must have the same number of rows.
if X_train.shape[0] != y_train.shape[0]:
    raise ValueError(size_error_message)

grid_search.fit(X_train, y_train)

# Best estimator found by the grid search.
best_model = grid_search.best_estimator_

# Predict house prices for the test data with the best model.
y_pred = best_model.predict(X_test)

# Show the predictions and the winning hyper-parameters.
print("Predicted prices:", y_pred)
print(f'Best parameters: {grid_search.best_params_}')
