In [None]:
# Import library yang diperlukan
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Winsorization

In [None]:
import pandas as pd
from scipy.stats.mstats import winsorize

# Example DataFrame: each column ends with two extreme values (outliers)
data = dict(
    A=[1, 2, 3, 4, 5, 6, 7, 8, 9, 100, 250],
    B=[10, 12, 14, 16, 18, 20, 22, 24, 26, 300, 700],
)
df = pd.DataFrame(data)

# Winsorize every numeric column: the lowest 10% and the highest 10% of the
# values are replaced by the nearest remaining value instead of being dropped.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_columns:
    df[col] = winsorize(df[col], limits=(0.1, 0.1))

print("DataFrame setelah Winsorization:")
print(df)

DataFrame setelah Winsorization:
   A   B
0  2  12
1  2  12
2  3  14
3  4  16
4  5  18
5  6  20
6  7  22
7  8  24
8  9  26
9  9  26


# Base Estimator

## Transformer Kustom

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class AddFeatureTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends one derived feature column.

    The appended column is the row-wise sum of the input features multiplied
    by ``multiplier``.

    Parameters
    ----------
    multiplier : int or float, default=1
        Factor applied to the row-wise sum before it is appended.
    """

    def __init__(self, multiplier=1):
        self.multiplier = multiplier

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (kept for the estimator API)."""
        return self

    def transform(self, X):
        """Return ``X`` with the scaled row-sum appended as the last column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features + 1)
        """
        # Convert up front so DataFrames work too: np.sum on a DataFrame
        # returns a Series, which has no .reshape().
        X = np.asarray(X)
        # keepdims=True keeps the sum as a column vector, ready for hstack.
        new_feature = X.sum(axis=1, keepdims=True) * self.multiplier
        return np.hstack((X, new_feature))


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# 3 samples x 2 features: [[1, 2], [3, 4], [5, 6]]
X = np.arange(1, 7).reshape(3, 2)

# Append the custom feature (row sum * 2) first, then standardise every column
pipeline = Pipeline(steps=[
    ("add_feature", AddFeatureTransformer(multiplier=2)),
    ("scaler", StandardScaler()),
])

X_transformed = pipeline.fit_transform(X)
print(X_transformed)


## Classifier Kustom

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class DummyClassifier(BaseEstimator, ClassifierMixin):
    """Baseline classifier that predicts one fixed class for every sample.

    Parameters
    ----------
    constant_class : scalar, default=0
        The label returned for every row passed to ``predict``.

    Attributes
    ----------
    classes_ : numpy.ndarray
        Unique labels seen in ``fit``.
    """

    def __init__(self, constant_class=0):
        self.constant_class = constant_class

    def fit(self, X, y):
        """Record the unique labels in ``y``; ``X`` is not used."""
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        """Return an array of length ``n_samples`` filled with ``constant_class``."""
        # Fall back to len() so plain Python sequences are accepted as well.
        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        return np.full((n_samples,), self.constant_class)

    def score(self, X, y, sample_weight=None):
        """Return the (optionally weighted) accuracy of the constant prediction.

        ``sample_weight`` mirrors the signature of ``ClassifierMixin.score`` so
        callers that pass weights keep working; ``None`` gives plain accuracy.
        """
        correct = self.predict(X) == np.asarray(y)
        return np.average(correct, weights=sample_weight)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Example dataset: 100 samples with 2 uniform random features and random
# binary labels. A seeded generator keeps the data, and therefore the printed
# accuracy, identical on every Restart & Run All.
rng = np.random.default_rng(42)
X = rng.random((100, 2))
y = rng.choice([0, 1], size=100)

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline that always predicts class 1
dummy_clf = DummyClassifier(constant_class=1)
dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))


# Other

In [None]:
# Columns that contain at least one missing value
null_columns = train_df.columns[train_df.isnull().sum() > 0]

# Split them by kind. select_dtypes(include=["number"]) matches every numeric
# width (int32, float32, nullable ints, ...), whereas comparing a dtype with
# the strings 'int' / 'float' only matches the platform-default int and float64.
null_numeric = train_df[null_columns].select_dtypes(include=["number"]).columns.tolist()
null_obj = train_df[null_columns].select_dtypes(include=["object"]).columns.tolist()

# Report both groups
print("Null Numeric:", null_numeric)
print("Null String:", null_obj)

In [None]:
# 
# Group all columns by kind. select_dtypes(include=["number"]) matches every
# numeric width (int32, float32, nullable ints, ...), whereas comparing a dtype
# with the strings 'int' / 'float' only matches the platform-default int and
# float64. Column order is preserved.
num_cols = train_df.select_dtypes(include=["number"]).columns.tolist()
obj_cols = train_df.select_dtypes(include=["object"]).columns.tolist()

# Report both groups
print("Numeric Cols:", num_cols)
print("String Cols:", obj_cols)

In [None]:
# Keep the column labels as they were before preprocessing
train_original = train_df.columns

# Numeric columns: fill missing values with the column mean, then apply
# OutlierRemover.
# NOTE(review): SimpleImputer and OutlierRemover are not imported/defined in
# the cells shown here — confirm they are available before this cell runs.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("outlier_removal", OutlierRemover(factor=1.5)),
])

# String columns: fill missing values with the most frequent value
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

# Combine both pipelines; columns listed in neither group are dropped
prep_stage_1 = ColumnTransformer(
    [
        ("num", numerical_pipeline, num_cols),
        ("cat", categorical_pipeline, obj_cols),
    ],
    verbose_feature_names_out=True,
    remainder="drop",
)

In [None]:
# Fit stage 1 on the training data and wrap the resulting array in a DataFrame
# labelled with the transformer's output feature names.
train_df = pd.DataFrame(
    prep_stage_1.fit_transform(train_df),
    columns=prep_stage_1.get_feature_names_out(),
)

# Strip the transformer prefix (e.g. "num__", "cat__") from every column label;
# a label without "__" is kept unchanged.
clean_columns = [label.split("__", 1)[-1] for label in train_df.columns]
train_df.columns = clean_columns

In [None]:
# Show the number of missing values for each column that still has any
null_counts = train_df.isnull().sum()
null_columns = null_counts[null_counts > 0]
print(null_columns)
train_df.info()

In [None]:
# Replace train_df with the helper's result, then print the resulting schema.
# NOTE(review): convert_object_columns_to_numeric is not defined in the cells
# shown here — presumably it casts object columns that hold numbers back to a
# numeric dtype; confirm against its definition.
train_df = convert_object_columns_to_numeric(train_df)
train_df.info()

In [None]:
# Row count before the outlier step
print(f'Total Rows: {len(train_df)}')

# Select the numeric columns only
num_cols = train_df.select_dtypes(include="number").columns

# Single-step pipeline that applies OutlierRemover to the numeric columns
outlier_pipeline = Pipeline([("outlier_removal", OutlierRemover(factor=1.5))])

# Write the transformed values back into the same columns.
# NOTE(review): assigning into existing columns cannot change the number of
# rows, so the two printed counts will match — confirm what OutlierRemover
# returns (clipped values vs. fewer rows).
train_df[num_cols] = outlier_pipeline.fit_transform(train_df[num_cols])

# Row count after the outlier step
print(f'Total Rows: {len(train_df)}')

In [None]:
# Ordinal columns that should get label/ordinal encoding
encoding_set = {'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond',
                'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual',
                'FireplaceQu', 'GarageQual', 'GarageCond'}

# Group the columns by dtype. select_dtypes(include=["number"]) matches every
# numeric width (int32, float32, nullable ints, ...), whereas comparing a dtype
# with the strings 'int' / 'float' only matches the platform-default int and
# float64. Column order is preserved.
train_numeric_cols = train_df.select_dtypes(include=["number"]).columns.tolist()

# Object columns are ordinal-encoded when listed in encoding_set, otherwise
# they are one-hot encoded.
object_cols = train_df.select_dtypes(include=["object"]).columns
train_ordinal_cols = [col for col in object_cols if col in encoding_set]
train_one_hot_cols = [col for col in object_cols if col not in encoding_set]

# Show the result
print("Ordinal Encoding Columns:", train_ordinal_cols)
print("One-Hot Encoding Columns:", train_one_hot_cols)
print("Numeric Columns:", train_numeric_cols)