In [None]:
# Import library yang diperlukan
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Sample

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Example datasets (train_df and test_df); None marks a missing value.
train_df = pd.DataFrame({
    'LotArea': [9600, None, 12000, 8500],
    'SalePrice': [200000, 150000, None, 130000]
})

test_df = pd.DataFrame({
    'LotArea': [None, 8500, 6000, None],
    'SalePrice': [None, 170000, 120000, None]
})

# Columns that receive mean imputation.
impute_cols = ['LotArea', 'SalePrice']

# Learn the column means from the training rows only. Fitting on the
# concatenated train+test frame would let test-set statistics leak into the
# values used to fill the training data.
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_df[impute_cols])

# Stack both frames under a keyed MultiIndex so a single transform call
# covers them and they can be split apart again afterwards.
combined_df = pd.concat([train_df, test_df], keys=['train', 'test'])

# Fill the missing values of every row with the train-derived means.
combined_df[impute_cols] = imputer.transform(combined_df[impute_cols])
print("Combine DataFrame:")
print(combined_df)

# Split back into train and test; copy so the results are independent frames
# rather than possible views of combined_df.
train_df = combined_df.xs(key='train').copy()
test_df = combined_df.xs(key='test').copy()

# Imputation results
print("\nTrain DataFrame:")
print(train_df)
print("\nTest DataFrame:")
print(test_df)


# Sample2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Toy housing data; the last row has no recorded size.
data = {
    'size': [1500, 1800, 2400, 3000, 3500, None],
    'location': ['City', 'Suburb', 'City', 'Suburb', 'City', 'Suburb'],
    'price': [400000, 450000, 500000, 550000, 600000, 650000],
}

df = pd.DataFrame(data)

# 'price' is the target; every other column is a feature.
y = df['price']
X = df.drop(columns='price')

# Hold out a third of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Column groups handled by each preprocessing branch.
numeric_features = ['size']
categorical_features = ['location']

# Numeric branch: fill gaps with the mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the mode, then one-hot encode.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Preprocessing followed by a linear regression model.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Train on the training split, then predict prices for the held-out rows.
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

# Report the mean squared error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


# Sample8

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer that removes outlier values using the IQR rule.
class OutlierFilter(BaseEstimator, TransformerMixin):
    """Blank out values that lie outside per-column IQR fences.

    For every column the fences are ``Q1 - threshold * IQR`` and
    ``Q3 + threshold * IQR``. They are learned in ``fit`` and reused in
    ``transform``, so data passed to ``transform`` is judged against the
    fences of the data the filter was fitted on.

    Outlying values are replaced with NaN instead of dropping their rows.
    Dropping rows changes the number of samples, which breaks a pipeline:
    the output no longer lines up with the target or with the other
    branches of a ColumnTransformer. Replacing with NaN keeps the row count
    and lets a later imputation step fill the removed values.

    Parameters
    ----------
    threshold : float, default=1.5
        Multiple of the IQR that places the fences below Q1 and above Q3.

    Attributes
    ----------
    lower_bounds_ : ndarray of shape (n_columns,)
        Lower fence per column, set by ``fit``.
    upper_bounds_ : ndarray of shape (n_columns,)
        Upper fence per column, set by ``fit``.
    """

    def __init__(self, threshold=1.5):
        self.threshold = threshold

    @staticmethod
    def _to_2d_float(X):
        """Return X as a 2-D float ndarray (a 1-D input becomes one column)."""
        values = np.asarray(X, dtype=float)
        if values.ndim == 1:
            values = values.reshape(-1, 1)
        return values

    def fit(self, X, y=None):
        """Learn the lower and upper fence of every column of X.

        Missing values are ignored when computing the quartiles.
        Returns ``self``.
        """
        values = self._to_2d_float(X)
        q1 = np.nanquantile(values, 0.25, axis=0)
        q3 = np.nanquantile(values, 0.75, axis=0)
        iqr = q3 - q1
        self.lower_bounds_ = q1 - self.threshold * iqr
        self.upper_bounds_ = q3 + self.threshold * iqr
        return self

    def transform(self, X):
        """Return X with every value outside the learned fences set to NaN.

        The number of rows and columns is unchanged. A DataFrame input yields
        a float DataFrame with the same index and columns; any other input
        yields a 2-D float ndarray. Values that are already NaN stay NaN.
        """
        values = self._to_2d_float(X)
        # Comparisons against NaN are False; silence the warning that some
        # NumPy versions emit for them.
        with np.errstate(invalid='ignore'):
            in_range = (values >= self.lower_bounds_) & (values <= self.upper_bounds_)
        masked = np.where(in_range, values, np.nan)
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(masked, index=X.index, columns=X.columns)
        return masked

# OrdinalEncoder is used below but is not among the imports visible in this
# notebook, so it is imported here to avoid a NameError on a fresh kernel.
from sklearn.preprocessing import OrdinalEncoder

# NOTE(review): numeric_cols, ordinal_encoding_cols and one_hot_encoding_cols
# are not defined in the visible part of this notebook — confirm they are
# defined before this cell runs.

# Numeric branch, now starting with the outlier filter step.
numerical_pipeline = Pipeline(steps=[
    ('outlier_filter', OutlierFilter(threshold=1.5)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Ordinal branch: fill gaps with the mode, then integer-encode; categories
# not seen during fit are encoded as -1.
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Nominal branch: fill gaps with the mode, then dense one-hot encode;
# categories not seen during fit are ignored.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group to its branch; columns not listed are dropped.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numeric_cols),
    ('ordinal', ordinal_pipeline, ordinal_encoding_cols),
    ('cat', categorical_pipeline, one_hot_encoding_cols)
], remainder='drop')

# Full pipeline: preprocessing followed by the regression model.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


# Winsorization

In [5]:
import pandas as pd
from scipy.stats.mstats import winsorize

# Demo frame: the last value of each column is an extreme outlier.
data = {
    'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 100],
    'B': [10, 12, 14, 16, 18, 20, 22, 24, 26, 300],
}
df = pd.DataFrame(data)

# Winsorize every numeric column: the lowest 10% and highest 10% of values
# are replaced by the nearest value that is kept.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for column_name in numeric_columns:
    clipped = winsorize(df[column_name], limits=[0.1, 0.1])
    df[column_name] = clipped

print("DataFrame setelah Winsorization:")
print(df)


DataFrame setelah Winsorization:
   A   B
0  2  12
1  2  12
2  3  14
3  4  16
4  5  18
5  6  20
6  7  22
7  8  24
8  9  26
9  9  26


# Base Estimator

## Transformer Kustom

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class AddFeatureTransformer(BaseEstimator, TransformerMixin):
    """Append one column holding the row-wise sum of the inputs times ``multiplier``.

    Parameters
    ----------
    multiplier : number, default=1
        Factor applied to each row sum before it is appended.
    """

    def __init__(self, multiplier=1):
        self.multiplier = multiplier

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return X with the scaled row-sum column appended on the right.

        X may be any 2-D array-like. It is converted to an ndarray first,
        because the row sum of a DataFrame is a Series, which has no
        ``reshape`` method.
        """
        features = np.asarray(X)
        # keepdims=True yields an (n_rows, 1) column ready for hstack.
        row_totals = features.sum(axis=1, keepdims=True)
        return np.hstack((features, row_totals * self.multiplier))


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Three samples with two features each.
X = np.array([[1, 2], [3, 4], [5, 6]])

# Chain the custom transformer with a standard scaler.
add_feature_step = ('add_feature', AddFeatureTransformer(multiplier=2))
scaler_step = ('scaler', StandardScaler())
pipeline = Pipeline(steps=[add_feature_step, scaler_step])

# Fit both steps on X and show the transformed matrix.
X_transformed = pipeline.fit_transform(X)
print(X_transformed)


## Classifier Kustom

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class DummyClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that ignores its inputs and always predicts ``constant_class``.

    Parameters
    ----------
    constant_class : label, default=0
        The label returned for every sample.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    """

    def __init__(self, constant_class=0):
        self.constant_class = constant_class

    def fit(self, X, y):
        """Record the unique labels in y; X is not used. Returns ``self``."""
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        """Return an array with one ``constant_class`` entry per row of X."""
        # np.shape also handles inputs without a .shape attribute (e.g. lists).
        n_samples = np.shape(X)[0]
        return np.full((n_samples,), self.constant_class)

    def score(self, X, y, sample_weight=None):
        """Return the (optionally weighted) accuracy of the predictions on X.

        ``sample_weight`` defaults to None, which gives the plain mean
        accuracy; the parameter is accepted so callers that pass per-sample
        weights to ``score`` keep working.
        """
        hits = self.predict(X) == np.asarray(y)
        return np.average(hits, weights=sample_weight)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Synthetic dataset drawn from a seeded generator, so rerunning the cell
# produces the same data and the same accuracy.
rng = np.random.default_rng(42)
X = rng.random((100, 2))
y = rng.choice([0, 1], size=100)

# Hold out 30% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the constant-prediction classifier and predict on the held-out samples.
dummy_clf = DummyClassifier(constant_class=1)
dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
