# Winsorization

In [None]:
import pandas as pd
from scipy.stats.mstats import winsorize

# Example DataFrame: each column ends with two values far above the rest.
data = {
    'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 100, 250],  # column with outliers
    'B': [10, 12, 14, 16, 18, 20, 22, 24, 26, 300, 700]  # another column with outliers
}
df = pd.DataFrame(data)

# Winsorize every numeric column.
# 'number' matches all numeric dtypes; the 'int'/'float' aliases only match the
# default-width integer and float dtypes, so e.g. unsigned or small-width
# integer columns would silently be skipped.
# limits=[0.1, 0.1] replaces the lowest and highest 10% of each column with the
# nearest retained value; with 11 rows that is int(0.1 * 11) = 1 value per tail.
for col in df.select_dtypes(include='number').columns:
    df[col] = winsorize(df[col], limits=[0.1, 0.1])

print("DataFrame setelah Winsorization:")
print(df)

DataFrame setelah Winsorization:
   A   B
0  2  12
1  2  12
2  3  14
3  4  16
4  5  18
5  6  20
6  7  22
7  8  24
8  9  26
9  9  26


# Distribution

In [None]:
import pandas as pd

# Two categorical variables held as plain lists; the lists are paired
# element-wise (position i of each list belongs to the same observation).
jenis_kendaraan = [
    'Mobil', 'Mobil', 'Motor', 'Sepeda', 'Motor', 'Mobil', 'Sepeda',
]
status_rumah = [
    'Memiliki Rumah',
    'Tidak Memiliki Rumah',
    'Memiliki Rumah',
    'Memiliki Rumah',
    'Tidak Memiliki Rumah',
    'Tidak Memiliki Rumah',
    'Memiliki Rumah',
]

# Build the contingency table: vehicle type on the rows, home-ownership
# status on the columns, cell values are co-occurrence counts.
tabel_kontingensi = pd.crosstab(index=jenis_kendaraan, columns=status_rumah)
print(type(tabel_kontingensi))
tabel_kontingensi


<class 'pandas.core.frame.DataFrame'>


col_0,Memiliki Rumah,Tidak Memiliki Rumah
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Mobil,1,2
Motor,1,1
Sepeda,2,0


In [None]:
# chi2_contingency is not imported by any earlier cell, so import it here;
# without this the cell raises NameError on a fresh kernel.
from scipy.stats import chi2_contingency

# Chi-square test of independence on the contingency table.
# The call returns the test statistic, the p-value, the degrees of freedom and
# the table of expected frequencies; only the first two are reported below.
# NOTE(review): the table holds just 7 observations, so the expected counts are
# well below the usual rule-of-thumb minimum of 5 - treat the p-value as
# illustrative only.
chi2, p, dof, expected = chi2_contingency(tabel_kontingensi)

print(f"Chi-Square Statistic: {chi2}")
print(f"P-Value: {p}")


Chi-Square Statistic: 2.236111111111111
P-Value: 0.32691484476680416


# Ensemble - Voting Method

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# Sample housing dataset. It contains one repeated row, one very large
# Luas_Rumah value (1200) and a final row with missing Luas_Rumah and Lokasi.
data = dict(
    Luas_Rumah=[50, 80, 120, 200, 50, 100, 1200, 80, np.nan],
    Jumlah_Kamar=[2, 3, 4, 5, 2, 3, 6, 3, 3],
    Lokasi=["Pusat", "Pinggir", "Pinggir", "Pusat", "Pusat", "Pinggir", "Pinggir", "Pusat", np.nan],
    Usia_Bangunan=[5, 10, 15, 20, 5, 10, 3, 8, 10],
    Harga_Rumah=[500, 700, 1000, 1500, 500, 800, 5000, 600, 700],
)
df = pd.DataFrame(data)

In [None]:
# Drop exact duplicate rows.
df = df.drop_duplicates()

# Fill missing values: the median for the numeric area column and the most
# frequent value for the categorical location column.
# The fills are passed as a per-column mapping and the result is reassigned.
# Calling fillna(..., inplace=True) on df[col] acts on an intermediate object:
# it raises a pandas FutureWarning and no longer updates df in pandas 3.0.
df = df.fillna({
    'Luas_Rumah': df['Luas_Rumah'].median(),
    'Lokasi': df['Lokasi'].mode()[0],
})

# Remove outliers in Luas_Rumah with the IQR rule: keep only rows inside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (both ends inclusive).
Q1 = df['Luas_Rumah'].quantile(0.25)
Q3 = df['Luas_Rumah'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[df['Luas_Rumah'].between(lower_bound, upper_bound)]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Luas_Rumah'].fillna(df['Luas_Rumah'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Lokasi'].fillna(df['Lokasi'].mode()[0], inplace=True)


In [None]:
# Column groups used by the preprocessing step: numeric columns are scaled,
# categorical columns are one-hot encoded.
cat_features = ['Lokasi']
num_features = ['Luas_Rumah', 'Jumlah_Kamar', 'Usia_Bangunan']

# Separate the target (Harga_Rumah) from the predictors, then hold out 20% of
# the rows for testing; random_state pins the shuffle so the split is repeatable.
y = df['Harga_Rumah']
X = df.drop(columns=['Harga_Rumah'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preprocessing shared by every model pipeline:
#   - 'num': standardize the numeric columns
#   - 'cat': one-hot encode the categorical columns; handle_unknown='ignore'
#            lets the encoder accept categories it did not see during fit
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
])

In [None]:
# Ridge regression: preprocessing and estimator live in one pipeline so the
# grid search tunes and evaluates them together.
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # The seed is fixed on the estimator instead of being searched over:
    # random_state is not a hyperparameter (it only seeds the data shuffling of
    # the 'sag'/'saga' solvers), and letting the search pick None made the
    # selected model non-reproducible while doubling the number of fits.
    ('model', Ridge(random_state=42))
])

# Hyperparameter grid for GridSearchCV (keys address the 'model' pipeline step).
param_grid_ridge = {
    'model__alpha': [0.01, 0.1, 1, 10, 100],
    'model__max_iter': [50000, 100000, 200000],
    'model__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    'model__fit_intercept': [True, False],
    'model__tol': [1e-4, 1e-3, 1e-2]
}

# Exhaustive search scored by negative MAE (higher is better).
# error_score='raise' surfaces a failing fit immediately instead of scoring it NaN.
# NOTE(review): the training split built from the sample data is very small, so
# cv=5 leaves very few rows per validation fold - confirm this is intended.
ridge_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid_ridge,
    scoring='neg_mean_absolute_error',
    cv=5,
    error_score='raise',
    verbose=1
)
ridge_search.fit(X_train, y_train)
print(f"Best Ridge Parameters: {ridge_search.best_params_}")

Fitting 5 folds for each of 1260 candidates, totalling 6300 fits
Best Ridge Parameters: {'model__alpha': 0.01, 'model__fit_intercept': True, 'model__max_iter': 50000, 'model__random_state': None, 'model__solver': 'saga', 'model__tol': 0.01}


In [None]:
# Lasso regression: preprocessing and estimator in one pipeline.
lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # The seed is fixed on the estimator instead of being searched over:
    # random_state is not a hyperparameter (it only matters when
    # selection='random'), and letting the search pick None made the selected
    # model non-reproducible while doubling the number of fits.
    ('model', Lasso(random_state=42))
])

# Hyperparameter grid for GridSearchCV (keys address the 'model' pipeline step).
param_grid_lasso = {
    'model__alpha': [0.01, 0.1, 1, 10, 100],
    'model__max_iter': [50000, 100000, 200000],
    'model__fit_intercept': [True, False],
    'model__tol': [1e-4, 1e-3, 1e-2],
    'model__selection': ['cyclic', 'random']
}

# Exhaustive search scored by negative MAE (higher is better).
# error_score='raise' surfaces a failing fit immediately instead of scoring it NaN.
lasso_search = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=param_grid_lasso,
    scoring='neg_mean_absolute_error',
    cv=5,
    error_score='raise',
    verbose=1
)
lasso_search.fit(X_train, y_train)
print(f"Best Lasso Parameters: {lasso_search.best_params_}")

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best Lasso Parameters: {'model__alpha': 1, 'model__fit_intercept': True, 'model__max_iter': 200000, 'model__random_state': None, 'model__selection': 'random', 'model__tol': 0.0001}


In [None]:
# ElasticNet regression: preprocessing and estimator in one pipeline.
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet())
])

# Hyperparameter grid for GridSearchCV (keys address the 'model' pipeline step).
# random_state is intentionally not part of the grid: ElasticNet only uses it
# when selection='random', and selection is left at its 'cyclic' default here,
# so searching over it merely fitted every candidate twice.
param_grid_elasticnet = {
    'model__alpha': [0.01, 0.1, 1, 10, 100],
    'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
    'model__fit_intercept': [True, False],
    'model__max_iter': [50000, 100000, 200000],
    'model__tol': [1e-4, 1e-3, 1e-2]
}

# Exhaustive search scored by negative MAE (higher is better).
# error_score='raise' surfaces a failing fit immediately instead of scoring it NaN.
elasticnet_search = GridSearchCV(
    estimator=elasticnet_pipeline,
    param_grid=param_grid_elasticnet,
    scoring='neg_mean_absolute_error',
    cv=5,
    error_score='raise',
    verbose=1
)
elasticnet_search.fit(X_train, y_train)
print(f"Best ElasticNet Parameters: {elasticnet_search.best_params_}")

Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Best ElasticNet Parameters: {'model__alpha': 0.1, 'model__fit_intercept': True, 'model__l1_ratio': 0.9, 'model__max_iter': 50000, 'model__random_state': None, 'model__tol': 0.01}


In [None]:
# Plain linear regression is not tuned, so it gets a default pipeline.
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

# Take the winning pipeline from each grid search.
ridge_best, lasso_best, elasticnet_best = (
    search.best_estimator_
    for search in (ridge_search, lasso_search, elasticnet_search)
)

# Ensemble of the four pipelines; VotingRegressor combines their predictions.
voting_regressor = VotingRegressor(estimators=[
    ('linear', linear_pipeline),
    ('ridge', ridge_best),
    ('lasso', lasso_best),
    ('elasticnet', elasticnet_best),
])

In [None]:
# Fit the ensemble on the training split and predict the held-out rows
# (fit returns the fitted estimator, so the two calls can be chained).
y_pred = voting_regressor.fit(X_train, y_train).predict(X_test)

# Report the mean absolute error on the test split.
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (Voting Regressor): {mae:.2f}")

Mean Absolute Error (Voting Regressor): 97.12
