# Sample 1

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import shapiro

In [None]:
# Contoh data pendapatan rumah tangga
np.random.seed(42)
low_income = np.random.exponential(scale=20000, size=1000)  # Mayoritas pendapatan rendah
high_income = np.random.normal(loc=100000, scale=20000, size=50)  # Pendapatan tinggi (outlier)
income_data = np.concatenate([low_income, high_income])

income_data = pd.DataFrame(income_data)
income_data.columns = ['income']
income_data.head()

In [None]:
# Visualisasi histogram
plt.figure(figsize=(8, 5))
plt.hist(income_data, bins=50, color='blue', alpha=0.7, edgecolor='black')
plt.title('Distribusi Pendapatan Rumah Tangga')
plt.xlabel('Pendapatan')
plt.ylabel('Frekuensi')
plt.show()

In [None]:
# Shapiro-Wilk Test
stat, p = shapiro(income_data)
print(f"Shapiro-Wilk Test: Statistic={stat}, p-value={p}")

if p > 0.05:
    print("Data berdistribusi normal")
else:
    print("Data tidak berdistribusi normal")

In [None]:
# Transformasi log
income_data_log = np.log(income_data + 1)  # Tambahkan 1 untuk menghindari log(0)

# Visualisasi distribusi setelah transformasi
plt.figure(figsize=(8, 5))
plt.hist(income_data_log, bins=50, color='green', alpha=0.7, edgecolor='black')
plt.title('Distribusi Pendapatan Setelah Transformasi Log')
plt.xlabel('Log(Pendapatan)')
plt.ylabel('Frekuensi')
plt.show()

# Uji normalitas lagi
stat, p = shapiro(income_data_log)
print(f"Setelah Transformasi Log - Shapiro-Wilk Test: Statistic={stat}, p-value={p}")
if p > 0.05:
    print("Data berdistribusi normal setelah transformasi")
else:
    print("Data tetap tidak berdistribusi normal")

In [None]:
from sklearn.cluster import KMeans

# Clustering pada data transformasi
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(np.array(income_data_log).reshape(-1, 1))

# Visualisasi hasil clustering
plt.figure(figsize=(8, 5))
plt.scatter(range(len(income_data_log)), income_data_log, c=clusters, cmap='viridis')
plt.title('Clustering Pendapatan Rumah Tangga')
plt.xlabel('Index Data')
plt.ylabel('Log(Pendapatan)')
plt.show()

In [8]:
# Memisahkan data berdasarkan cluster
cluster_0 = income_data_log[np.array(clusters) == 0]
cluster_1 = income_data_log[np.array(clusters) == 1]
cluster_2 = income_data_log[np.array(clusters) == 2]


In [None]:
from scipy.stats import shapiro

# Uji normalitas untuk setiap cluster
for i, cluster_data in enumerate([cluster_0, cluster_1, cluster_2]):
    stat, p = shapiro(cluster_data)
    print(f"Cluster {i}: Shapiro-Wilk Test Statistic={stat:.4f}, p-value={p:.4f}")
    if p > 0.05:
        print(f"Cluster {i} berdistribusi normal")
    else:
        print(f"Cluster {i} tidak berdistribusi normal")


In [None]:
import seaborn as sns

# Visualisasi distribusi per cluster
plt.figure(figsize=(12, 8))
for i, cluster_data in enumerate([cluster_0, cluster_1, cluster_2]):
    sns.histplot(cluster_data, bins=30, kde=True, label=f'Cluster {i}', alpha=0.6)

plt.title('Distribusi Log Pendapatan Per Cluster')
plt.xlabel('Log(Pendapatan)')
plt.ylabel('Frekuensi')
plt.legend()
plt.show()


# Sample 3

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Dataset contoh
data = {
    "Luas_Rumah": [50, 80, 120, 200, 50, 100, 1200, 80, np.nan],
    "Jumlah_Kamar": [2, 3, 4, 5, 2, 3, 6, 3, 3],
    "Lokasi": ["Pusat", "Pinggir", "Pinggir", "Pusat", "Pusat", "Pinggir", "Pinggir", "Pusat", np.nan],
    "Usia_Bangunan": [5, 10, 15, 20, 5, 10, 3, 8, 10],
    "Harga_Rumah": [500, 700, 1000, 1500, 500, 800, 5000, 600, 700]
}
df = pd.DataFrame(data)

In [3]:
# Menghapus duplikasi
df = df.drop_duplicates()

# Menangani null values
df['Luas_Rumah'].fillna(df['Luas_Rumah'].median(), inplace=True)
df['Lokasi'].fillna(df['Lokasi'].mode()[0], inplace=True)

# Mengatasi outliers dengan metode IQR
Q1 = df['Luas_Rumah'].quantile(0.25)
Q3 = df['Luas_Rumah'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['Luas_Rumah'] >= lower_bound) & (df['Luas_Rumah'] <= upper_bound)]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Luas_Rumah'].fillna(df['Luas_Rumah'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Lokasi'].fillna(df['Lokasi'].mode()[0], inplace=True)


In [4]:
# Membagi data
X = df.drop(columns=['Harga_Rumah'])
y = df['Harga_Rumah']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definisi fitur numerik dan kategori
num_features = ['Luas_Rumah', 'Jumlah_Kamar', 'Usia_Bangunan']
cat_features = ['Lokasi']

In [5]:
# ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)

In [6]:
# Ridge Regression
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Ridge())
])

# Hyperparameter Grid untuk GridSearchCV
param_grid_ridge = {
    'model__alpha': [0.01, 0.1, 1, 10, 100],
    'model__max_iter': [50000, 100000, 200000],
    'model__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 
    'model__fit_intercept': [True, False],
    'model__tol': [1e-4, 1e-3, 1e-2],
    'model__random_state': [None, 42]
}


ridge_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid_ridge,
    scoring='neg_mean_absolute_error',
    cv=5,
    error_score= 'raise',
    verbose=1
)
ridge_search.fit(X_train, y_train)
print(f"Best Ridge Parameters: {ridge_search.best_params_}")

Fitting 5 folds for each of 1260 candidates, totalling 6300 fits
Best Ridge Parameters: {'model__alpha': 0.01, 'model__fit_intercept': True, 'model__max_iter': 50000, 'model__random_state': None, 'model__solver': 'saga', 'model__tol': 0.01}


In [7]:
# Lasso Regression
lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Lasso())
])

param_grid_lasso = {
    'model__alpha': [0.01, 0.1, 1, 10, 100],
    'model__max_iter': [50000, 100000, 200000],
    'model__fit_intercept': [True, False],
    'model__tol': [1e-4, 1e-3, 1e-2],
    'model__selection': ['cyclic', 'random'],
    'model__random_state': [None, 42]
}

lasso_search = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=param_grid_lasso,
    scoring='neg_mean_absolute_error',
    cv=5,
    error_score = 'raise', 
    verbose=1
)
lasso_search.fit(X_train, y_train)
print(f"Best Lasso Parameters: {lasso_search.best_params_}")

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best Lasso Parameters: {'model__alpha': 1, 'model__fit_intercept': True, 'model__max_iter': 200000, 'model__random_state': None, 'model__selection': 'random', 'model__tol': 0.0001}


In [10]:
# ElasticNet Regression
elasticnet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet())
])

param_grid_elasticnet = {
    'model__alpha': [0.01, 0.1, 1, 10, 100],
    'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
    'model__fit_intercept': [True, False],
    'model__max_iter': [50000, 100000, 200000],
    'model__tol': [1e-4, 1e-3, 1e-2],
    'model__random_state': [None, 42]
}

elasticnet_search = GridSearchCV(
    estimator=elasticnet_pipeline,
    param_grid=param_grid_elasticnet,
    scoring='neg_mean_absolute_error',
    cv=5,
    error_score = 'raise', 
    verbose=1
)
elasticnet_search.fit(X_train, y_train)
print(f"Best ElasticNet Parameters: {elasticnet_search.best_params_}")

Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Best ElasticNet Parameters: {'model__alpha': 0.1, 'model__fit_intercept': True, 'model__l1_ratio': 0.9, 'model__max_iter': 50000, 'model__random_state': None, 'model__tol': 0.01}


In [11]:
# Membuat pipeline dengan parameter terbaik
ridge_best = ridge_search.best_estimator_
lasso_best = lasso_search.best_estimator_
elasticnet_best = elasticnet_search.best_estimator_

# Linear Regression tetap default
linear_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Voting Regressor dengan model terbaik
voting_regressor = VotingRegressor(
    estimators=[
        ('linear', linear_pipeline),
        ('ridge', ridge_best),
        ('lasso', lasso_best),
        ('elasticnet', elasticnet_best)
    ]
)

In [12]:
# Training Voting Regressor
voting_regressor.fit(X_train, y_train)

# Prediksi dan Evaluasi
y_pred = voting_regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (Voting Regressor): {mae:.2f}")

Mean Absolute Error (Voting Regressor): 97.12
