In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes, load_breast_cancer

# Load Data

In [4]:
diabetes = load_diabetes()
kanker = load_breast_cancer()

In [5]:
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

## Deksripsi dataset
`load_diabetes()` dari `sklearn.datasets` adalah dataset **regresi** tentang penyakit diabetes. Dataset ini punya **10 fitur numerik** (sudah di-*standardize*, rata-rata = 0, var = 1) yang merepresentasikan kondisi pasien.

Berikut representasi kolom/fiturnya:

| Fitur (nama) | Deskripsi                                                               |
| ------------ | ----------------------------------------------------------------------- |
| `age`        | Usia pasien                                                             |
| `sex`        | Jenis kelamin pasien                                                    |
| `bmi`        | Body Mass Index (indikator obesitas)                                    |
| `bp`         | Rata-rata tekanan darah                                                 |
| `s1`         | Serum total kolesterol                                                  |
| `s2`         | Serum LDL (low-density lipoprotein)                                     |
| `s3`         | Serum HDL (high-density lipoprotein)                                    |
| `s4`         | Serum trigliserida                                                      |
| `s5`         | Serum level logaritma (mungkin terkait gula darah atau kolesterol lain) |
| `s6`         | Gula darah (blood sugar level)                                          |

👉 **Target (y):** perkembangan penyakit setelah 1 tahun, berupa nilai numerik (indikator progresi diabetes).

Contoh penggunaannya:

```python
from sklearn.datasets import load_diabetes
import pandas as pd

data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

print(X.head())
print(y[:5])
```

Kalau kamu mau, aku bisa bikinkan tabel ringkas dari `data.feature_names` langsung dalam bentuk `DataFrame` biar lebih jelas. Mau aku bikinkan?


# Model Regresi

In [6]:
diabetes_data = pd.DataFrame(
    diabetes.data,
    columns=diabetes.feature_names,
)

diabetes_data.head()


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [7]:
y = diabetes.target

In [8]:
y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## Pembagian dataset

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    diabetes_data,
    y,
    test_size=0.2,
    random_state=42
)

## Model

In [15]:
regresi = LinearRegression()
regresi.fit(X_train, y_train)

In [20]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge = Ridge()
grid = GridSearchCV(
    ridge, 
    param_grid, 
    cv=5,   # 5-fold cross validation
    scoring='neg_mean_squared_error'
)
grid.fit(X_train, y_train)

print("Best alpha:", grid.best_params_)
print("Best CV score (MSE):", -grid.best_score_)


Best alpha: {'alpha': 0.1}
Best CV score (MSE): 3125.1907746343486


In [21]:
best_ridge = grid.best_estimator_
y_pred = best_ridge.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)


Test MSE: 2856.4868876706546


In [32]:
rf = RandomForestRegressor(
    n_estimators=100,   # jumlah pohon
    max_depth=None,     # kedalaman pohon (None = bebas)
    random_state=42
)

rf.fit(X_train, y_train)

## Evaluasi

In [33]:
y_train_predict = rf.predict(X_train)
rmse = (np.sqrt(mean_squared_error(y_train, y_train_predict)))
r2 = r2_score(y_train, y_train_predict)

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

# model evaluation for testing set
y_test_predict = rf.predict(X_test)
rmse = (np.sqrt(mean_squared_error(y_test, y_test_predict)))
r2 = r2_score(y_test, y_test_predict)

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))

The model performance for training set
--------------------------------------
RMSE is 0.07315056407934001
R2 score is 0.9771536993612032


The model performance for testing set
--------------------------------------
RMSE is 0.18487707311546958
R2 score is 0.8531093915343916


In [34]:
y_test_predict

array([1.  , 0.  , 0.  , 1.  , 1.  , 0.  , 0.  , 0.14, 0.19, 0.86, 0.7 ,
       0.01, 0.97, 0.1 , 1.  , 0.01, 0.94, 1.  , 1.  , 0.  , 0.9 , 1.  ,
       0.  , 1.  , 1.  , 0.98, 1.  , 0.78, 1.  , 0.  , 0.97, 1.  , 0.66,
       0.98, 1.  , 1.  , 0.28, 0.99, 0.  , 0.89, 1.  , 0.  , 1.  , 1.  ,
       0.72, 1.  , 0.8 , 0.7 , 1.  , 0.99, 0.  , 0.  , 0.81, 0.91, 1.  ,
       1.  , 1.  , 0.  , 0.1 , 1.  , 1.  , 0.  , 0.01, 0.97, 1.  , 0.96,
       0.  , 0.01, 1.  , 0.95, 0.02, 0.  , 1.  , 0.  , 0.97, 0.97, 0.96,
       0.59, 1.  , 0.89, 0.  , 1.  , 0.84, 0.  , 0.18, 0.02, 0.02, 0.  ,
       1.  , 1.  , 1.  , 0.74, 0.74, 0.9 , 1.  , 1.  , 0.  , 0.  , 1.  ,
       0.01, 0.08, 1.  , 0.06, 0.  , 0.99, 1.  , 0.99, 0.  , 0.26, 0.92,
       0.  , 1.  , 0.79, 0.  , 0.67, 0.  , 1.  , 0.83, 0.99, 0.  , 0.56,
       1.  , 1.  , 0.  , 0.99, 0.  , 0.  , 1.  , 1.  , 0.  , 0.25, 0.  ,
       0.83, 1.  , 0.81, 0.17, 0.31, 1.  , 0.61, 0.02, 1.  , 0.  , 1.  ,
       1.  , 0.  , 1.  , 0.  , 0.  , 0.28, 1.  , 0.

# Model Klasifikasi

In [35]:
kanker_data = pd.DataFrame(kanker.data,
                      columns = kanker.feature_names)
kanker_data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [36]:
y = kanker.target

## Pembagian dataset

In [37]:
X_train, X_test, y_train, y_test = train_test_split(
    kanker_data, y, test_size=0.3, random_state=42)

## Model

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# reglog = LogisticRegression()
# reglog.fit(X_train, y_train)

rf = RandomForestClassifier(
    n_estimators=100,   # jumlah pohon
    max_depth=None,     # kedalaman pohon (None = bebas)
    random_state=42
)

rf.fit(X_train, y_train)

## Evaluasi

In [39]:
y_train_predict = rf.predict(X_train)
akurasi = (accuracy_score(y_train, y_train_predict))

print("The model performance for training set")
print("--------------------------------------")
print('Akurasi adalah {}'.format(akurasi))

y_test_predict = rf.predict(X_test)
akurasi = (accuracy_score(y_test, y_test_predict))

print("The model performance for test set")
print("--------------------------------------")
print('Akurasi adalah {}'.format(akurasi))


The model performance for training set
--------------------------------------
Akurasi adalah 1.0
The model performance for test set
--------------------------------------
Akurasi adalah 0.9707602339181286


In [28]:
print((classification_report(y_test, y_test_predict)))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



In [29]:
print((confusion_matrix(y_test, y_test_predict)))

[[ 59   4]
 [  1 107]]
