In [28]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [29]:
# Load the Iris dataset bundle, then pull out the feature matrix (X)
# and the target label vector (y) in a single unpacking step.
iris_data = load_iris()
X, y = iris_data.data, iris_data.target

In [30]:
# Split the dataset into a training part and a testing part.
# train_test_split returns four pieces, in this order:
#   - X_train : 80% of the feature rows, used to train the model
#   - X_test  : 20% of the feature rows, used to evaluate the model
#   - y_train : the target labels that belong to X_train
#   - y_test  : the target labels that belong to X_test
#
# test_size=0.2   -> 20% of the rows are held out as test data
# random_state=42 -> the split is identical every time the cell is run
#
#                  ┌──────────────────────────────┐
#                  │    Full dataset (X, y)       │
#                  │    features: X, labels: y    │
#                  └──────────────┬───────────────┘
#                                 │ train_test_split(test_size=0.2)
#                 ┌───────────────┴───────────────┐
#                 ▼                               ▼
#      ┌─────────────────────┐         ┌─────────────────────┐
#      │ Training data (80%) │         │ Testing data (20%)  │
#      │  X_train, y_train   │         │  X_test, y_test     │
#      └─────────────────────┘         └─────────────────────┘
#
# Features and labels are split together, so row i of X_train always
# corresponds to entry i of y_train (and likewise for the test pair).

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Learn each feature's mean and standard deviation from the training split
# only, and standardize the training data with those statistics in one step.
scaler = StandardScaler()
X_train_scaled_array = scaler.fit_transform(X_train)

# Reuse the statistics learned from the training split for the test split;
# the scaler is deliberately NOT re-fitted on the test data.
X_test_scaled_array = scaler.transform(X_test)

# Wrap the scaled arrays in DataFrames so the columns keep their feature names.
feature_columns = iris_data.feature_names
X_train_scaled = pd.DataFrame(X_train_scaled_array, columns=feature_columns)
X_test_scaled = pd.DataFrame(X_test_scaled_array, columns=feature_columns)

In [34]:
# Summary statistics of the training features before and after scaling.
# After scaling, every column should have a mean close to 0 and a
# standard deviation close to 1.
summary_before = pd.DataFrame(X_train, columns=iris_data.feature_names).describe()
summary_after = X_train_scaled.describe()

print("=== SEBELUM SCALING ===")
print(summary_before)
print("=== SETELAH SCALING ===")
print(summary_after)

=== SEBELUM SCALING ===
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         120.000000        120.000000         120.000000   
mean            5.809167          3.061667           3.726667   
std             0.823805          0.449123           1.752345   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.500000   
50%             5.750000          3.000000           4.250000   
75%             6.400000          3.400000           5.100000   
max             7.700000          4.400000           6.700000   

       petal width (cm)  
count        120.000000  
mean           1.183333  
std            0.752289  
min            0.100000  
25%            0.300000  
50%            1.300000  
75%            1.800000  
max            2.500000  
=== SETELAH SCALING ===
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count       1.200000e+02      1.200000e+02       1.200000e+02   
me

In [35]:
# Preview the first rows of the training features before and after scaling.
unscaled_preview = pd.DataFrame(X_train, columns=iris_data.feature_names).head()
scaled_preview = X_train_scaled.head()

# sep="\n" places each table on the line right below its heading.
print("Contoh data sebelum scaling:", unscaled_preview, sep="\n")
print("\nContoh data setelah scaling:", scaled_preview, sep="\n")

Contoh data sebelum scaling:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                4.6               3.6                1.0               0.2
1                5.7               4.4                1.5               0.4
2                6.7               3.1                4.4               1.4
3                4.8               3.4                1.6               0.2
4                4.4               3.2                1.3               0.2

Contoh data setelah scaling:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -1.473937          1.203658          -1.562535         -1.312603
1          -0.133071          2.992376          -1.276006         -1.045633
2           1.085898          0.085709           0.385858          0.289218
3          -1.230143          0.756479          -1.218701         -1.312603
4          -1.717731          0.309299          -1.390618         -1.312603


In [36]:
# Print the per-feature statistics the scaler learned from the training split.
# NOTE: scale_ is the population standard deviation (ddof=0), so it is slightly
# smaller than the sample standard deviation (ddof=1) that DataFrame.describe()
# reports for the same data.
for heading, fitted_values in (
    ("Mean yang dihitung scaler:", scaler.mean_),
    ("\nStandard deviation yang dihitung scaler:", scaler.scale_),
):
    print(heading)
    print(fitted_values)

Mean yang dihitung scaler:
[5.80916667 3.06166667 3.72666667 1.18333333]

Standard deviation yang dihitung scaler:
[0.82036535 0.44724776 1.74502786 0.74914766]
