<a href="https://colab.research.google.com/github/S4putri/221230005-Pengantar-ML/blob/main/week-02/latihan_praktikum_2_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🏋️ LATIHAN 2: OPERASI NUMPY UNTUK MACHINE LEARNING

DATA PREPROCESSING DENGAN NUMPY

In [None]:
import numpy as np

'''TODO: Implementasi Preprocessing Pipeline'''
# Simulated dataset: 100 samples, 5 features
np.random.seed(42)  # Fix the seed so the random results are reproducible
X = np.random.randn(100, 5) * 10 + 5  # Standard-normal draws scaled by 10 and shifted by 5 (mean ≈ 5, std ≈ 10)


# TODO 1: Normalisasi Z-score: (x - mean) / std
def z_score_normalization(data):
    """Standardize each column of ``data`` with the z-score formula.

    Every column is shifted by its mean and divided by its (population)
    standard deviation, both computed along axis 0.

    Args:
        data: Array of samples; statistics are taken per column (axis 0).

    Returns:
        Array of the same shape as ``data`` holding ``(x - mean) / std``.
        Columns whose standard deviation is exactly 0 are divided by 1
        instead, so they come back as all zeros rather than NaN/inf.
    """
    center = np.mean(data, axis=0)
    scale = np.std(data, axis=0)

    # A constant column has zero spread; fall back to a divisor of 1 so the
    # division below never divides by zero.
    scale = np.where(scale != 0, scale, 1)

    return (data - center) / scale

# Standardize every feature column of X (per-column z-score).
X_normalized = z_score_normalization(X)


# TODO 2: Handle outliers - clip values di luar ±3 std
def handle_outliers(data, std_threshold=3):
    """Clip values that lie too far from their column mean.

    For each column (axis 0) the allowed range is
    ``mean ± std_threshold * std``; anything outside that range is replaced
    by the nearest bound.

    Args:
        data: Array of samples; statistics are taken per column (axis 0).
        std_threshold: Half-width of the allowed range, expressed in
            standard deviations. Defaults to 3.

    Returns:
        Array of the same shape as ``data`` with out-of-range values clipped.
    """
    center = np.mean(data, axis=0)
    margin = std_threshold * np.std(data, axis=0)

    # np.clip broadcasts the per-column bounds across all rows.
    return np.clip(data, center - margin, center + margin)

# Clip the standardized data using the default threshold of 3 standard deviations.
X_cleaned = handle_outliers(X_normalized)


# TODO 3: One-hot encoding untuk label kategorikal
def one_hot_encoding(labels):
    """One-hot encode a collection of categorical labels.

    Args:
        labels: Array-like of labels (NumPy array, list, tuple, ...).
            Multi-dimensional input is flattened, so each element becomes
            one row of the result.

    Returns:
        Float array of shape ``(n_labels, n_classes)``. Row ``i`` contains a
        single 1 in the column of ``labels[i]``'s class and 0 elsewhere.
        Columns follow the sorted order of the unique label values.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    # Coerce to a flat ndarray so plain Python sequences work too and the
    # inverse indices returned by np.unique are guaranteed to be 1-D.
    labels = np.asarray(labels).ravel()
    if labels.size == 0:
        raise ValueError("Labels tidak boleh kosong.")

    # `encoded[i]` is the index of labels[i] within the sorted unique classes.
    classes, encoded = np.unique(labels, return_inverse=True)
    one_hot = np.zeros((labels.size, classes.size))
    # Fancy indexing sets exactly one entry per row.
    one_hot[np.arange(labels.size), encoded] = 1
    return one_hot

# Small example label vector containing three distinct classes (0, 1, 2).
labels = np.array([0, 1, 2, 0, 1, 2, 0])
one_hot_labels = one_hot_encoding(labels)    # resulting one-hot matrix


# TODO 4: Train-test split manual
def train_test_split_numpy(X, y, test_size=0.2):
    """Randomly split features and labels into train and test subsets.

    The sample indices are shuffled with ``np.random.permutation`` (so the
    result depends on NumPy's global random state); the first
    ``int(n_samples * test_size)`` shuffled indices form the test set and
    the remainder form the training set. Because the test count is
    truncated, very small inputs can yield an empty test set.

    Args:
        X: Array-like of features whose first axis indexes samples.
        y: Array-like of labels with the same number of samples as ``X``.
        test_size: Fraction of samples assigned to the test set; must be
            strictly between 0 and 1. Defaults to 0.2.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        ValueError: If ``test_size`` is not strictly between 0 and 1, or if
            ``X`` and ``y`` do not contain the same number of samples.
    """
    if not (0 < test_size < 1):
        raise ValueError("test_size harus antara 0 dan 1.")

    # Coerce to ndarrays so plain sequences support index-array selection.
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = X.shape[0]
    # Without this check a longer y would be silently truncated by the
    # index selection and a shorter y would fail with an opaque IndexError.
    if y.shape[0] != n_samples:
        raise ValueError(
            "X dan y harus memiliki jumlah sampel yang sama "
            f"(X: {n_samples}, y: {y.shape[0]})."
        )

    n_test = int(n_samples * test_size)  # truncated test-set size

    indices = np.random.permutation(n_samples)  # shuffled sample indices
    test_idx = indices[:n_test]
    train_idx = indices[n_test:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

# Example random labels (integers 0-2, one per sample) used for the split
y = np.random.randint(0, 3, 100)
X_train, X_test, y_train, y_test = train_test_split_numpy(X, y, test_size=0.2) # split result


# ✅ Validate the results
assert X_normalized.shape == X.shape, "Shape harus tetap sama"
assert np.allclose(X_normalized.mean(axis=0), 0, atol=1e-10), "Mean harus ~0 setelah z-score"
assert np.allclose(X_normalized.std(axis=0), 1, atol=1e-10), "Std harus ~1 setelah z-score"
assert len(X_train) + len(X_test) == len(X), "Jumlah data harus konsisten"

# Concise summary output
print("Original data shape:", X.shape)
print("Normalized data mean:", X_normalized.mean(axis=0).round(3))
print("Normalized data std:", X_normalized.std(axis=0).round(3))
print("One-hot labels shape:", one_hot_labels.shape)
print("One-hot labels:\n", one_hot_labels)
print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)
print("✅ NumPy operations completed")


Original data shape: (100, 5)
Normalized data mean: [-0.  0. -0. -0.  0.]
Normalized data std: [1. 1. 1. 1. 1.]
One-hot labels shape: (7, 3)
One-hot labels:
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
Train shape: (80, 5) | Test shape: (20, 5)
✅ NumPy operations completed
