## Load Data

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Load dataset wbc.csv
df = pd.read_csv('../Dataset/wbc.csv')

print("Dataset Wisconsin Breast Cancer (WBC)")
print(df['diagnosis'].value_counts())

print("="*60)
print("\nDistribusi Diagnosis:")

print(df.head())
print(df.describe())

print("\nInformasi Dataset:")
print("\nDeskripsi Statistik:")
print(df.info())

Dataset Wisconsin Breast Cancer (WBC)
diagnosis
B    357
M    212
Name: count, dtype: int64

Distribusi Diagnosis:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030       

## Tugas 1: Pisahkan Variabel

Pisahkan antara variabel yang dapat digunakan dan variabel yang tidak dapat digunakan.

In [11]:
print("\n" + "="*60)
print("TUGAS 1: Pisahkan Variabel")
print("="*60)

print(f"\nShape sebelum drop: {df.shape}")
print("\nKolom yang tersedia:")
print(df.columns.tolist())

# Drop kolom id
df_clean = df.drop('id', axis=1)

print(f"\nShape setelah drop: {df_clean.shape}")
print(f"\nKolom yang tersisa ({len(df_clean.columns)}):")
print(df_clean.columns.tolist())

print("\nVariabel yang TIDAK digunakan: 'id' (hanya identifier)")
print("Variabel yang DIGUNAKAN: semua kolom kecuali 'id'")


TUGAS 1: Pisahkan Variabel

Shape sebelum drop: (569, 33)

Kolom yang tersedia:
['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32']

Shape setelah drop: (569, 32)

Kolom yang tersisa (32):
['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness

## Tugas 2: Encoding Diagnosis

Lakukan proses encoding pada kolom "diagnosis".

In [12]:
print("\n" + "="*60)
print("TUGAS 2: Encoding Diagnosis")
print("="*60)

print("\nSebelum encoding:")
print(df_clean['diagnosis'].value_counts())

# Encoding kolom diagnosis (M=1 untuk Malignant, B=0 untuk Benign)
le = LabelEncoder()
df_clean['diagnosis_encoded'] = le.fit_transform(df_clean['diagnosis'])

print("\nVerifikasi encoding:")
print(df_clean[['diagnosis', 'diagnosis_encoded']].head(10))

print("\nMapping encoding:")
for original, encoded in zip(le.classes_, le.transform(le.classes_)):
    print(f"  {original} -> {encoded}")

print("\nSetelah encoding:")
print(df_clean['diagnosis_encoded'].value_counts())

# Drop kolom diagnosis asli, gunakan yang encoded
df_encoded = df_clean.drop('diagnosis', axis=1)
df_encoded = df_encoded.rename(columns={'diagnosis_encoded': 'diagnosis'})

print(f"\nShape setelah encoding: {df_encoded.shape}")


TUGAS 2: Encoding Diagnosis

Sebelum encoding:
diagnosis
B    357
M    212
Name: count, dtype: int64

Verifikasi encoding:
  diagnosis  diagnosis_encoded
0         M                  1
1         M                  1
2         M                  1
3         M                  1
4         M                  1
5         M                  1
6         M                  1
7         M                  1
8         M                  1
9         M                  1

Mapping encoding:
  B -> 0
  M -> 1

Setelah encoding:
diagnosis_encoded
0    357
1    212
Name: count, dtype: int64

Shape setelah encoding: (569, 32)


## Tugas 3: Standarisasi

Lakukan proses standarisasi pada semua kolom yang memiliki nilai numerik.

In [13]:
print("\n" + "="*60)
print("TUGAS 3: Standarisasi")
print("="*60)

# Pisahkan fitur dan target
X = df_encoded.drop('diagnosis', axis=1)
y = df_encoded['diagnosis']

print(f"\nJumlah fitur numerik yang akan distandardisasi: {X.shape[1]}")

print("\nContoh nilai sebelum standarisasi:")
print(X.head())

print("\nStatistik sebelum standarisasi:")
print(X.describe().loc[['mean', 'std']].T.head())

# Standarisasi semua kolom numerik
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Konversi kembali ke DataFrame untuk kemudahan analisis
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

print("\nContoh nilai setelah standarisasi:")
print(X_scaled_df.head())

print("\nStatistik setelah standarisasi (mean ~0, std ~1):")
print(X_scaled_df.describe().loc[['mean', 'std']].T.head())

print("\nVerifikasi standarisasi:")
print(f"  Mean mendekati 0: {np.allclose(X_scaled_df.mean(), 0, atol=1e-10)}")
print(f"  Std mendekati 1: {np.allclose(X_scaled_df.std(), 1, atol=1e-10)}")


TUGAS 3: Standarisasi

Jumlah fitur numerik yang akan distandardisasi: 31

Contoh nilai sebelum standarisasi:
   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430 

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


## Tugas 4: Stratified Split

Lakukan proses stratified split data untuk membuat data latih dan data uji dengan rasio 80:20.

In [14]:
print("\n" + "="*60)
print("TUGAS 4: Stratified Split")
print("="*60)

print("RINGKASAN PREPROCESSING")
print(f"1. Dataset dimuat: {df.shape[0]} sampel, {df.shape[1]} kolom")
print(f"2. Kolom 'id' dihapus")
print(f"3. Kolom 'diagnosis' diencode: M=1, B=0")
print(f"4. {X.shape[1]} fitur numerik distandardisasi (mean=0, std=1)")
print(f"5. Data split 80:20 dengan stratified sampling")

# Stratified split dengan rasio 80:20
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_df, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTotal sampel: {len(X_scaled_df)}")
print(f"Train set: {len(X_train)} sampel ({len(X_train)/len(X_scaled_df)*100:.1f}%)")
print(f"Test set: {len(X_test)} sampel ({len(X_test)/len(X_scaled_df)*100:.1f}%)")

print("\nDistribusi kelas di dataset asli:")
print(y.value_counts())
print(f"Proporsi: {y.value_counts(normalize=True).values}")

print("\nVerifikasi stratified split:")
print("\nDistribusi kelas di train set")
print(y_train.value_counts())
print(f"Proporsi: {y_train.value_counts(normalize=True).values}")

print("\nDistribusi kelas di test set")
print(y_test.value_counts())
print(f"Proporsi: {y_test.value_counts(normalize=True).values}")

print("\nData siap untuk modeling!")
print(f"   - Train: {len(X_train)} sampel")
print(f"   - Test: {len(X_test)} sampel")


TUGAS 4: Stratified Split
RINGKASAN PREPROCESSING
1. Dataset dimuat: 569 sampel, 33 kolom
2. Kolom 'id' dihapus
3. Kolom 'diagnosis' diencode: M=1, B=0
4. 31 fitur numerik distandardisasi (mean=0, std=1)
5. Data split 80:20 dengan stratified sampling

Total sampel: 569
Train set: 455 sampel (80.0%)
Test set: 114 sampel (20.0%)

Distribusi kelas di dataset asli:
diagnosis
0    357
1    212
Name: count, dtype: int64
Proporsi: [0.62741652 0.37258348]

Verifikasi stratified split:

Distribusi kelas di train set
diagnosis
0    285
1    170
Name: count, dtype: int64
Proporsi: [0.62637363 0.37362637]

Distribusi kelas di test set
diagnosis
0    72
1    42
Name: count, dtype: int64
Proporsi: [0.63157895 0.36842105]

Data siap untuk modeling!
   - Train: 455 sampel
   - Test: 114 sampel
