# **1. Perkenalan Dataset**


Tahap pertama, Anda harus mencari dan menggunakan dataset dengan ketentuan sebagai berikut:

1. **Sumber Dataset**:  
   Dataset dapat diperoleh dari berbagai sumber, seperti public repositories (*Kaggle*, *UCI ML Repository*, *Open Data*) atau data primer yang Anda kumpulkan sendiri.


# **2. Import Library**

Pada tahap ini, Anda perlu mengimpor beberapa pustaka (library) Python yang dibutuhkan untuk analisis data dan pembangunan model machine learning atau deep learning.

In [None]:
# Import library untuk data manipulation dan analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import library untuk preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Import library untuk modeling
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Import library untuk evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Import library untuk saving model
import joblib
import pickle
import os

# Notebook-wide plotting defaults: apply seaborn's "whitegrid" theme first,
# then set the default matplotlib figure size to 12x6 inches.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("âœ… All libraries imported successfully!")

# **3. Memuat Dataset**

Pada tahap ini, Anda perlu memuat dataset ke dalam notebook. Jika dataset dalam format CSV, Anda bisa menggunakan pustaka pandas untuk membacanya. Pastikan untuk mengecek beberapa baris awal dataset untuk memahami strukturnya dan memastikan data telah dimuat dengan benar.

Jika dataset berada di Google Drive, pastikan Anda menghubungkan Google Drive ke Colab terlebih dahulu. Setelah dataset berhasil dimuat, langkah berikutnya adalah memeriksa kesesuaian data dan siap untuk dianalisis lebih lanjut.

Jika dataset berupa unstructured data, silakan sesuaikan dengan format seperti kelas Machine Learning Pengembangan atau Machine Learning Terapan

In [None]:
# Download the red-wine file of the Wine Quality dataset from the UCI ML Repository.
# The dataset describes wine quality in terms of physicochemical characteristics.

# Load dataset (the file is read with ';' as the field separator)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, sep=';')

# Keep an untouched copy of the raw data on disk as a backup
os.makedirs('data', exist_ok=True)
df.to_csv('data/winequality_raw.csv', index=False)

print("Dataset Shape:", df.shape)
print("\n" + "="*50)
print("Dataset Info:")
print("="*50)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\n" + "="*50)
print("First 5 rows:")
print("="*50)
display(df.head())
print("\nâœ… Dataset loaded successfully!")

# **4. Exploratory Data Analysis (EDA)**

Pada tahap ini, Anda akan melakukan **Exploratory Data Analysis (EDA)** untuk memahami karakteristik dataset.

Tujuan dari EDA adalah untuk memperoleh wawasan awal yang mendalam mengenai data dan menentukan langkah selanjutnya dalam analisis atau pemodelan.

In [None]:
# ========================================
# 4.1 Basic dataset information
# ========================================
section_rule = "=" * 60  # horizontal rule reused by every banner in this section

print(section_rule)
print("INFORMASI DASAR DATASET")
print(section_rule)
print(f"Jumlah Baris: {df.shape[0]}")
print(f"Jumlah Kolom: {df.shape[1]}")
print(f"\nNama Kolom:\n{df.columns.tolist()}")
print(f"\nTipe Data:\n{df.dtypes}")

# ========================================
# 4.2 Descriptive statistics
# ========================================
print("\n" + section_rule)
print("STATISTIK DESKRIPTIF")
print(section_rule)
display(df.describe())

# ========================================
# 4.3 Missing-value check
# ========================================
print("\n" + section_rule)
print("MISSING VALUES")
print(section_rule)
total_rows = len(df)
missing_values = df.isnull().sum()                      # per-column count of nulls
missing_percentage = 100 * (missing_values / total_rows)  # same counts as % of all rows
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Percentage': missing_percentage
})

# Show only the columns that actually contain nulls (an empty frame if none do)
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing])
if not has_missing.any():
    print("âœ… Tidak ada missing values!")

# ========================================
# 4.4 Duplicate-row check
# ========================================
print("\n" + section_rule)
print("DUPLIKASI DATA")
print(section_rule)
# Rows that are exact repeats of an earlier row
duplicates = df.duplicated().sum()
print(f"Jumlah data duplikat: {duplicates}")
print(f"Persentase duplikasi: {(duplicates/len(df)*100):.2f}%")

# ========================================
# 4.5 Target variable distribution (quality)
# ========================================
print("\n" + "="*60)
print("DISTRIBUSI TARGET VARIABLE (QUALITY)")
print("="*60)

# Rows per quality score, ordered by score; computed once and reused for the
# printout and both charts below.
quality_counts = df['quality'].value_counts().sort_index()
print(quality_counts)

plt.figure(figsize=(12, 5))

# Left panel: absolute frequency per score
plt.subplot(1, 2, 1)
quality_counts.plot.bar(color='skyblue', edgecolor='black')
plt.title('Distribusi Kualitas Wine', fontsize=14, fontweight='bold')
plt.xlabel('Quality Score')
plt.ylabel('Frequency')
plt.xticks(rotation=0)

# Right panel: share of each score
plt.subplot(1, 2, 2)
quality_counts.plot.pie(autopct='%1.1f%%', startangle=90)
plt.title('Proporsi Kualitas Wine', fontsize=14, fontweight='bold')
plt.ylabel('')  # drop the automatic y-axis label on the pie chart
plt.tight_layout()
plt.show()

# ========================================
# 4.6 Distribution of the numeric features
# ========================================
print("\n" + "="*60)
print("DISTRIBUSI FITUR NUMERIK")
print("="*60)

# One histogram per column of df
histogram_options = dict(bins=30, figsize=(20, 15), edgecolor='black', color='lightblue')
df.hist(**histogram_options)
plt.suptitle('Distribusi Semua Fitur', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

# ========================================
# 4.7 Correlation matrix
# ========================================
print("\n" + "="*60)
print("CORRELATION MATRIX")
print("="*60)

# Pairwise correlation between all columns of df
correlation_matrix = df.corr()

heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Wine Quality Dataset', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Correlation of every column with the target, largest (signed) value first
print("\nKorelasi dengan Quality (diurutkan):")
quality_corr = correlation_matrix['quality'].sort_values(ascending=False)
print(quality_corr)

# ========================================
# 4.8 Outlier detection with boxplots
# ========================================
print("\n" + "="*60)
print("OUTLIER DETECTION")
print("="*60)

features = df.columns[:-1]  # every column except the last one (quality)

# Size the grid from the number of features instead of hard-coding 4x3, so the
# loop below can never index past the available axes. With 11 features this
# still yields the original 4x3 grid and 18x16 inch figure.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(features) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 4 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for idx, col in enumerate(features):
    axes[idx].boxplot(df[col], vert=True, patch_artist=True,
                     boxprops=dict(facecolor='lightblue', color='black'),
                     medianprops=dict(color='red', linewidth=2))
    axes[idx].set_title(f'{col}', fontweight='bold')
    axes[idx].set_ylabel('Value')
    axes[idx].grid(True, alpha=0.3)

# Hide grid cells that received no feature (previously an empty 12th panel
# was drawn next to the 11 boxplots).
for unused_ax in axes[len(features):]:
    unused_ax.set_visible(False)

plt.suptitle('Boxplot untuk Deteksi Outlier', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# ========================================
# 4.9 Relationship between features and quality
# ========================================
print("\n" + "="*60)
print("RELATIONSHIP FITUR DENGAN QUALITY")
print("="*60)

# Pick the 4 features most strongly correlated with quality.
# Strength is measured by absolute value, so strongly *negative* relationships
# are selected as well; ranking by the signed value silently skipped them.
# The target's self-correlation is dropped by label rather than by assuming it
# sits at position 0 of the sorted series.
feature_corr = quality_corr.drop('quality')
top_features = feature_corr.abs().sort_values(ascending=False).head(4).index.tolist()

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

for idx, feature in enumerate(top_features):
    # Points are coloured by their quality score; the title reports the signed correlation
    axes[idx].scatter(df[feature], df['quality'], alpha=0.5, c=df['quality'],
                     cmap='viridis', edgecolors='black', linewidth=0.5)
    axes[idx].set_xlabel(feature, fontweight='bold')
    axes[idx].set_ylabel('Quality', fontweight='bold')
    axes[idx].set_title(f'{feature} vs Quality\n(Correlation: {quality_corr[feature]:.3f})',
                       fontweight='bold')
    axes[idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nâœ… Exploratory Data Analysis completed!")

# **5. Data Preprocessing**

Pada tahap ini, data preprocessing adalah langkah penting untuk memastikan kualitas data sebelum digunakan dalam model machine learning.

Jika Anda menggunakan data teks, data mentah sering kali mengandung nilai kosong, duplikasi, atau rentang nilai yang tidak konsisten, yang dapat memengaruhi kinerja model. Oleh karena itu, proses ini bertujuan untuk membersihkan dan mempersiapkan data agar analisis berjalan optimal.

Berikut adalah tahapan-tahapan yang bisa dilakukan, tetapi **tidak terbatas** pada:
1. Menghapus atau Menangani Data Kosong (Missing Values)
2. Menghapus Data Duplikat
3. Normalisasi atau Standarisasi Fitur
4. Deteksi dan Penanganan Outlier
5. Encoding Data Kategorikal
6. Binning (Pengelompokan Data)

Cukup sesuaikan dengan karakteristik data yang kamu gunakan ya, khususnya ketika kamu menggunakan data tidak terstruktur.

In [None]:
# ========================================
# 5.1 Handling duplicate data
# ========================================
print("="*60)
print("STEP 1: HANDLING DUPLICATE DATA")
print("="*60)
print(f"Jumlah data sebelum menghapus duplikat: {len(df)}")
# Take an explicit copy: a new column is assigned to df_clean later on, and an
# independent frame makes that assignment unambiguous (no chained-assignment
# warning, no dependence on whether pandas returned a view of df).
df_clean = df.drop_duplicates().copy()
print(f"Jumlah data setelah menghapus duplikat: {len(df_clean)}")
print(f"Jumlah duplikat yang dihapus: {len(df) - len(df_clean)}")

# ========================================
# 5.2 Feature engineering - binning quality
# ========================================
print("\n" + "="*60)
print("STEP 2: FEATURE ENGINEERING - BINNING QUALITY")
print("="*60)
# Convert the quality score into 3 ordinal categories:
# Low (<= 5), Medium (<= 6), High (> 6)
def categorize_quality(quality, low_max=5, medium_max=6):
    """Map a numeric quality score to an ordinal category code.

    Args:
        quality: Numeric quality score.
        low_max: Highest score still counted as Low. Defaults to 5.
        medium_max: Highest score still counted as Medium. Defaults to 6.

    Returns:
        0 for Low (``quality <= low_max``), 1 for Medium
        (``low_max < quality <= medium_max``), 2 for High (anything above).

    Note:
        The Medium bin is a range check rather than ``quality == 6``; with the
        equality test a non-integer score such as 5.5 fell through to High.
        A NaN score compares false against both thresholds and is therefore
        returned as 2; drop or impute missing scores before calling.
    """
    if quality <= low_max:
        return 0  # Low
    if quality <= medium_max:
        return 1  # Medium
    return 2  # High

# Derive the binned target column from the raw quality score
df_clean['quality_category'] = df_clean['quality'].apply(categorize_quality)

# Rows per category, ordered by category code; reused for the printout and the bar chart
category_counts = df_clean['quality_category'].value_counts().sort_index()

print("Distribusi Quality Category:")
print(category_counts)
print("\nMapping:")
for mapping_line in (
    "0 = Low Quality (3-5)",
    "1 = Medium Quality (6)",
    "2 = High Quality (7-8)",
):
    print(mapping_line)

# Visualisation: original score distribution next to the binned distribution
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
df_clean['quality'].hist(bins=6, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Distribusi Quality (Original)', fontweight='bold')
plt.xlabel('Quality Score')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
category_counts.plot.bar(color='coral', edgecolor='black')
plt.title('Distribusi Quality Category (Binned)', fontweight='bold')
plt.xlabel('Category (0=Low, 1=Medium, 2=High)')
plt.ylabel('Frequency')
plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

# ========================================
# 5.3 Handling outliers with the IQR method
# ========================================
print("\n" + "="*60)
print("STEP 3: HANDLING OUTLIERS (IQR METHOD)")
print("="*60)

def remove_outliers_iqr(data, columns):
    """Drop rows falling outside the 1.5*IQR fences of the given columns.

    Columns are processed in the order given, and each column's fences are
    computed on the rows that survived the previous columns, so the result
    depends on column order. The input frame is not modified.

    Args:
        data: DataFrame to filter.
        columns: Iterable of column names to check, in processing order.

    Returns:
        A tuple ``(filtered, removed)`` where ``filtered`` is the reduced
        DataFrame and ``removed`` maps each column name to the number of rows
        dropped while processing that column.
    """
    filtered = data.copy()
    removed_counts = {}

    for name in columns:
        # Quartiles of the rows still present at this point
        q1, q3 = filtered[name].quantile([0.25, 0.75])
        spread = q3 - q1
        low_fence, high_fence = q1 - 1.5 * spread, q3 + 1.5 * spread

        rows_before = len(filtered)
        # Fences are inclusive: values exactly on a fence are kept
        within_fences = (filtered[name] >= low_fence) & (filtered[name] <= high_fence)
        filtered = filtered[within_fences]
        removed_counts[name] = rows_before - len(filtered)

    return filtered, removed_counts

# Columns to clean of outliers: everything except the last two columns
# (quality and quality_category)
numeric_cols = df_clean.columns[:-2].tolist()

print(f"Jumlah data sebelum menghapus outlier: {len(df_clean)}")
df_no_outliers, outliers_info = remove_outliers_iqr(df_clean, numeric_cols)
print(f"Jumlah data setelah menghapus outlier: {len(df_no_outliers)}")
print(f"\nOutliers dihapus per kolom:")
for col, count in outliers_info.items():
    # Only report columns that actually lost rows
    if count <= 0:
        continue
    print(f"  - {col}: {count} outliers")

# ========================================
# 5.4 Feature scaling - standardization
# ========================================
print("\n" + "="*60)
print("STEP 4: FEATURE SCALING (STANDARDIZATION)")
print("="*60)

# Separate features and target
X = df_no_outliers.drop(['quality', 'quality_category'], axis=1)
y = df_no_outliers['quality_category']

print(f"Shape of Features (X): {X.shape}")
print(f"Shape of Target (y): {y.shape}")
print(f"\nTarget distribution:\n{y.value_counts().sort_index()}")

# Split BEFORE fitting the scaler. Fitting StandardScaler on the full dataset
# lets the test rows influence the mean/std used to transform the training
# rows (data leakage), which makes test-set scores optimistically biased.
# Row membership of each split depends only on y, test_size and random_state,
# so the same rows land in train/test as when the split was done afterwards.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardization: learn mean/std from the training rows only, then apply the
# same transformation to every split.
scaler = StandardScaler()
scaler.fit(X_train_raw)

X_train = pd.DataFrame(scaler.transform(X_train_raw),
                       columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw),
                      columns=X.columns, index=X_test_raw.index)

# Whole dataset transformed with the train-fitted scaler; used only for the
# descriptive statistics and plots below.
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

print("\nâœ… Features scaled successfully!")
print("\nStatistik setelah scaling:")
display(X_scaled_df.describe())

# Visual comparison before and after scaling
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Before scaling
axes[0].boxplot([X[col] for col in X.columns], labels=X.columns, patch_artist=True,
                boxprops=dict(facecolor='lightblue'))
axes[0].set_title('Before Scaling', fontweight='bold', fontsize=14)
axes[0].set_ylabel('Value')
axes[0].tick_params(axis='x', rotation=90)
axes[0].grid(True, alpha=0.3)

# After scaling
axes[1].boxplot([X_scaled_df[col] for col in X_scaled_df.columns], labels=X_scaled_df.columns,
                patch_artist=True, boxprops=dict(facecolor='lightgreen'))
axes[1].set_title('After Scaling', fontweight='bold', fontsize=14)
axes[1].set_ylabel('Standardized Value')
axes[1].tick_params(axis='x', rotation=90)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# ========================================
# 5.5 Train-test split
# ========================================
print("\n" + "="*60)
print("STEP 5: TRAIN-TEST SPLIT")
print("="*60)

# The split itself was performed in step 4 (before scaling); this step only
# reports the resulting partition.
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print(f"\nTarget distribution in training set:\n{y_train.value_counts().sort_index()}")
print(f"\nTarget distribution in testing set:\n{y_test.value_counts().sort_index()}")

# ========================================
# 5.6 Save preprocessed data
# ========================================
print("\n" + "="*60)
print("STEP 6: SAVE PREPROCESSED DATA")
print("="*60)

os.makedirs('data/preprocessed', exist_ok=True)

# Re-attach the target column to each feature split before writing it out
train_data = pd.concat([X_train, y_train], axis='columns')
test_data = pd.concat([X_test, y_test], axis='columns')

for frame, csv_path in (
    (train_data, 'data/preprocessed/train_data.csv'),
    (test_data, 'data/preprocessed/test_data.csv'),
):
    frame.to_csv(csv_path, index=False)

# Persist the fitted scaler alongside the data
joblib.dump(scaler, 'data/preprocessed/scaler.pkl')

print("âœ… Preprocessed data saved successfully!")
print(f"  - Train data: data/preprocessed/train_data.csv")
print(f"  - Test data: data/preprocessed/test_data.csv")
print(f"  - Scaler: data/preprocessed/scaler.pkl")

# ========================================
# 5.7 Summary
# ========================================
print("\n" + "="*60)
print("PREPROCESSING SUMMARY")
print("="*60)
# Row counts after each preprocessing stage, followed by the final split sizes
summary_lines = [
    f"âœ… Original data: {len(df)} samples",
    f"âœ… After removing duplicates: {len(df_clean)} samples",
    f"âœ… After removing outliers: {len(df_no_outliers)} samples",
    f"âœ… Training samples: {len(X_train)}",
    f"âœ… Testing samples: {len(X_test)}",
    f"âœ… Number of features: {X_train.shape[1]}",
    f"âœ… Number of classes: {len(y.unique())}",
]
print("\n".join(summary_lines))
print("\nðŸŽ‰ Data Preprocessing Completed Successfully!")