# **SIC7 | Smart Wellness And Study Companion**

Group Name: 20 bulan mei

Group Members:
- Guntara Hambali
- Muh. Rusmin Nurwadin
- Nahiza Hazim Valensi M
- Reza Ahmad Syarif

# **Import Library**

In [None]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Machine-learning algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Preprocessing & utilities
from sklearn.base import clone
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

# Display configuration
sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# **Load Dataset**

In [None]:
# 1. Load the dataset.
df = pd.read_csv("../data/train_dummy.csv")

# --- AUTO-FIX: synthesize a placeholder 'Pakaian' column when the CSV lacks one ---
if 'Pakaian' not in df.columns:
    print("⚠️ Kolom 'Pakaian' tidak ditemukan. Menambahkan data dummy...")
    # A local RandomState(42) draws exactly the values that seeding the global
    # generator would, but leaves NumPy's global random state untouched.
    dummy_rng = np.random.RandomState(42)
    df['Pakaian'] = dummy_rng.choice(['Tipis', 'Sedang', 'Tebal'], size=len(df))

# Numeric and categorical feature columns, plus the target column.
numeric_features = ['Temperature (C)', 'Humidity (%)', 'Light (Lux)']
categorical_features = ['Pakaian']
target_col = 'Label'

# Fail fast with a readable message when the CSV lacks an expected column,
# instead of a KeyError deep inside a later plotting or training cell.
missing_columns = [
    col for col in numeric_features + categorical_features + [target_col]
    if col not in df.columns
]
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

print(f"Jumlah Data: {df.shape[0]} baris, {df.shape[1]} kolom")
display(df.head())

# **Exploratory Data Analysis (EDA)**

In [None]:
# One 2x2 figure holding four data-quality diagnostics. Explicit Axes objects
# are used so every plot call names the panel it draws on.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle("Exploratory Data Analysis (EDA) - Diagnosis Data", fontsize=16)
ax_missing, ax_skew, ax_classes, ax_corr = axes.ravel()

# 1. MISSING VALUES: heatmap showing where the empty cells are.
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax_missing)
missing_count = df.isnull().sum().sum()
ax_missing.set_title(f"1. Missing Values Heatmap (Total Missing: {missing_count})")
ax_missing.set_xlabel("Features")
ax_missing.set_ylabel("Index Data")

# 2. SKEWNESS: distribution of one example numeric feature (temperature).
sns.histplot(df['Temperature (C)'], kde=True, color='skyblue', ax=ax_skew)
skew_val = df['Temperature (C)'].skew()
ax_skew.set_title(f"2. Cek Skewness: Temperature (Skew: {skew_val:.2f})\n(Idealnya -0.5 s/d 0.5)")

# 3. CLASS IMBALANCE: number of rows per target label.
ax = sns.countplot(x=target_col, data=df, palette='pastel', ax=ax_classes)
ax.set_title("3. Class Imbalance Check")
for p in ax.patches:
    # Bar heights are floats; print them as whole counts and centre the text
    # on the bar instead of relying on fixed x/y offsets.
    ax.annotate(
        f'{p.get_height():.0f}',
        (p.get_x() + p.get_width() / 2, p.get_height()),
        ha='center',
        va='bottom',
    )

# 4. FEATURE CORRELATION (optional but useful): numeric features only.
corr = df[numeric_features].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title("4. Correlation Matrix")

fig.tight_layout()
plt.show()

# Text summary: skewness of every numeric feature plus reading notes.
print("\n--- Analisis Skewness Lengkap ---")
print(df[numeric_features].skew())
print("\nCatatan:")
print("- Jika Skewness tinggi (>1 atau <-1), kita perlu 'PowerTransformer'.")
print("- Jika Label tidak seimbang, kita perlu 'class_weight=balanced'.")
print("- Jika ada garis kuning di Heatmap (kiri atas), itu missing values yang harus di-impute.")

# **Splitting Data**

In [None]:
# Rows without a target label cannot be used for supervised training, and a
# NaN mixed into the label column would break (or pollute) the label encoding.
# Work on a filtered view so the original `df` stays untouched.
labeled_df = df.dropna(subset=[target_col])
n_unlabeled = len(df) - len(labeled_df)
if n_unlabeled:
    print(f"{n_unlabeled} baris tanpa label target dibuang sebelum training.")

X = labeled_df[numeric_features + categorical_features]
y = labeled_df[target_col]

# Encode the target labels as integers (required by XGBoost).
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# LabelEncoder assigns each entry of `classes_` its own index, so enumerate
# reproduces the mapping with plain Python ints (clean to print).
label_mapping = {label: code for code, label in enumerate(le.classes_)}
print(f"Target Mapping: {label_mapping}")

# Stratified split keeps the class proportions equal in train and test,
# which matters for imbalanced labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print(f"Data Train: {X_train.shape}")
print(f"Data Test : {X_test.shape}")

# **Preprocessing Pipeline**

In [None]:
# Numeric branch: handle missing values and skewness, then scale.
numeric_steps = [
    # Fill gaps with the column median.
    ('imputer', SimpleImputer(strategy='median')),
    # Yeo-Johnson power transform pulls skewed features toward a normal shape.
    ('yeo_johnson', PowerTransformer(method='yeo-johnson')),
    # Scaling matters for distance/margin based models (SVM, KNN).
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch and combine the outputs.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

print("Pipeline Preprocessing Siap!")

# **Modeling and Validation**

In [None]:
# Candidate classifiers. class_weight='balanced' is used wherever the
# estimator supports it to compensate for label imbalance.
models = {
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42, n_jobs=-1),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "SVM": SVC(class_weight='balanced', kernel='rbf', probability=True, random_state=42),
    # KNN has no class_weight parameter; the scaled features still help it.
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(class_weight='balanced', random_state=42)
}

results = []
best_score = 0
best_model_name = ""
best_pipeline = None

print("Mulai Training Model...\n")

# NOTE(review): the winner is picked by its score on the same test split that
# is reported afterwards, so the reported score is optimistic. Consider
# selecting with cross-validation on X_train instead (cross_val_score and
# StratifiedKFold are already imported).
for name, model in models.items():
    # Each pipeline gets its own unfitted copy of the preprocessor. Sharing a
    # single instance would let every later fit() refit the object that is
    # also inside the previously stored best pipeline.
    clf = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                          ('classifier', model)])

    # Train on the training split.
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = clf.predict(X_test)

    # Weighted F1 is the headline metric because plain accuracy can be
    # misleading on imbalanced labels.
    f1 = f1_score(y_test, y_pred, average='weighted')
    acc = accuracy_score(y_test, y_pred)

    results.append({'Model': name, 'Accuracy': acc, 'F1-Score': f1})

    print(f"✅ {name}: F1-Score = {f1:.4f}")

    # Keep the best pipeline so far. The `is None` check guarantees a model
    # is stored even if every F1 score is 0.
    if best_pipeline is None or f1 > best_score:
        best_score = f1
        best_model_name = name
        best_pipeline = clf

print(f"\n🏆 Model Terbaik: {best_model_name} (F1: {best_score:.4f})")

# **Evaluation**

In [None]:
# 1. Comparison table, highest F1-Score first.
results_df = pd.DataFrame(results).sort_values(by='F1-Score', ascending=False)
display(results_df)

# 2. Horizontal bar chart of F1-Score per model.
fig_scores, ax_scores = plt.subplots(figsize=(10, 5))
sns.barplot(x='F1-Score', y='Model', data=results_df, palette='viridis', ax=ax_scores)
ax_scores.set_title('Perbandingan Model (F1-Score)')
ax_scores.set_xlim(0, 1.05)
plt.show()

# 3. Detailed report for the winning pipeline.
print(f"\n--- Evaluasi Detail Model Terbaik: {best_model_name} ---")
y_pred_best = best_pipeline.predict(X_test)

# Map the integer codes back to the original string labels for readability.
y_test_labels = le.inverse_transform(y_test)
y_pred_labels = le.inverse_transform(y_pred_best)

print(classification_report(y_test_labels, y_pred_labels))

# Confusion matrix, with rows and columns in the encoder's class order.
fig_cm, ax_cm = plt.subplots(figsize=(6, 5))
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=le.classes_)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax_cm,
)
ax_cm.set_title(f'Confusion Matrix ({best_model_name})')
ax_cm.set_ylabel('Actual')
ax_cm.set_xlabel('Predicted')
plt.show()

# **Save Model**

In [None]:
# Destination file for the serialized bundle (relative to the working directory).
filename = "best_model_sic7.pkl"

# Refuse to write an artifact that contains no trained model.
if best_pipeline is None:
    raise RuntimeError("No trained model is available to save; run the training cell first.")

# Look the winner's metrics up by name. Taking the first row of the sorted
# table is not reliable: when two models tie on F1-Score the sort may put a
# different model on top than the one kept as `best_pipeline`.
best_metrics = results_df.loc[results_df['Model'] == best_model_name].iloc[0].to_dict()

# Everything needed at inference time: the fitted pipeline (preprocessing +
# classifier), the class labels for decoding predictions, and provenance.
artifacts = {
    'model': best_pipeline,
    'classes': le.classes_,
    'model_name': best_model_name,
    'metrics': best_metrics
}

joblib.dump(artifacts, filename)
print(f"Model berhasil disimpan ke {filename}")