In [None]:

import pandas as pd
import dagshub
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
import joblib
import os

# Initialise DagsHub for the Sidqiamn/Eksperimen_SML_Sidqi repository
# (called with mlflow=True). Any failure is reported on stdout and then
# re-raised, so the script does not continue without a working setup.
try:
    dagshub.init(
        repo_owner='Sidqiamn',
        repo_name='Eksperimen_SML_Sidqi',
        mlflow=True,
    )
    print("Dagshub initialization successful.")
except Exception as init_error:
    print(f"Error initializing Dagshub: {init_error}")
    raise

# Export the MLflow tracking settings and the AWS-style variables for the
# DagsHub repository. A password/token that is already present in the
# environment is kept; otherwise a non-functional placeholder is stored.
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/Sidqiamn/Eksperimen_SML_Sidqi.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "Sidqiamn"
os.environ["MLFLOW_TRACKING_PASSWORD"] = os.getenv("MLFLOW_TRACKING_PASSWORD", "your-secret-access-key")
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://dagshub.com/api/v1/repo-buckets/s3/Sidqiamn"
# NOTE(review): the fallback below is a hard-coded access key ID. It is kept
# so existing runs behave the same, but it should be supplied through
# MLFLOW_TRACKING_PUBLIC_KEY (or a secret store) and rotated if it is real.
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("MLFLOW_TRACKING_PUBLIC_KEY", "abc289b6e15d5a43a71660b390de5346f8354acc")
# The secret key mirrors the tracking password set a few lines above.
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("MLFLOW_TRACKING_PASSWORD", "your-secret-access-key")
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

# Report only WHETHER each secret is configured. Printing the value itself
# would leak the token into notebook output, logs and version control.
for _secret_name in ("MLFLOW_TRACKING_PASSWORD", "AWS_SECRET_ACCESS_KEY"):
    _secret_value = os.getenv(_secret_name)
    if not _secret_value:
        _secret_status = "Not set"
    elif _secret_value == "your-secret-access-key":
        _secret_status = "placeholder (no real credential configured)"
    else:
        _secret_status = "set (value hidden)"
    print(f"{_secret_name}: {_secret_status}")

# Enable MLflow's scikit-learn autologging before any estimator is fitted.
# A failure is printed and re-raised rather than silently ignored.
try:
    mlflow.sklearn.autolog()
    print("Autolog activated successfully.")
except Exception as autolog_error:
    print(f"Error activating autolog: {autolog_error}")
    raise

# Make sure the local 'models' output directory exists.
# exist_ok=True replaces the check-then-create pair with a single call, so
# there is no race between the check and the creation. It also raises
# FileExistsError immediately if 'models' exists but is not a directory.
os.makedirs('models', exist_ok=True)

# Read the processed Iris CSV into `data` and show its column index.
# A missing file produces a hint about the expected location, after which
# the original FileNotFoundError is re-raised.
try:
    data = pd.read_csv('dataset/processed/iris_processed.csv')
    print("Kolom dalam dataset:", data.columns)
except FileNotFoundError:
    print(
        "File 'iris_processed.csv' tidak ditemukan. "
        "Pastikan file ada di direktori dataset/processed/."
    )
    raise

# Show the per-column NaN counts before any cleaning.
print("Cek nilai NaN di dataset:")
print(data.isna().sum())

# Rows without a target label cannot be used for training, so they are dropped.
missing_label_count = data['Species'].isna().sum()
if missing_label_count == 0:
    print("Tidak ada nilai NaN di kolom 'Species'.")
else:
    print(f"Terdapat {missing_label_count} nilai NaN di kolom 'Species'.")
    data = data.dropna(subset=['Species'])
    print("Baris dengan NaN di kolom 'Species' telah dihapus.")

# Remaining gaps in the feature columns are filled with each numeric
# column's median.
feature_nan_total = data.drop(columns='Species').isna().sum().sum()
if feature_nan_total == 0:
    print("Tidak ada nilai NaN di fitur.")
else:
    print("Terdapat nilai NaN di fitur. Mengisi dengan median...")
    column_medians = data.median(numeric_only=True)
    data = data.fillna(column_medians)

# Separate the feature columns from the 'Species' target.
X = data.drop(columns=['Species'])
y = data['Species']

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameter grid for the random forest: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)
model = RandomForestClassifier(random_state=42)
# Grid search with cv=5, scored on accuracy; n_jobs=-1 requests parallel
# execution.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Run the grid search inside a single MLflow run, evaluate the refit best
# model on the held-out split, and log parameters, metrics, the text report
# and the serialized model. Any error is printed and re-raised.
try:
    with mlflow.start_run() as run:
        grid_search.fit(X_train, y_train)

        # Log the winning hyper-parameters in one batched call instead of
        # one request per parameter.
        best_params = grid_search.best_params_
        mlflow.log_params(best_params)

        # Evaluate on the test split; precision and recall are
        # weighted-averaged across the classes.
        y_pred = grid_search.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        report = classification_report(y_test, y_pred)

        # Send all three metrics in a single batched call.
        mlflow.log_metrics(
            {
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
            }
        )

        # Write the report with an explicit encoding so the artifact does not
        # depend on the platform's default locale.
        with open("classification_report.txt", "w", encoding="utf-8") as f:
            f.write(report)
        mlflow.log_artifact("classification_report.txt")

        # Save the best estimator locally and attach the same file to the run.
        best_model = grid_search.best_estimator_
        joblib.dump(best_model, 'models/rf_model_sidqi.joblib')
        mlflow.log_artifact('models/rf_model_sidqi.joblib')

        print("Akurasi model:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("\nLaporan Klasifikasi:\n", report)
        print("Model disimpan di models/rf_model_sidqi.joblib")
except Exception as e:
    print(f"Error saat menjalankan MLflow run: {e}")
    raise

Dagshub initialization successful.
MLFLOW_TRACKING_PASSWORD: 04e98d6a158abadac4c2c0a9fc4039a96fb728ab
AWS_SECRET_ACCESS_KEY: 04e98d6a158abadac4c2c0a9fc4039a96fb728ab
Autolog activated successfully.
Kolom dalam dataset: Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')
Cek nilai NaN di dataset:
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          4
dtype: int64
Terdapat 4 nilai NaN di kolom 'Species'.
Baris dengan NaN di kolom 'Species' telah dihapus.
Tidak ada nilai NaN di fitur.


2025/05/09 18:30:43 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.


🏃 View run ambitious-panda-382 at: https://dagshub.com/Sidqiamn/Eksperimen_SML_Sidqi.mlflow/#/experiments/0/runs/cd9dce6cbb054d09abd311ef10c019d7
🧪 View experiment at: https://dagshub.com/Sidqiamn/Eksperimen_SML_Sidqi.mlflow/#/experiments/0
🏃 View run incongruous-boar-678 at: https://dagshub.com/Sidqiamn/Eksperimen_SML_Sidqi.mlflow/#/experiments/0/runs/8bfb25d278d54097ae1634c64574e506
🧪 View experiment at: https://dagshub.com/Sidqiamn/Eksperimen_SML_Sidqi.mlflow/#/experiments/0
🏃 View run industrious-cow-948 at: https://dagshub.com/Sidqiamn/Eksperimen_SML_Sidqi.mlflow/#/experiments/0/runs/6814b3d69aa2481e9a116f2f75d1bd96
🧪 View experiment at: https://dagshub.com/Sidqiamn/Eksperimen_SML_Sidqi.mlflow/#/experiments/0
🏃 View run skittish-worm-728 at: https://dagshub.com/Sidqiamn/Eksperimen_SML_Sidqi.mlflow/#/experiments/0/runs/e15c340ee2b640959a19d1abe532e1c5
🧪 View experiment at: https://dagshub.com/Sidqiamn/Eksperimen_SML_Sidqi.mlflow/#/experiments/0
🏃 View run casual-bee-321 at: https:/