## 1. ML flow settings

- pip install mlflow
- mlflow ui

## 2. Model Load

In [1]:
# 라이브러리 import
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import mlflow
import mlflow.sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

from mlflow.tracking import MlflowClient

In [2]:
iris = load_iris() # 꽃 받침과 꽃 잎 사이즈를 가지고 꽃의 종류를 결정

X = iris.data
y = iris.target

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 학습 데이터와 테스트 데이터로 분리 => train_test_split()
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=123)

### 모델 학습과 모델 성능 

- 심플하게 모든 것을 ML flow에게 맡긴다. => mlflow.autolog()
- autolog에서 추적하지 못하는 다른 파라미터, 메트릭, 메타데이터 등등의 값을 수동으로 기록

In [3]:
#mlflow.set_tracking_uri("http://127.0.0.1:5001")
mlflow.set_tracking_uri("http://10.103.73.87:8080") # dev.fastcampus.com:5000
print("Tracking URI : ", mlflow.get_tracking_uri())

Tracking URI :  http://10.103.73.87:8080


In [4]:
exp = mlflow.set_experiment(experiment_name='iris_classification_experiments_s3_test')

print(f"Name: {exp.name}")
print(f"ID: {exp.experiment_id}")
print(f"Location: {exp.artifact_location}")
print(f"Tags: {exp.tags}")
print(f"Lifecycle: {exp.lifecycle_stage}")
print(f"Create Timestamp: {exp.creation_time}")

2024/09/12 18:10:00 INFO mlflow.tracking.fluent: Experiment with name 'iris_classification_experiments_s3_test' does not exist. Creating a new experiment.


Name: iris_classification_experiments_s3_test
ID: 1
Location: s3://team06-mlflow-feature/1
Tags: {}
Lifecycle: active
Create Timestamp: 1726132200727


In [5]:
models = {
    "LogisticRegression" : LogisticRegression(
        max_iter=200, # 최대 반복 횟수
        C=0.5, # 규제 강도(C값이 작을수록 규제가 강해짐)
        solver='lbfgs', # 최적화 알고리즘
        random_state=123
    ),
    "RandomForest" : RandomForestClassifier(
        n_estimators=100, # 트리의 갯수
        max_depth=None,
        random_state=123
    ),
    "SVC" : SVC(
        kernel='linear', # linear, sigmoid, poly, rbf
        random_state=123
    )
}

In [6]:

mlflow.autolog()

best_accuracy = 0
best_model = None
best_model_name = None

# 전체 모델에 대해서 기록을 하고 싶은데?
for model_name, model in models.items():
    print(model_name)
    with mlflow.start_run(run_name=model_name, nested=True) as run:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model_name = model_name
            best_model = model
            best_run_id = run.info.run_id  # 현재 run의 ID 저장
            best_experiment_id = run.info.experiment_id  # 현재 experiment ID 저장

        # 모델을 mlflow에 저장
        model_path = f"{model_name}_model"
        mlflow.sklearn.log_model(model, model_path) # 모델을 artifact 디렉토리에 저장

        mlflow.log_param(f'{model_name}_param', model.get_params()) # 파라미터 로그
        mlflow.log_metric(f'{model_name}_accuracy', accuracy) # 메트릭 로그

        print(f"Model Name: {model_name}, Accuracy: {accuracy}")


print()
print()
print()
print(best_run_id)
print(best_experiment_id)

2024/09/12 18:10:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


LogisticRegression


2024/09/12 18:10:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression at: http://10.103.73.87:8080/#/experiments/1/runs/96c44a126d2040c9a9b1a8361eb2f49d.
2024/09/12 18:10:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://10.103.73.87:8080/#/experiments/1.


Model Name: LogisticRegression, Accuracy: 0.9666666666666667
RandomForest


2024/09/12 18:10:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest at: http://10.103.73.87:8080/#/experiments/1/runs/4fad06b9de91403f93b6961fb6121b8e.
2024/09/12 18:10:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://10.103.73.87:8080/#/experiments/1.


Model Name: RandomForest, Accuracy: 0.9333333333333333
SVC


2024/09/12 18:10:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVC at: http://10.103.73.87:8080/#/experiments/1/runs/e24f73e47d0b4e59886c2398168a091e.
2024/09/12 18:10:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://10.103.73.87:8080/#/experiments/1.


Model Name: SVC, Accuracy: 0.9333333333333333



96c44a126d2040c9a9b1a8361eb2f49d
1


In [7]:
# 모델 관리


client = MlflowClient()


# 모델을 등록하고, 해당 모델의 버전을 반환
def register_model(model_name, run_id, accuracy, model_uri='model'): # 모델 등록
    model_uri = f"runs:/{run_id}/{model_uri}"
    model_version = mlflow.register_model(model_uri, model_name, tags = {'stage':'staging', 'accuracy':f"{accuracy:0.5f}"})
    return model_version
'''
# 등록된 모델을 stage 단계로 승격
def promote_to_staging(model_name, run_id, model_uri): # stage
    model_version = register_model(model_name, run_id, model_uri)

    client.set_model_version_tag(
        name=model_name,
        version=model_version.version,
        key='stage',
        value='staging',
        key='accuracy',
        value = accuracy
        
    )
    print(f"Model: {model_name}, version: {model_version} promoted to Staging...")
'''
def promote_to_production(model_name, version): # production
    client.set_model_version_tag(
        name=model_name,
        version=version,
        key='stage',
        value='production'
    )

    print(f"Model: {model_name}, version: {version} promoted to Production...")


def archive_model(model_name, version): # archive: 모델 폐기 단계
    client.set_model_version_tag(
        name=model_name,
        version=version,
        key='stage',
        value='archived'
    )
    
    print(f"Model: {model_name}, version: {version} Archived ...")

In [8]:
# http://127.0.0.1:5000/#/experiments/273063112817362178/runs/c43fcd5ca3e1413cbcd802d622f591e6
# 실험ID: 273063112817362178
# 실행ID: c43fcd5ca3e1413cbcd802d622f591e6
# Model Name: LogisticRegression

# (1) 모델 등록
run_id = best_run_id
model_name = best_model_name
accuracy = best_accuracy

model_version = register_model(model_name, run_id, accuracy)
print(model_version)

Successfully registered model 'LogisticRegression'.
2024/09/12 18:10:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression, version 1


<ModelVersion: aliases=[], creation_timestamp=1726132250035, current_stage='None', description='', last_updated_timestamp=1726132250035, name='LogisticRegression', run_id='96c44a126d2040c9a9b1a8361eb2f49d', run_link='', source='s3://team06-mlflow-feature/1/96c44a126d2040c9a9b1a8361eb2f49d/artifacts/model', status='READY', status_message='', tags={'accuracy': '0.96667', 'stage': 'staging'}, user_id='', version='1'>


Created version '1' of model 'LogisticRegression'.


In [21]:
# (2) 모델을 staging 단계로 승격
promote_to_staging(model_name, run_id, 'model')

NameError: name 'promote_to_staging' is not defined

In [70]:
# (3) 모델을 Production 단계로 승격
promote_to_production(model_name, '18')

Model: LogisticRegression, version: 18 promoted to Production...


In [71]:
# (4) 새로운 버전의 모델을 Production으로 승격시키고, 기존의 Production 버전은 Archived
#promote_to_production(model_name, '3') # 4 staging -> production
archive_model(model_name, '18') # production -> archive

Model: LogisticRegression, version: 18 Archived ...


### 모델 Serving

- FastAPI, Flask ... => API로 언제만들지...?????
- mlflow가 해결을 해줍니다.
- inference: 값을 전달하고, 그 값에 대한 예측값을 return (API)