## 1. ML flow settings

- pip install mlflow
- mlflow ui

## 2. Model Load

In [1]:
# 라이브러리 import
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import mlflow
import mlflow.sklearn

ModuleNotFoundError: No module named 'sklearn'

In [55]:
iris = load_iris() # 꽃 받침과 꽃 잎 사이즈를 가지고 꽃의 종류를 결정

X = iris.data
y = iris.target

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 학습 데이터와 테스트 데이터로 분리 => train_test_split()
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=123)

In [56]:
# 통계 - 데이터가 표준분포를 따른다는 가정하에
# 여러분들이 데이터 분석을 열심히해서 => 팀장님께 전달드렸어요.
# 팀장님이 질문을 합니다. => 좋은데... 이 분석결과를 어떻게 믿을 수 있는데? => 통계적인 데이터 
# ex) A후보가 오차범위 +- 3% 내 당선 유력합니다.

# 강남구 평균 재산 조사를 해봤더니 30억 // 이재용 회장님이 껴있는거야. (100조) 
# - 전처리 => outlier 이상치를 제거, 데이터 분포를 보고, 평균과 표준편차를 확인하고, 왜도-첨도.

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(max_iter=0)
model.fit(X_train, y_train) # train=모의고사 # 학습을 시킬 때는 학습 데이터만 제공

y_pred = model.predict(X_test) # 수능 문제를 제공
accuracy = accuracy_score(y_test, y_pred)

print(f"정확도 : {accuracy * 100}")

정확도 : 93.33333333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 모델 학습과 모델 성능 

- 심플하게 모든 것을 ML flow에게 맡긴다. => mlflow.autolog()
- autolog에서 추적하지 못하는 다른 파라미터, 메트릭, 메타데이터 등등의 값을 수동으로 기록

In [58]:
mlflow.set_tracking_uri("http://127.0.0.1:5000") # dev.fastcampus.com:5000
print("Tracking URI : ", mlflow.get_tracking_uri())

Tracking URI :  http://127.0.0.1:5000


In [59]:
exp = mlflow.set_experiment(experiment_name='iris_classification_experiments')

print(f"Name: {exp.name}")
print(f"ID: {exp.experiment_id}")
print(f"Location: {exp.artifact_location}")
print(f"Tags: {exp.tags}")
print(f"Lifecycle: {exp.lifecycle_stage}")
print(f"Create Timestamp: {exp.creation_time}")

2024/08/14 15:47:30 INFO mlflow.tracking.fluent: Experiment with name 'iris_classification_experiments' does not exist. Creating a new experiment.


Name: iris_classification_experiments
ID: 273063112817362178
Location: mlflow-artifacts:/273063112817362178
Tags: {}
Lifecycle: active
Create Timestamp: 1723618050874


In [60]:
# Create Timestamp: 1723613497264

import time
time.time()

1723618050.889921

In [61]:
import mlflow.sklearn

mlflow.autolog()

mlflow.start_run(nested=True) # 실험 시작
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train) # train=모의고사 # 학습을 시킬 때는 학습 데이터만 제공

y_pred = model.predict(X_test) # 수능 문제를 제공
accuracy = accuracy_score(y_test, y_pred)

print(f"정확도 : {accuracy * 100}")

mlflow.end_run() # 실험 종료

2024/08/14 15:47:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run respected-moose-134 at: http://127.0.0.1:5000/#/experiments/273063112817362178/runs/c5d40042030b4c16b222c8721b1e6178.
2024/08/14 15:47:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/273063112817362178.


정확도 : 96.66666666666667


In [62]:
import mlflow.sklearn

mlflow.autolog()

# with, end 구문을 붙이지 않아도 알아서 실험 종료가 됩니다.
with mlflow.start_run(nested=True): # 실험 시작
    model = LogisticRegression(max_iter=200)
    model.fit(X_train, y_train) # train=모의고사 # 학습을 시킬 때는 학습 데이터만 제공

    y_pred = model.predict(X_test) # 수능 문제를 제공
    accuracy = accuracy_score(y_test, y_pred)

    print(f"정확도 : {accuracy * 100}")

2024/08/14 15:47:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run gregarious-shark-891 at: http://127.0.0.1:5000/#/experiments/273063112817362178/runs/ed5afe209faf48fe865ad3f316e8a3ab.
2024/08/14 15:47:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/273063112817362178.


정확도 : 96.66666666666667


In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

models = {
    "LogisticRegression" : LogisticRegression(
        max_iter=200, # 최대 반복 횟수
        C=1.0, # 규제 강도(C값이 작을수록 규제가 강해짐)
        solver='lbfgs', # 최적화 알고리즘
        random_state=123
    ),
    "RandomForest" : RandomForestClassifier(
        n_estimators=100, # 트리의 갯수
        max_depth=None,
        random_state=123
    ),
    "SVC" : SVC(
        kernel='linear', # linear, sigmoid, poly, rbf
        random_state=123
    )
}

In [64]:
# 위 모델들을 한번에 불러와서(반복문) => 최고의 모델을 찾아내고, 해당 파라미터를 기록합니다.

mlflow.autolog()

best_accuracy = 0
best_model = None
best_model_name = None

with mlflow.start_run(nested=True):
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model_name = model_name
            best_model = model

        print(f"Model Name: {model_name}, Accuracy: {accuracy}")

        mlflow.log_param('best_model', best_model_name) # 파라미터 로그
        mlflow.log_metric('best_accuracy', best_accuracy) # 메트릭 로그

    print(f"Best Model Name: {best_model_name}, Best Accuracy: {best_accuracy}")

Model Name: LogisticRegression, Accuracy: 0.9666666666666667


2024/08/14 15:47:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run calm-robin-731 at: http://127.0.0.1:5000/#/experiments/273063112817362178/runs/e49366bcf52d45bb94dd04d1b8a301fd.
2024/08/14 15:47:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/273063112817362178.


Model Name: RandomForest, Accuracy: 0.9333333333333333
Model Name: SVC, Accuracy: 0.9333333333333333
Best Model Name: LogisticRegression, Best Accuracy: 0.9666666666666667


In [65]:
mlflow.autolog()

# 전체 모델에 대해서 기록을 하고 싶은데?
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name, nested=True):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # 모델을 mlflow에 저장
        model_path = f"{model_name}_model"
        mlflow.sklearn.log_model(model, model_path) # 모델을 artifact 디렉토리에 저장

        mlflow.log_param(f'{model_name}_param', model.get_params()) # 파라미터 로그
        mlflow.log_metric(f'{model_name}_accuracy', accuracy) # 메트릭 로그

        print(f"Model Name: {model_name}, Accuracy: {accuracy}")

2024/08/14 15:47:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/273063112817362178/runs/c43fcd5ca3e1413cbcd802d622f591e6.
2024/08/14 15:47:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/273063112817362178.


Model Name: LogisticRegression, Accuracy: 0.9666666666666667


2024/08/14 15:47:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/273063112817362178/runs/b51b64586c584bc687437f1d82701270.
2024/08/14 15:47:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/273063112817362178.


Model Name: RandomForest, Accuracy: 0.9333333333333333


2024/08/14 15:47:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVC at: http://127.0.0.1:5000/#/experiments/273063112817362178/runs/3ca2a2a4a7dd46dfa7a6068a5ca1d06d.
2024/08/14 15:47:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/273063112817362178.


Model Name: SVC, Accuracy: 0.9333333333333333


In [66]:
# 제 전략
# - 회사에서 전략: 3년 이내 억대 연봉 => ??? => 2~3인분 하는거죠.
# - 내 업무 + 그 외 영역까지 함께.
# - Android SDK + Web SDK + Flutter SDK + React SDK ... 
# - 빠르게 억대 연봉을 찍고 그 다음 스텝으로 넘어가고 싶었어요. => 스톡옵션을 받을 수 있습니다.

In [71]:
# 모델 관리
from mlflow.tracking import MlflowClient

client = MlflowClient()


# 모델을 등록하고, 해당 모델의 버전을 반환
def register_model(model_name, run_id, model_uri='model'): # 모델 등록
    model_uri = f"runs:/{run_id}/{model_uri}"
    model_version = mlflow.register_model(model_uri, model_name)
    return model_version

# 등록된 모델을 stage 단계로 승격
def promote_to_staging(model_name, run_id, model_uri): # stage
    model_version = register_model(model_name, run_id, model_uri)

    client.set_model_version_tag(
        name=model_name,
        version=model_version.version,
        key='stage',
        value='staging'
    )
    print(f"Model: {model_name}, version: {model_version} promoted to Staging...")

def promote_to_production(model_name, version): # production
    client.set_model_version_tag(
        name=model_name,
        version=version,
        key='stage',
        value='production'
    )

    print(f"Model: {model_name}, version: {version} promoted to Production...")


def archive_model(model_name, version): # archive: 모델 폐기 단계
    client.set_model_version_tag(
        name=model_name,
        version=version,
        key='stage',
        value='archived'
    )
    
    print(f"Model: {model_name}, version: {version} Archived ...")

부트캠프: 짧은 시간 내에 실무지식 익혀서 성과(포트폴리오)
- 포트폴리오
- 기록: 너 공부 얼마나 열심히했어? -> 제 블로그 보시면 압니다.. 새벽에 감수성 있는 글도 작성.
- 일기장: 눈물자국.

이악깡 - 악으로 깡으로

In [68]:
# http://127.0.0.1:5000/#/experiments/273063112817362178/runs/c43fcd5ca3e1413cbcd802d622f591e6
# 실험ID: 273063112817362178
# 실행ID: c43fcd5ca3e1413cbcd802d622f591e6
# Model Name: LogisticRegression

# (1) 모델 등록
run_id = 'c43fcd5ca3e1413cbcd802d622f591e6'
model_name = 'LogisticRegression'

model_version = register_model(model_name, run_id)
print(model_version)

Successfully registered model 'LogisticRegression'.
2024/08/14 16:40:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression, version 1


<ModelVersion: aliases=[], creation_timestamp=1723621249544, current_stage='None', description='', last_updated_timestamp=1723621249544, name='LogisticRegression', run_id='c43fcd5ca3e1413cbcd802d622f591e6', run_link='', source='mlflow-artifacts:/273063112817362178/c43fcd5ca3e1413cbcd802d622f591e6/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>


Created version '1' of model 'LogisticRegression'.


In [73]:
# (2) 모델을 staging 단계로 승격
promote_to_staging(model_name, run_id, 'model')

Registered model 'LogisticRegression' already exists. Creating a new version of this model...
2024/08/14 16:48:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression, version 4


Model: LogisticRegression, version: <ModelVersion: aliases=[], creation_timestamp=1723621682358, current_stage='None', description='', last_updated_timestamp=1723621682358, name='LogisticRegression', run_id='c43fcd5ca3e1413cbcd802d622f591e6', run_link='', source='mlflow-artifacts:/273063112817362178/c43fcd5ca3e1413cbcd802d622f591e6/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='4'> promoted to Staging...


Created version '4' of model 'LogisticRegression'.


In [74]:
# (3) 모델을 Production 단계로 승격
promote_to_production(model_name, '3')

Model: LogisticRegression, version: 3 promoted to Production...


In [75]:
# (4) 새로운 버전의 모델을 Production으로 승격시키고, 기존의 Production 버전은 Archived
promote_to_production(model_name, '4') # 4 staging -> production
archive_model(model_name, '3') # production -> archive

Model: LogisticRegression, version: 4 promoted to Production...
Model: LogisticRegression, version: 3 Archived ...


### 모델 Serving

- FastAPI, Flask ... => API로 언제만들지...?????
- mlflow가 해결을 해줍니다.
- inference: 값을 전달하고, 그 값에 대한 예측값을 return (API)