In [148]:
import os
import mlflow
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
# Load the cleaned lung-cancer survey CSV from the processed-data folder.
df = pd.read_csv(filepath_or_buffer="../data/processed_data/survey_lung_cancer_clean.csv")

In [149]:
# Turn scikit-learn autologging off to avoid conflicts with the manual
# mlflow.log_* calls made during the experiment run.
mlflow.sklearn.autolog(disable=True)

# Override the USER environment variable for this kernel process.
os.environ["USER"] = "Ulugbek Tursunkulov"

In [150]:
# Point the MLflow client at the local tracking server.
mlflow.set_tracking_uri(uri="http://localhost:5000")
# Select the experiment; left as the last expression so the cell displays it.
mlflow.set_experiment(experiment_name="Lung_cancer")

<Experiment: artifact_location='file:///C:/Users/Asus/Desktop/lung_cancer/reports/ml_flow_service/artefacts/890502249313353174', creation_time=1760192483745, experiment_id='890502249313353174', last_update_time=1760192483745, lifecycle_stage='active', name='Lung_cancer', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [151]:
import time
from importlib.metadata import version

import matplotlib.pyplot as plt
import seaborn as sns
from mlflow.models import infer_signature
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    accuracy_score,
    classification_report,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

def _classification_metrics(y_true, y_pred, y_prob, suffix):
    """Compute the metric set that is logged for one data split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Hard class predictions.
        y_prob: Score of the positive class (column 1 of ``predict_proba``).
        suffix: Split name appended to every metric key, e.g. ``"valid"``.

    Returns:
        dict mapping ``"<metric>_<suffix>"`` to the score. Precision, recall
        and F1 are macro-averaged.
    """
    return {
        f"accuracy_{suffix}": accuracy_score(y_true, y_pred),
        f"precision_{suffix}": precision_score(y_true, y_pred, average="macro"),
        f"recall_{suffix}": recall_score(y_true, y_pred, average="macro"),
        f"f1_score_{suffix}": f1_score(y_true, y_pred, average="macro"),
        f"roc_auc_{suffix}": roc_auc_score(y_true, y_prob),
    }


def run_experiment(model_name, model_class, run_name,
                   grid_param, x_tr, y_tr, x_vl, y_vl, x_te, y_te, scaler=False, mix=False,
                   register_model=True, model_registry_name=None):
    """Grid-search a model, evaluate it, and log everything to one MLflow run.

    Steps, all inside a single ``mlflow.start_run``:
      1. Build a pipeline (optional ``StandardScaler`` + ``model_class()``) and
         tune it with 5-fold ``GridSearchCV`` scored by macro F1.
      2. Evaluate the best estimator on the validation split and save the
         confusion-matrix / ROC / precision-recall figures.
      3. If ``mix`` is true, refit the best configuration on train + valid.
      4. Evaluate the final model on the test split and log it as an artifact.
      5. Optionally register the model and promote it to "Staging" when both
         test macro-F1 and test ROC-AUC exceed 0.8.
      6. Log the dataset CSV as an artifact.

    Args:
        model_name: Short label logged as a param and used in the default
            registry name.
        model_class: Estimator class; instantiated with no arguments.
        run_name: MLflow run name, also used in figure file names.
        grid_param: Parameter grid for ``GridSearchCV``; keys address pipeline
            steps (the estimator step is named ``model``).
        x_tr, y_tr: Training features / labels.
        x_vl, y_vl: Validation features / labels.
        x_te, y_te: Test features / labels.
        scaler: Prepend a ``StandardScaler`` step when true.
        mix: Refit on train + valid before the final test when true.
        register_model: Register the logged model in the Model Registry.
        model_registry_name: Registry name; defaults to
            ``"<model_name>_LungCancer"``.

    Returns:
        ``(last_model, metrics_valid, metrics_test, model_version)``.
        ``model_version`` is ``None`` when registration is disabled or failed.
    """
    with mlflow.start_run(run_name=run_name):
        # --- Run metadata ---------------------------------------------------
        mlflow.log_param("sklearn_version", version("scikit-learn"))
        mlflow.log_param("timestamp", time.strftime("%Y-%m-%d %H:%M:%S"))
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("train_size", len(x_tr))
        mlflow.log_param("valid_size", len(x_vl))
        mlflow.log_param("test_size", len(x_te))

        # --- Pipeline + hyper-parameter search --------------------------------
        steps = []
        if scaler:
            steps.append(('scaler', StandardScaler()))
        steps.append(('model', model_class()))
        pipeline = Pipeline(steps)

        grid = GridSearchCV(
            estimator=pipeline,
            param_grid=grid_param,
            scoring=make_scorer(f1_score, average='macro'),
            cv=5,
            n_jobs=-1,
            verbose=1
        )
        grid.fit(x_tr, y_tr)
        best_model = grid.best_estimator_
        mlflow.log_params(grid.best_params_)

        # --- Validation -------------------------------------------------------
        print("Validation of model...")
        y_valid_pred = best_model.predict(x_vl)
        y_valid_prob = best_model.predict_proba(x_vl)[:, 1]

        metrics_valid = _classification_metrics(y_vl, y_valid_pred, y_valid_prob, "valid")
        print("=== Validation Metrics ===")
        print(classification_report(y_vl, y_valid_pred))
        mlflow.log_metrics(metrics_valid)

        _save_confusion_matrix(y_vl, y_valid_pred, run_name, "valid")
        _save_roc_curve(y_vl, y_valid_prob, run_name, "valid")
        _save_precision_recall_curve(y_vl, y_valid_prob, run_name, "valid")

        # --- Optional refit on train + valid ----------------------------------
        if mix:
            print("Переобучение на объединенных данных (train + valid)...")
            x_train_full = np.vstack([x_tr, x_vl])
            y_train_full = np.concatenate([y_tr, y_vl])
            # Fresh pipeline configured with the winning hyper-parameters.
            last_model = Pipeline(steps)
            last_model.set_params(**grid.best_params_)
            last_model.fit(x_train_full, y_train_full)

            mlflow.log_param("train_used", "train+valid")
        else:
            # No refit requested: the grid-search winner is the final model.
            last_model = best_model
            mlflow.log_param("train_used", "train_only")

        # --- Final test -------------------------------------------------------
        print("Тестирование финальной модели...")
        y_test_pred = last_model.predict(x_te)
        y_test_prob = last_model.predict_proba(x_te)[:, 1]

        metrics_test = _classification_metrics(y_te, y_test_pred, y_test_prob, "test")
        print("=== Test Metrics ===")
        print(classification_report(y_te, y_test_pred))
        mlflow.log_metrics(metrics_test)

        # Infer the signature from the model that is actually being logged.
        signature = infer_signature(x_tr, last_model.predict(x_tr))
        mlflow.sklearn.log_model(
            sk_model=last_model,
            name="model",
            signature=signature,
            input_example=x_tr[:5],  # Sample input for documentation
        )

        # --- Optional Model Registry registration -----------------------------
        # Stays None unless registration completes without an error.
        model_version = None
        if register_model:
            try:
                run_id = mlflow.active_run().info.run_id

                if model_registry_name is None:
                    model_registry_name = f"{model_name}_LungCancer"

                model_uri = f"runs:/{run_id}/model"
                model_version = mlflow.register_model(
                    model_uri=model_uri,
                    name=model_registry_name,
                    tags={
                        "model_type": model_name,
                        "experiment_date": time.strftime("%Y-%m-%d"),
                        "data_preprocessing": "scaler" if scaler else "no_scaler",
                        "training_strategy": "train+valid" if mix else "train_only",
                        "f1_score_test": f"{metrics_test['f1_score_test']:.4f}",
                        "roc_auc_test": f"{metrics_test['roc_auc_test']:.4f}"
                    }
                )

                print(f"✅ Модель зарегистрирована в Model Registry:")
                print(f"   Имя: {model_registry_name}")
                print(f"   Версия: {model_version.version}")
                print(f"   URI: {model_uri}")

                # Record which registry entry this run produced.
                mlflow.log_param("model_registry_name", model_registry_name)
                mlflow.log_param("model_version", model_version.version)

                # Promote to "Staging" only when both test metrics clear 0.8.
                if metrics_test['f1_score_test'] > 0.8 and metrics_test['roc_auc_test'] > 0.8:
                    client = mlflow.tracking.MlflowClient()
                    client.transition_model_version_stage(
                        name=model_registry_name,
                        version=model_version.version,
                        stage="Staging"
                    )
                    print(f"🚀 Модель переведена в статус 'Staging' (хорошие метрики)")

            except Exception as e:
                # Best-effort: the model artifact is already logged, so a
                # registry failure must not abort the run. As before, any
                # failure in this section yields None for the version.
                model_version = None
                print(f"⚠️ Ошибка при регистрации модели: {e}")
                print("Модель сохранена как артефакт, но не зарегистрирована в реестре")

        # --- Dataset artifact -------------------------------------------------
        # Runs on every path; previously a successful registration returned
        # early and skipped this step and the completion message.
        try:
            mlflow.log_artifact("../data/processed_data/survey_lung_cancer_clean.csv")
        except Exception as e:
            print(f"Предупреждение: не удалось залогировать файл данных: {e}")

        print("Эксперимент завершен успешно!")
        return last_model, metrics_valid, metrics_test, model_version
    

In [None]:
def _save_confusion_matrix(y_true, y_pred, run_name, dataset_type):
    """Plot the confusion matrix, save it as a PNG and log it to MLflow.

    Best-effort: any failure is printed as a warning and not raised, so a
    plotting problem cannot abort the surrounding experiment run.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        run_name: Run name; used in the plot title and the file name.
        dataset_type: Split label (e.g. "valid"); used in title and file name.
    """
    fig = None
    try:
        disp = ConfusionMatrixDisplay.from_predictions(
            y_true, y_pred,
            display_labels=["No Cancer", "Cancer"],
            cmap=plt.cm.Blues
        )
        fig = disp.figure_
        disp.ax_.set_title(f"Confusion Matrix: {run_name} ({dataset_type})")
        cm_path = f"../reports/figures/conf_matrix/conf_matrix_{run_name}_{dataset_type}.png"
        # Create the target folder so savefig does not fail on a clean checkout.
        os.makedirs(os.path.dirname(cm_path), exist_ok=True)
        fig.savefig(cm_path, dpi=100, bbox_inches='tight')
        mlflow.log_artifact(cm_path)
    except Exception as e:
        print(f"Предупреждение: не удалось сохранить матрицу ошибок: {e}")
    finally:
        # Release the figure even when saving or logging failed.
        if fig is not None:
            plt.close(fig)

In [None]:
def _save_roc_curve(y_true, y_prob, run_name, dataset_type):
    """Plot the ROC curve, save it as a PNG and log it to MLflow.

    Best-effort: any failure is printed as a warning and not raised, so a
    plotting problem cannot abort the surrounding experiment run.

    Args:
        y_true: Ground-truth labels.
        y_prob: Scores for the positive class.
        run_name: Run name; used in the plot title and the file name.
        dataset_type: Split label (e.g. "valid"); used in title and file name.
    """
    fig = None
    try:
        disp = RocCurveDisplay.from_predictions(y_true, y_prob)
        fig = disp.figure_
        disp.ax_.set_title(f"ROC Curve: {run_name} ({dataset_type})")
        roc_path = f"../reports/figures/roc_auc/roc_auc_{run_name}_{dataset_type}.png"
        # Create the target folder so savefig does not fail on a clean checkout.
        os.makedirs(os.path.dirname(roc_path), exist_ok=True)
        fig.savefig(roc_path, dpi=100, bbox_inches='tight')
        mlflow.log_artifact(roc_path)
    except Exception as e:
        print(f"Предупреждение: не удалось сохранить ROC кривую: {e}")
    finally:
        # Release the figure even when saving or logging failed.
        if fig is not None:
            plt.close(fig)

In [None]:
def _save_precision_recall_curve(y_true, y_prob, run_name, dataset_type):
    """Plot the precision-recall curve, save it as a PNG and log it to MLflow.

    Best-effort: any failure is printed as a warning and not raised, so a
    plotting problem cannot abort the surrounding experiment run.

    Args:
        y_true: Ground-truth labels.
        y_prob: Scores for the positive class.
        run_name: Run name; used in the plot title and the file name.
        dataset_type: Split label (e.g. "valid"); used in title and file name.
    """
    fig = None
    try:
        disp = PrecisionRecallDisplay.from_predictions(y_true, y_prob)
        fig = disp.figure_
        disp.ax_.set_title(f"Precision-Recall Curve: {run_name} ({dataset_type})")
        pr_path = f"../reports/figures/prec_recall/prec_recall_{run_name}_{dataset_type}.png"
        # Create the target folder so savefig does not fail on a clean checkout.
        os.makedirs(os.path.dirname(pr_path), exist_ok=True)
        fig.savefig(pr_path, dpi=100, bbox_inches='tight')
        mlflow.log_artifact(pr_path)
    except Exception as e:
        print(f"Предупреждение: не удалось сохранить Precision-Recall кривую: {e}")
    finally:
        # Release the figure even when saving or logging failed.
        if fig is not None:
            plt.close(fig)

In [152]:
from imblearn.over_sampling import RandomOverSampler

def oversample_dataset(dataframe, oversample=False, random_state=40):
    """Split a frame into features and target, optionally oversampling.

    The last column of ``dataframe`` is taken as the target; all preceding
    columns are features. Columns are selected by position.

    Args:
        dataframe: pandas DataFrame whose final column holds the labels.
        oversample: When true, resample ``x``/``y`` with ``RandomOverSampler``.
        random_state: Seed passed to the sampler. Defaults to 40, the value
            that used to be hard-coded.

    Returns:
        ``(data, x, y)`` where ``x`` is the feature array, ``y`` the label
        array, and ``data`` is ``x`` with ``y`` appended as the last column.
    """
    x = dataframe.iloc[:, :-1].to_numpy()
    y = dataframe.iloc[:, -1].to_numpy()

    if oversample:
        print("Oversampling ACTIVATED ✅")
        sampler = RandomOverSampler(random_state=random_state)
        x, y = sampler.fit_resample(x, y)
    else:
        print("Oversampling skipped ❌")

    # Re-attach the labels as a trailing column.
    data = np.hstack((x, np.reshape(y, (-1, 1))))
    return data, x, y

In [153]:
# Shuffle once with a fixed seed, then cut 60 % / 20 % / 20 % by position.
# Plain .iloc slicing yields the same partitions as np.split(DataFrame, ...)
# without the FutureWarning that np.split triggers on recent pandas (it goes
# through the deprecated DataFrame.swapaxes).
shuffled = df.sample(frac=1, random_state=40)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))

train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# Independent copies of each split as DataFrames.
train_orig = train.copy()
valid_orig = valid.copy()
test_orig = test.copy()

  return bound(*args, **kwds)


In [154]:
# Class balance of the training split before any oversampling.
# Reads train_orig rather than train: the oversampling cell rebinds `train`
# to a NumPy array, after which indexing it by column name fails on re-run.
print(len(train_orig[train_orig["LUNG_CANCER"] == 1]))  # rows labelled 1
print(len(train_orig[train_orig["LUNG_CANCER"] == 0]))  # rows labelled 0

166
19


In [155]:
# Convert each split into (stacked array, features, labels).
# Only the training split is oversampled; validation and test are passed
# through unchanged.
# NOTE(review): the training rows are oversampled before any later
# cross-validation, so duplicated rows may land in both the fit and held-out
# folds - confirm this is intended.
train, X_train, y_train = oversample_dataset(dataframe=train_orig, oversample=True)
valid, X_valid, y_valid = oversample_dataset(dataframe=valid_orig, oversample=False)
test, X_test, y_test = oversample_dataset(dataframe=test_orig, oversample=False)

Oversampling ACTIVATED ✅
Oversampling skipped ❌
Oversampling skipped ❌


In [156]:
# from sklearn.neighbors import  KNeighborsClassifier
# from sklearn.metrics import classification_report
# 
# param_grid = {
#     'knn__n_neighbors': [3, 5, 7, 9, 11],              # количество соседей
#     'knn__weights': ['uniform', 'distance'],           # веса
#     'knn__metric': ['euclidean', 'manhattan'],         # метрика
#     'knn__p': [1, 2],                                  # степень метрики Minkowski
#     'knn__algorithm': ['auto', 'kd_tree', 'ball_tree'] # способ поиска соседей
# }
# 
# best_knn, y_pred = run_experiment("KNN", "KNN_grid_search_oversample", param_grid, X_train, y_train, X_test, y_test)


In [157]:
# from sklearn.naive_bayes import GaussianNB
# def experiment_nb(run_name, x_tr, y_tr, x_te, y_te):
#     model = GaussianNB()
#     return run_experiment("GaussianNB", run_name, param_grid, x_tr, y_tr, x_te, y_te)

In [158]:
from sklearn.naive_bayes import GaussianNB
# Grid over GaussianNB's variance-smoothing term; the "model__" prefix
# addresses the estimator step of the pipeline built by run_experiment.
param_grid = {
    'model__var_smoothing': [1e-12, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6]
}

# run_experiment returns four values; the fourth is the registered model
# version (None when registration is disabled or failed). Unpacking into
# only three names raises ValueError.
final_model, metrics_of_valid, metrics_of_test, registered_version = run_experiment(
    model_name="GNB",          # model label
    model_class=GaussianNB,    # estimator class
    run_name="GaussianNB_oversample_scaler_mix",
    grid_param=param_grid,
    x_tr=X_train,
    y_tr=y_train,
    x_vl=X_valid,
    y_vl=y_valid,
    x_te=X_test,
    y_te=y_test,
    scaler=True,
    mix=True                  # refit on train + valid before the final test
)


Fitting 5 folds for each of 6 candidates, totalling 30 fits


2025/10/11 16:42:37 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


=== Validation Metrics ===
              precision    recall  f1-score   support

           0       0.67      0.60      0.63        10
           1       0.92      0.94      0.93        52

    accuracy                           0.89        62
   macro avg       0.80      0.77      0.78        62
weighted avg       0.88      0.89      0.88        62




=== Test Metrics ===
              precision    recall  f1-score   support

           0       0.62      0.80      0.70        10
           1       0.96      0.90      0.93        52

    accuracy                           0.89        62
   macro avg       0.79      0.85      0.81        62
weighted avg       0.90      0.89      0.89        62
🏃 View run GaussianNB_oversample_scaler_mix at: http://localhost:5000/#/experiments/890502249313353174/runs/03ab81e0cfb0471c854a69177896dbdc
🧪 View experiment at: http://localhost:5000/#/experiments/890502249313353174
