In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
data = pd.read_pickle('../data/redakt_dataset2.pkl').sample(frac=0.3, random_state = 2)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 99 to 71
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       91 non-null     int8    
 1   sex       91 non-null     category
 2   cp        91 non-null     category
 3   trestbps  91 non-null     int16   
 4   restecg   91 non-null     category
 5   thalach   91 non-null     int16   
 6   exang     91 non-null     category
 7   oldpeak   91 non-null     float16 
 8   slope     91 non-null     category
 9   ca        91 non-null     category
 10  thal      91 non-null     category
 11  target    91 non-null     category
dtypes: category(8), float16(1), int16(2), int8(1)
memory usage: 2.2 KB


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('target', axis=1), data['target'], test_size=0.3, random_state=40)

In [5]:
cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()
cat_features

['sex', 'cp', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [6]:
num_features = X_train.select_dtypes(include=['number']).columns.to_list()
num_features

['age', 'trestbps', 'thalach', 'oldpeak']

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import TargetEncoder

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', TargetEncoder(), cat_features)
    ]
)

In [9]:
pipeline = Pipeline(steps=[
    ('transform', preprocessor),
    ('model', RandomForestClassifier(random_state=40))
])

In [10]:
pipeline.fit(X_train, y_train)

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # Для ROC AUC нужно использовать вероятности

metrics = {}

# Метрики для классификации
metrics["accuracy"] = accuracy_score(y_test, y_pred)
metrics["precision"] = precision_score(y_test, y_pred)
metrics["recall"] = recall_score(y_test, y_pred)
metrics["f1"] = f1_score(y_test, y_pred)
metrics["roc_auc"] = roc_auc_score(y_test, y_pred_proba)

print(metrics)

{'accuracy': 0.8928571428571429, 'precision': 1.0, 'recall': 0.8333333333333334, 'f1': 0.9090909090909091, 'roc_auc': np.float64(0.9750000000000001)}


In [35]:
import mlflow
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)   
mlflow.set_registry_uri(registry_uri) 

In [31]:
from mlflow.models import infer_signature

signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)
params_dict = pipeline.get_params()

experiment_id = mlflow.create_experiment('heart diseases')

with mlflow.start_run(run_name='baseline model', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact('../requirements.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

RestException: RESOURCE_ALREADY_EXISTS: Experiment(name=heart diseases) already exists. Error: (raised as a result of Query-invoked autoflush; consider using a session.no_autoflush block if this flush is occurring prematurely)
(sqlite3.IntegrityError) UNIQUE constraint failed: experiments.name
[SQL: INSERT INTO experiments (name, artifact_location, lifecycle_stage, creation_time, last_update_time) VALUES (?, ?, ?, ?, ?)]
[parameters: ('heart diseases', '', 'active', 1734383368603, 1734383368603)]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [13]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder

X_train_copy = X_train.copy()

In [14]:
pf = PolynomialFeatures(degree=2)
pf.fit_transform(X_train_copy[['age','oldpeak']])

array([[1.0000e+00, 5.4000e+01, 1.2002e+00, 2.9160e+03, 6.4812e+01,
        1.4404e+00],
       [1.0000e+00, 6.1000e+01, 1.0000e+00, 3.7200e+03, 6.1000e+01,
        1.0000e+00],
       [1.0000e+00, 7.0000e+01, 2.4004e+00, 4.9000e+03, 1.6800e+02,
        5.7617e+00],
       [1.0000e+00, 5.6000e+01, 0.0000e+00, 3.1360e+03, 0.0000e+00,
        0.0000e+00],
       [1.0000e+00, 4.4000e+01, 6.0010e-01, 1.9360e+03, 2.6406e+01,
        3.6011e-01],
       [1.0000e+00, 4.6000e+01, 1.4004e+00, 2.1160e+03, 6.4438e+01,
        1.9609e+00],
       [1.0000e+00, 6.0000e+01, 1.2002e+00, 3.6000e+03, 7.2000e+01,
        1.4404e+00],
       [1.0000e+00, 4.0000e+01, 1.4004e+00, 1.6000e+03, 5.6000e+01,
        1.9609e+00],
       [1.0000e+00, 5.3000e+01, 1.2002e+00, 2.8080e+03, 6.3625e+01,
        1.4404e+00],
       [1.0000e+00, 5.6000e+01, 7.9980e-01, 3.1360e+03, 4.4781e+01,
        6.3965e-01],
       [1.0000e+00, 3.8000e+01, 3.8008e+00, 1.4440e+03, 1.4438e+02,
        1.4445e+01],
       [1.0000e+00, 4

In [15]:
pf_pipeline = Pipeline(steps=[
    ('poly', pf),
    ('scale', StandardScaler())
])

preprocessor_sklearn = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999), cat_features),
        ('poly', pf_pipeline, ['age', 'oldpeak'])
    ],
    remainder='drop',
)
X_train_copy[['age', 'oldpeak']] = X_train_copy[['age', 'oldpeak']].astype('float128')
X_train_copy_raw = preprocessor_sklearn.fit_transform(X_train_copy)
X_train_copy = pd.DataFrame(X_train_copy_raw, columns=preprocessor_sklearn.get_feature_names_out())

In [16]:
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display (X_train_copy)

Unnamed: 0,num__age,num__trestbps,num__thalach,num__oldpeak,cat__sex,cat__cp,cat__restecg,cat__exang,cat__slope,cat__ca,cat__thal,poly__1,poly__age,poly__oldpeak,poly__age^2,poly__age oldpeak,poly__oldpeak^2
0,0.020919,0.460302,0.392412,0.183108,1.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.020919,0.183108,-0.057287,0.187327,-0.185523
1,0.859570,1.141030,-0.635806,-0.012410,1.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0,0.859570,-0.012410,0.849776,0.117590,-0.314511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,-1.177155,-0.901155,0.973578,-0.989048,1.0,1.0,1.0,0.0,2.0,0.0,2.0,0.0,-1.177155,-0.989048,-1.161538,-0.998777,-0.607355
62,-0.218696,-0.356572,0.437117,-0.989048,1.0,0.0,1.0,1.0,2.0,1.0,2.0,0.0,-0.218696,-0.989048,-0.296166,-0.998777,-0.607355


In [17]:
column_names = X_train_copy.columns.tolist()

# Вывести названия столбцов
print(column_names)

['num__age', 'num__trestbps', 'num__thalach', 'num__oldpeak', 'cat__sex', 'cat__cp', 'cat__restecg', 'cat__exang', 'cat__slope', 'cat__ca', 'cat__thal', 'poly__1', 'poly__age', 'poly__oldpeak', 'poly__age^2', 'poly__age oldpeak', 'poly__oldpeak^2']


In [18]:
with open('column.txt', 'w') as f:
    for column in column_names:
        f.write(column + '\n')

In [19]:
pipeline_sklearn = Pipeline(steps=[
    ('transform', preprocessor_sklearn),
    ('model', RandomForestClassifier(random_state=43))
])

model_sklearn = pipeline_sklearn.fit(X_train, y_train)

y_pred2 = model_sklearn.predict(X_test)
y_pred_proba2 = model_sklearn.predict_proba(X_test)[:, 1] 

metrics2 = {}

# Метрики для классификации
metrics2["accuracy"] = accuracy_score(y_test, y_pred2)
metrics2["precision"] = precision_score(y_test, y_pred2)
metrics2["recall"] = recall_score(y_test, y_pred2)
metrics2["f1"] = f1_score(y_test, y_pred2)
metrics2["roc_auc"] = roc_auc_score(y_test, y_pred_proba2)

print(metrics2)

{'accuracy': 0.8571428571428571, 'precision': 0.9375, 'recall': 0.8333333333333334, 'f1': 0.8823529411764706, 'roc_auc': np.float64(0.9416666666666667)}


In [26]:
signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)
params_dict = model_sklearn.get_params()

with mlflow.start_run(run_name='new_features', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(model_sklearn, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_metrics(metrics2)
    mlflow.log_artifact('column.txt')
    mlflow.log_params(params_dic
RestException: RESOURCE_ALREADY_EXISTS: Experiment(name=heart diseases) already exists. Error: (raised as a result of Query-invoked autoflush; consider using a session.no_autoflush block if this flush is occurring prematurely)
(sqlite3.IntegrityError) UNIQUE constraint failed: experiments.name
[SQL: INSERT INTO experiments (name, artifact_location, lifecycle_stage, creation_time, last_update_time) VALUES (?, ?, ?, ?, ?)]
[parameters: ('heart diseases', '', 'active', 1734383488547, 1734383488547)]
(Background on this error at: https://sqlalche.me/e/20/gkpj)t)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/12/10 23:59:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run new_features at: http://127.0.0.1:5000/#/experiments/2/runs/299a13bb790c46c48d0ce0f3ccca698f.
2024/12/10 23:59:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.


In [20]:
from mlxtend.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(RandomForestClassifier(random_state=40), 
                                k_features=9,
                                forward=True,
                                floating=False, 
                                scoring='accuracy',
                                cv=2)

sfs.fit(X_train_copy,y_train)

selected_features_sfs = X_train_copy.loc[:, sfs.k_feature_names_]
selected_features_sfs
rfe_sfs_idx = list(sfs.k_feature_idx_)
print(rfe_sfs_idx)
rfe_sfs_col = list(sfs.k_feature_names_)
print(rfe_sfs_col)

[1, 2, 4, 5, 7, 9, 10, 11, 15]
['num__trestbps', 'num__thalach', 'cat__sex', 'cat__cp', 'cat__exang', 'cat__ca', 'cat__thal', 'poly__1', 'poly__age oldpeak']


In [21]:
with open('index.txt', 'w') as f:
    for i in rfe_sfs_idx:
        f.write(str(i) + '\n')

with open('column_new.txt', 'w') as f:
    for i in rfe_sfs_col:
        f.write(i + '\n')

In [22]:
class ColumnExtractor(object):

    def __init__(self, cols):
        self.cols = cols

    def transform(self, X):
        return X[:,self.cols]
    
    def fit(self, X, y=None):
        return self


rfe_sfs_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_sklearn), 
    ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),
    ('model', RandomForestClassifier(random_state=40))
])

rfe_sfs_pipeline.fit(X_train, y_train)

In [23]:
predictions_sfs = rfe_sfs_pipeline.predict(X_test)
y_pred_proba3 = rfe_sfs_pipeline.predict_proba(X_test)[:, 1] 

metrics3 = {}

# Метрики для классификации
metrics3["accuracy"] = accuracy_score(y_test, predictions_sfs)
metrics3["precision"] = precision_score(y_test, predictions_sfs)
metrics3["recall"] = recall_score(y_test, predictions_sfs)
metrics3["f1"] = f1_score(y_test, predictions_sfs)
metrics3["roc_auc"] = roc_auc_score(y_test, y_pred_proba3)

print(metrics3)

{'accuracy': 0.7857142857142857, 'precision': 0.9285714285714286, 'recall': 0.7222222222222222, 'f1': 0.8125, 'roc_auc': np.float64(0.9472222222222222)}


In [32]:
signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)
params_dict = rfe_sfs_pipeline.get_params()

with mlflow.start_run(run_name='filtered_features', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(rfe_sfs_pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_metrics(metrics3)
    mlflow.log_artifact('index.txt')
    mlflow.log_artifact('column_new.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/12/11 00:07:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run filtered_features at: http://127.0.0.1:5000/#/experiments/2/runs/d534ebdda24d4a1d851d19686df2ad39.
2024/12/11 00:07:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.


In [24]:
import optuna

def objective(trial):
    #
    n_estimators = trial.suggest_int('n_estimators', 10, 900)
    max_depth = trial.suggest_int('max_depth', 1, 120)
    max_features = trial.suggest_float('max_features', 0.1, 1.0)
    opt_pipeline  = Pipeline(steps=[
        ('preprocessor', preprocessor_sklearn),
        ('model', RandomForestClassifier(n_estimators=n_estimators, 
                                       max_depth=max_depth, 
                                       max_features=max_features, 
                                       random_state=40))
    ])
    opt_pipeline.fit(X_train, y_train)
    preds = opt_pipeline.predict(X_test)
    score = f1_score(y_test, preds, average='weighted')
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params) 

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-12-17 00:03:56,955] A new study created in memory with name: no-name-aed27cef-e308-44c2-8e4b-721c8b4ae4fc
[I 2024-12-17 00:03:57,053] Trial 0 finished with value: 0.8594346829640946 and parameters: {'n_estimators': 56, 'max_depth': 45, 'max_features': 0.44114685446358626}. Best is trial 0 with value: 0.8594346829640946.
[I 2024-12-17 00:03:57,237] Trial 1 finished with value: 0.893877551020408 and parameters: {'n_estimators': 183, 'max_depth': 49, 'max_features': 0.37997222517951745}. Best is trial 1 with value: 0.893877551020408.
[I 2024-12-17 00:03:57,728] Trial 2 finished with value: 0.8594346829640946 and parameters: {'n_estimators': 598, 'max_depth': 72, 'max_features': 0.6758861632707365}. Best is trial 1 with value: 0.893877551020408.
[I 2024-12-17 00:03:58,109] Trial 3 finished with value: 0.8594346829640946 and parameters: {'n_estimators': 450, 'max_depth': 30, 'max_features': 0.2940995692956586}. Best is trial 1 with v

Number of finished trials: 30
Best trial: {'n_estimators': 183, 'max_depth': 49, 'max_features': 0.37997222517951745}


In [41]:
opt_pipeline  = Pipeline(steps=[
        ('preprocessor', preprocessor_sklearn),
        ('model', RandomForestClassifier(n_estimators=183, 
                                       max_depth=49, 
                                       max_features=00.37997222517951745, 
                                       random_state=40))
    ])
opt_pipeline.fit(X_train, y_train)

In [42]:
predictions_opt = opt_pipeline.predict(X_test)
y_pred_proba4 = opt_pipeline.predict_proba(X_test)[:, 1] 

metrics4 = {}

# Метрики для классификации
metrics4["accuracy"] = accuracy_score(y_test, predictions_opt)
metrics4["precision"] = precision_score(y_test, predictions_opt)
metrics4["recall"] = recall_score(y_test, predictions_opt)
metrics4["f1"] = f1_score(y_test, predictions_opt)
metrics4["roc_auc"] = roc_auc_score(y_test, y_pred_proba4)

print(metrics4)

{'accuracy': 0.8928571428571429, 'precision': 0.9411764705882353, 'recall': 0.8888888888888888, 'f1': 0.9142857142857143, 'roc_auc': np.float64(0.9388888888888889)}


In [47]:
from mlflow.models import infer_signature

signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)
params_dict = opt_pipeline.get_params()
experiment_name = "heart diseases"
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id


with mlflow.start_run(run_name='best_model', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(opt_pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_metrics(metrics4)
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

Downloading artifacts: 100%|█████████████████████| 7/7 [00:00<00:00, 154.00it/s]
2024/12/17 16:33:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run best_model at: http://127.0.0.1:5000/#/experiments/2/runs/ae6886d1298c48f2bcdf142133514bec.
2024/12/17 16:33:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.


In [44]:
X = data.drop('target', axis=1)
y = data['target']

In [45]:
opt_pipeline.fit(X, y)

In [46]:
with open('columns_for_training_final_model.txt', 'w') as f:
    for column in column_names:
        f.write(column + '\n')

signature =  infer_signature(model_input = X.head(5))
input_example = X.head(5)
params_dict = opt_pipeline.get_params()

with mlflow.start_run(run_name='final_model_2', experiment_id=experiment_id) as run:
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(opt_pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             )
    mlflow.log_artifact('../requirements.txt')
    mlflow.log_artifact('columns_for_training_final_model.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

Downloading artifacts: 100%|█████████████████████| 7/7 [00:00<00:00, 206.32it/s]
2024/12/17 16:31:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run final_model_2 at: http://127.0.0.1:5000/#/experiments/2/runs/2877ebce28504b268555d41f58b7fb57.
2024/12/17 16:31:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.
