In [35]:
import mlflow
import pandas as pd
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pickle

In [23]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("my-fakenews-exp")

<Experiment: artifact_location='./mlruns/1', creation_time=1691048167168, experiment_id='1', last_update_time=1691048167168, lifecycle_stage='active', name='my-fakenews-exp', tags={}>

In [24]:
import pandas as pd
import numpy as np
import io

df = pd.read_csv("../data/data_embeddings.csv",index_col=0)

In [25]:
def convert(item):
    item = item.strip()  # remove spaces at the end
    item = item[1:-1]    # remove `[ ]`
    item = np.fromstring(item, sep=' ')  # convert string to `numpy.array`
    return item

In [26]:
df['vector'] = df['vector'].apply(convert)

In [27]:
df = shuffle(df)

In [28]:
X = df['vector']
y = df['label']

In [29]:
X = np.stack(X)
y = np.stack(y)

In [30]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [32]:
with mlflow.start_run():
    mlflow.set_tag("data_scientist", "Rollan")

    mlflow.log_param("data_path", "../data_embeddings.csv")

    C = 0.03
    mlflow.log_param("regularization", C)

    lr = LogisticRegression(C=C)
    lr.fit(X_train, y_train)
 
    y_pred = lr.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mlflow.log_metric("accuracy", rmse)

    mlflow.log_artifact(local_path="../models/svm_model.bin", artifact_path="models_pickle")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials  #helps to optimize hyperparameters
from hyperopt.pyll import scope

In [15]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_test, label=y_test)

In [16]:
def obj(params):

    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=30,
            evals=[(valid, "test")],
            early_stopping_rounds=20
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
    
    return {"loss":rmse, "status":STATUS_OK}


In [17]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 30, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=obj,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials()
)

[0]	test-rmse:0.43258                                 
[1]	test-rmse:0.37581                                 
[2]	test-rmse:0.32878                                 
[3]	test-rmse:0.28901                                 
[4]	test-rmse:0.25501                                 
[5]	test-rmse:0.22727                                 
[6]	test-rmse:0.20448                                 
[7]	test-rmse:0.18633                                 
[8]	test-rmse:0.17163                                 
[9]	test-rmse:0.15983                                 
[10]	test-rmse:0.15007                                
[11]	test-rmse:0.14211                                
[12]	test-rmse:0.13619                                
[13]	test-rmse:0.13153                                
[14]	test-rmse:0.12790                                
[15]	test-rmse:0.12494                                
[16]	test-rmse:0.12228                                
[17]	test-rmse:0.12032                                
[18]	test-

In [18]:
params = {
    'learning_rate': 0.13998783607276,
    'max_depth': 28,
    'min_child_weight':	0.8037214370553903,
    'objective': 'reg:linear',
    'reg_alpha': 0.007917567259199893,
    'reg_lambda': 0.00982476912100121,
    'seed': 42
}


mlflow.xgboost.autolog()
booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=30,
            evals=[(valid, "test")],
            early_stopping_rounds=20
        )

2023/08/04 16:50:07 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e2d14322d756427ab9d2f2815ba001e0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	test-rmse:0.43360
[1]	test-rmse:0.37748
[2]	test-rmse:0.33018
[3]	test-rmse:0.29054
[4]	test-rmse:0.25783
[5]	test-rmse:0.23093
[6]	test-rmse:0.20910
[7]	test-rmse:0.19174
[8]	test-rmse:0.17592
[9]	test-rmse:0.16320
[10]	test-rmse:0.15308
[11]	test-rmse:0.14518
[12]	test-rmse:0.13902
[13]	test-rmse:0.13421
[14]	test-rmse:0.13060
[15]	test-rmse:0.12776
[16]	test-rmse:0.12554
[17]	test-rmse:0.12383
[18]	test-rmse:0.12253
[19]	test-rmse:0.12156
[20]	test-rmse:0.12072
[21]	test-rmse:0.12013
[22]	test-rmse:0.11963
[23]	test-rmse:0.11925
[24]	test-rmse:0.11897
[25]	test-rmse:0.11872
[26]	test-rmse:0.11856
[27]	test-rmse:0.11840
[28]	test-rmse:0.11827
[29]	test-rmse:0.11817




In [37]:
with mlflow.start_run():

    best_params = {
        'learning_rate': 0.13998783607276,
        'max_depth': 28,
        'min_child_weight':	0.8037214370553903,
        'objective': 'reg:linear',
        'reg_alpha': 0.007917567259199893,
        'reg_lambda': 0.00982476912100121,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=30,
            evals=[(valid, "test")],
            early_stopping_rounds=20)
    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse) 
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	test-rmse:0.43360
[1]	test-rmse:0.37748
[2]	test-rmse:0.33018
[3]	test-rmse:0.29054
[4]	test-rmse:0.25783
[5]	test-rmse:0.23093
[6]	test-rmse:0.20910
[7]	test-rmse:0.19174
[8]	test-rmse:0.17592
[9]	test-rmse:0.16320
[10]	test-rmse:0.15308
[11]	test-rmse:0.14518
[12]	test-rmse:0.13902
[13]	test-rmse:0.13421
[14]	test-rmse:0.13060
[15]	test-rmse:0.12776
[16]	test-rmse:0.12554
[17]	test-rmse:0.12383
[18]	test-rmse:0.12253
[19]	test-rmse:0.12156
[20]	test-rmse:0.12072
[21]	test-rmse:0.12013
[22]	test-rmse:0.11963
[23]	test-rmse:0.11925
[24]	test-rmse:0.11897
[25]	test-rmse:0.11872
[26]	test-rmse:0.11856
[27]	test-rmse:0.11840
[28]	test-rmse:0.11827
[29]	test-rmse:0.11817




In [38]:
logged_model = 'runs:/28fdb5119a224831b63d449bfffbe244/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)



In [39]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 28fdb5119a224831b63d449bfffbe244

In [40]:
xgboost_model = mlflow.xgboost.load_model(logged_model)



In [41]:
xgboost_model

<xgboost.core.Booster at 0x7fecbf6d84f0>

In [43]:
y_pred = xgboost_model.predict(valid)

In [44]:
y_pred[:10]

array([0.9947031 , 0.00539737, 0.00812144, 0.00539737, 0.9946153 ,
       0.00539737, 0.99472636, 0.00539737, 0.99469644, 0.9601756 ],
      dtype=float32)

In [45]:
check the development branch

SyntaxError: invalid syntax (904230320.py, line 1)

### model registry

In [1]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [4]:
client.create_experiment("my_coll")

'2'

In [5]:
client.search_experiments()

[<Experiment: artifact_location='./mlruns/2', creation_time=1691325631447, experiment_id='2', last_update_time=1691325631447, lifecycle_stage='active', name='my_coll', tags={}>,
 <Experiment: artifact_location='./mlruns/1', creation_time=1691048167168, experiment_id='1', last_update_time=1691048167168, lifecycle_stage='active', name='my-fakenews-exp', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1691047498639, experiment_id='0', last_update_time=1691047498639, lifecycle_stage='active', name='Default', tags={}>]