In [1]:
import joblib
import mlflow
import mlflow.data
import pandas as pd
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.models import infer_signature
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Configure MLflow before any run is started

# Tracking server that receives runs logged from this notebook
TRACKING_URI = "http://localhost:5000"

# Experiment that subsequent runs are recorded under
EXPERIMENT_NAME = 'Lazada Reviews Classifications'

mlflow.set_tracking_uri(TRACKING_URI)

# Kept as the cell's last expression so the returned Experiment is displayed
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1735958237403, experiment_id='1', last_update_time=1735958237403, lifecycle_stage='active', name='Lazada Reviews Classifications', tags={}>

In [3]:
# Vectorized features for the train and test splits
x_train_vec, x_test_vec = (
    joblib.load('../data/processed/x_train_vec_1.pkl'),
    joblib.load('../data/processed/x_test_vec_1.pkl'),
)

# Text features for the same two splits (interim, pre-vectorization data)
x_train, x_test = (
    joblib.load('../data/interim/x_train_1.pkl'),
    joblib.load('../data/interim/x_test_1.pkl'),
)

# Targets for the train and test splits
y_train, y_test = (
    joblib.load('../data/processed/y_train_1.pkl'),
    joblib.load('../data/processed/y_test_1.pkl'),
)

# Vectorizer object saved alongside the model artifacts
vectorizer = joblib.load('../models/vectorizer_1.pkl')

In [4]:
# Instantiate the classifier, capping the solver at 100 iterations
logreg = LogisticRegression(max_iter=100)

# Fit on the vectorized training features; fit() returns the estimator,
# so it remains the value shown as this cell's output
logreg.fit(X=x_train_vec, y=y_train)

In [5]:
# Evaluate the model performance

# Predictions on the training split. They stay under the name `y_pred` and
# share x_train's index so they can be joined column-wise with the training
# data when the dataset is logged.
y_pred = pd.Series(logreg.predict(x_train_vec), index = x_train.index)

# Predictions on the test split. The original cell was labelled
# "Predicting test data" but only ever scored the training split.
y_pred_test = logreg.predict(x_test_vec)

# classification_report(..., output_dict=True) returns a dict that includes
# an overall "accuracy" entry next to the per-class scores.
train_metrics = classification_report(y_train, y_pred, output_dict = True)
metrics = classification_report(y_test, y_pred_test, output_dict = True)

# `accuracy` is the headline figure that gets logged, so take it from the
# test split; scoring on the data the model was fitted on overstates it.
train_accuracy = train_metrics["accuracy"]
accuracy = metrics["accuracy"]

: 

In [None]:
# Logging

# Column label given to model predictions inside the logged datasets
PREDICTION_COLUMN = "prediction"


def build_review_dataset(features, targets, predictions) -> PandasDataset:
    """Join features, targets and predictions into one MLflow dataset.

    The prediction Series is renamed before the join so that the
    `predictions` argument refers to an explicit string column label
    instead of the label 0 pandas assigns to an unnamed Series.

    NOTE(review): assumes all three arguments are pandas objects sharing an
    index and that the target Series is named "rating" — confirm.
    """
    frame = pd.concat(
        [features, targets, predictions.rename(PREDICTION_COLUMN)], axis=1
    )
    return mlflow.data.from_pandas(
        frame,
        source = "s3://mlops-lazada/20191002-reviews.csv",
        targets = "rating",
        name = "lazada reviews",
        predictions = PREDICTION_COLUMN,
    )


# Test-split predictions, indexed like x_test so the column-wise join aligns.
# Computed here so the cell does not depend on how `y_pred` was produced.
test_predictions = pd.Series(logreg.predict(x_test_vec), index = x_test.index)

with mlflow.start_run(run_name="Minimum Effort"):
    # One dataset per split: previously the training frame was logged under
    # both the "training" and the "testing" context.
    dataset: PandasDataset = build_review_dataset(x_train, y_train, y_pred)
    test_dataset: PandasDataset = build_review_dataset(x_test, y_test, test_predictions)

    model_params = logreg.get_params()

    mlflow.log_params(model_params)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_input(dataset, "training")
    mlflow.log_input(test_dataset, "testing")
    mlflow.log_artifact("../models/vectorizer_1.pkl", "vectorizer")

    mlflow.sklearn.log_model(
        sk_model=logreg,
        artifact_path="models",
        serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
        registered_model_name="Untouch Logistic Regression",
        # A handful of rows is enough for an input example; passing the
        # whole test matrix stores all of it alongside the model.
        input_example=x_test_vec[:5],
    )

    # Add metadata using tags
    mlflow.set_tags({"dataset_config": "review contents", "experiment_notes": "Vectorized logistic regression on review data"})

  return _dataset_source_registry.resolve(
