In [None]:
import mlflow
import mlflow.sklearn
from gensim.models import Word2Vec
import numpy as np

from sklearn.linear_model import LogisticRegression

# Set tracking URI (optional). If used, set it before the run is started so
# the run is created against the intended tracking server.
# mlflow.set_tracking_uri("your_tracking_uri")

# End any run left active by a previous (possibly failed) execution of a cell
if mlflow.active_run():
    mlflow.end_run()

# Toy corpus: one tokenised sentence per entry, with one label per sentence
sentences = [["I", "love", "machine", "learning"],
             ["I", "enjoy", "deep", "learning"],
             ["I", "am", "fascinated", "by", "AI"],
             ["I", "like", "NLP"]]

labels = [1, 1, 1, 0]

# Single source of truth for the Word2Vec hyper-parameter that is also logged,
# so the logged value cannot drift from the value actually used.
MIN_COUNT = 1

# The context manager guarantees the run is closed even if training or
# logging raises, instead of leaking an active run into later cells.
with mlflow.start_run():
    # Train the Word2Vec model on the toy corpus
    w2v_model = Word2Vec(sentences, min_count=MIN_COUNT)

    # Sentence embedding = element-wise mean of the sentence's word vectors
    embeddings = np.array(
        [
            np.mean([w2v_model.wv[word] for word in sentence], axis=0)
            for sentence in sentences
        ]
    )

    # Train the downstream classifier; `model` refers to the classifier only
    model = LogisticRegression()
    model.fit(embeddings, labels)

    # Log the accuracy actually achieved rather than a placeholder constant.
    # NOTE: this is accuracy on the training data (there is no held-out set
    # in this toy example), so it is an optimistic estimate.
    accuracy = model.score(embeddings, labels)

    # Log parameters, metrics, and model
    mlflow.log_params({"min_count": MIN_COUNT})
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(model, "model")

In [1]:
import mlflow

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# One seed shared by the data split and the classifiers so that a
# Restart & Run All reproduces the same metrics.
SEED = 123

iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

EXPERIMENT_NAME = "mlflow-demo"
# mlflow.create_experiment raises if the name is already taken, which made
# this cell fail on every re-run. Reuse the existing experiment when present.
existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is None:
    EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    EXPERIMENT_ID = existing_experiment.experiment_id

# Close any run leaked by an earlier cell once, before the sweep starts;
# inside the loop the `with` block already closes each run it opens.
if mlflow.active_run():
    mlflow.end_run()

# Sweep over tree depths, recording one MLflow run per depth
for idx, depth in enumerate([1, 2, 5, 10, 20]):
    clf = DecisionTreeClassifier(max_depth=depth, random_state=SEED)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Start MLflow
    RUN_NAME = f"run_{idx}"
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME) as run:
        # Retrieve run id
        RUN_ID = run.info.run_id

        # Track parameters
        mlflow.log_param("depth", depth)

        # Track metrics (accuracy on the held-out test split)
        mlflow.log_metric("accuracy", accuracy)

        # Track model
        mlflow.sklearn.log_model(clf, "classifier")



In [None]:
import mlflow
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

def fine_tune_word2vec(model, new_corpus, epochs, evaluate_fn=None):
    """Continue training a Word2Vec model on a new corpus inside an MLflow run.

    The model's vocabulary is extended with the words of ``new_corpus`` and
    the model is then trained on that corpus. Hyper-parameters and training
    metrics are logged to a newly started MLflow run.

    Args:
        model: A gensim ``Word2Vec`` model that already has a vocabulary.
            It is updated in place.
        new_corpus: Re-iterable collection of tokenised sentences (each a
            list of string tokens). It is iterated more than once (vocabulary
            scan plus training), so it must not be a one-shot generator.
        epochs: Number of training passes over ``new_corpus``.
        evaluate_fn: Optional callable taking the trained model and returning
            a number; when given, its result is logged as the ``accuracy``
            metric. When omitted, no accuracy is logged.

    Returns:
        The same ``model`` object, after fine-tuning.
    """
    with mlflow.start_run():
        # Log the parameters
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("vector_size", model.vector_size)
        mlflow.log_param("window", model.window)
        mlflow.log_param("min_count", model.min_count)

        # Extend the vocabulary first; this also refreshes model.corpus_count
        # to the size of new_corpus, which train() needs as total_examples.
        model.build_vocab(new_corpus, update=True)
        model.train(
            new_corpus,
            total_examples=model.corpus_count,
            epochs=epochs,
            compute_loss=True,  # needed for get_latest_training_loss() below
        )

        # Log a metric that was actually measured instead of a hard-coded
        # placeholder value.
        mlflow.log_metric("training_loss", float(model.get_latest_training_loss()))

        # Only report accuracy when the caller supplies a real evaluation.
        if evaluate_fn is not None:
            mlflow.log_metric("accuracy", float(evaluate_fn(model)))

        # The trained model is not stored as an MLflow artifact here; save it
        # with model.save(<path>) and mlflow.log_artifact(<path>) if required.

        return model

# Example usage
PRETRAINED_VECTORS_PATH = "src/Oword2vec.wordvectors"
pretrained_model = KeyedVectors.load(PRETRAINED_VECTORS_PATH)

# Match the embedding width of the loaded vectors instead of hard-coding it,
# otherwise the pretrained vectors could not be copied into the new model.
model = Word2Vec(
    vector_size=pretrained_model.vector_size, window=5, min_count=1, workers=5
)

# Seed the vocabulary with every pretrained word. key_to_index maps words to
# row positions, not frequencies, so it must not be passed as a frequency
# table (the word at index 0 would get count 0 and be dropped by min_count=1).
# A uniform count of 1 keeps every word; real corpus counts are not assumed
# to be available on the loaded vectors.
model.build_vocab_from_freq({word: 1 for word in pretrained_model.index_to_key})

# Copy the pretrained vectors into the model by key, so that each vector
# lands in the row the model itself assigned to that word. This keeps the
# model's internal tables consistent and leaves pretrained_model untouched.
target_rows = [model.wv.key_to_index[word] for word in pretrained_model.index_to_key]
model.wv.vectors[target_rows] = pretrained_model.vectors

new_corpus = [["new", "text", "data"], ["more", "sentences"]]
epochs = 10

fine_tuned_model = fine_tune_word2vec(model, new_corpus, epochs)
