In [28]:
import numpy as np
import pandas as pd
import mlflow
import os

In [29]:
# Where the data to score lives and where the predictions are written
# (both are relative paths).
input_file = "../../data/valid_data.csv"
output_file = "output/rf_predict.parquet"

# MLflow run to score with; the RUN_ID environment variable, when set,
# takes precedence over the default below.
RUN_ID = os.environ.get("RUN_ID", "7f2ba2ea16c449a88f0e61b087f13fc4")

In [30]:
def read_df(filename: str) -> pd.DataFrame:
    """Load a CSV file and decode its ``vector`` column.

    The first CSV column is used as the index. Every entry of the
    ``vector`` column is passed through ``convert`` and the column is
    replaced with the converted values.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The loaded frame with the converted ``vector`` column.
    """
    frame = pd.read_csv(filename, index_col=0)
    parsed_vectors = frame['vector'].apply(convert)
    frame['vector'] = parsed_vectors
    return frame

def convert(item):
    """Parse the text form of a 1-D numeric array into a ``numpy`` array.

    The expected input is a whitespace-separated list of numbers, normally
    wrapped in square brackets (for example ``"[0.1 0.2 0.3]"``).

    Parameters
    ----------
    item : str
        Text to parse. Leading and trailing whitespace is ignored.

    Returns
    -------
    numpy.ndarray
        Array holding the parsed numbers.
    """
    text = item.strip()  # drop whitespace on both ends
    # Peel off the brackets only when they are really there: slicing
    # unconditionally would silently eat the first and last character of an
    # unbracketed value such as "1 2 3".
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]
    # A space separator makes numpy treat any run of whitespace as a delimiter.
    return np.fromstring(text, sep=' ')

In [31]:
def load_model(run_id, artifact_root='s3://mlflow-artifacts-remote-rollan/2'):
    """Load the model logged under ``run_id`` through ``mlflow.pyfunc``.

    Parameters
    ----------
    run_id : str
        Identifier of the MLflow run whose ``model`` artifact is loaded.
    artifact_root : str, optional
        Location that contains the per-run artifact folders. The default is
        the location this function has always used, so existing callers are
        unaffected; pass another value to load from a different store.

    Returns
    -------
    The object returned by ``mlflow.pyfunc.load_model``.

    Raises
    ------
    ValueError
        If ``run_id`` is empty (e.g. ``RUN_ID`` exported as an empty string),
        which would otherwise produce a malformed artifact URI.
    """
    if not run_id:
        raise ValueError('run_id must be a non-empty MLflow run id')
    # Tolerate a trailing slash on the root so the URI never contains "//".
    logged_model = f"{artifact_root.rstrip('/')}/{run_id}/artifacts/model"
    return mlflow.pyfunc.load_model(logged_model)

def apply_model(input_file, run_id, output_file):
    """Score a CSV file with a logged model and save the result as parquet.

    Steps: read ``input_file`` with ``read_df``, stack its ``vector`` column
    into one array, load the model for ``run_id`` with ``load_model``, and
    predict. The predictions are stored in a ``predict`` column, ``run_id``
    is stored in a ``version`` column, and the frame is written to
    ``output_file`` without its index.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to score.
    run_id : str
        MLflow run id of the model to use; also recorded in the output.
    output_file : str
        Destination of the parquet file.
    """
    scored = read_df(input_file)
    features = np.stack(scored['vector'])

    predictions = load_model(run_id).predict(features)

    scored['predict'] = predictions
    scored['version'] = run_id
    scored.to_parquet(output_file, index=False)

In [32]:
# Create the folder that will receive the predictions. Unlike `!mkdir`, this
# does not fail when the folder already exists (so the notebook can be re-run),
# works on every platform, and follows `output_file` if that path changes.
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)


In [33]:
# Score the input file with the selected run and write the predictions.
apply_model(
    input_file=input_file,
    run_id=RUN_ID,
    output_file=output_file,
)