In [1]:
# data from https://github.com/alfredodeza/learn-retrieval-augmented-generation/tree/main
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from valor.enums import TaskType, EvaluationStatus
from valor import Annotation, Datum, Dataset, Model, GroundTruth, Label, Client, Prediction, viz, connect

NUMBER_OF_RECORDS = 50
# Fixed seed so that "Restart Kernel -> Run All" samples the same wines every time;
# later cells slice `df` by position, so an unseeded sample changes every result below.
RANDOM_SEED = 42


# get data
df = pd.read_csv('./top_rated_wines.csv')
# remove any NaN values as they blow up serialization, then draw a reproducible sample
df = df[df['variety'].notna()].sample(NUMBER_OF_RECORDS, random_state=RANDOM_SEED)
assert len(df) == NUMBER_OF_RECORDS

# connect to Valor API
connect("http://0.0.0.0:8000")
client = Client()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


Successfully connected to host at http://0.0.0.0:8000/


## Use Case #1: Evaluating Rankings via Relevant Doc Names

If we know in advance which docs are relevant to our request, then it's easy to calculate our ranking metrics. We just have to pass the relevant docs in our `GroundTruth` object, pass the ordered predictions in our `Prediction` object, and run `evaluate_ranking` to get our metrics.

In [2]:
# Draw three wine names at random from the first ten rows; these act as the
# "relevant docs" for this example.
first_ten_names = df.iloc[:10]['name']
relevant_wines = first_ten_names.sample(3).tolist()
relevant_wines

['Chateau Margaux 2015',
 'Kistler Vineyards Stone Flat Vineyard Chardonnay 2005',
 'Chateau Smith Haut Lafitte (1.5 Liter Futures Pre-Sale) 2019']

In [4]:
dataset = Dataset.create('relevant_wines_dataset')
model = Model.create('relevant_wines_model')

# Ground truth: the wines we consider relevant, attached to a single "wines" datum.
groundtruth_annotation = Annotation(
    task_type=TaskType.RANKING,
    labels=[Label(key="wine_recommender", value='first_recommendation')],
    ranking=relevant_wines,
)
dataset.add_groundtruth(
    GroundTruth(datum=Datum(uid="wines"), annotations=[groundtruth_annotation])
)
dataset.finalize()

# Prediction: pretend the first ten wine names came back, in this order, from a recommender system.
recommended_wines = df.iloc[:10]['name'].tolist()
prediction_annotation = Annotation(
    task_type=TaskType.RANKING,
    labels=[Label(key="wine_recommender", value='first_recommendation')],
    ranking=recommended_wines,
)
model.add_prediction(
    dataset,
    Prediction(datum=Datum(uid="wines"), annotations=[prediction_annotation]),
)
model.finalize_inferences(dataset)

# Request every ranking metric, evaluated at a cutoff of k=3.
requested_metrics = [
    "MRRMetric",
    "PrecisionAtKMetric",
    'RecallAtKMetric',
    'APAtKMetric',
    'ARAtKMetric',
    'mAPAtKMetric',
    'mARAtKMetric',
]
eval_job = model.evaluate_ranking(
    dataset,
    metrics_to_return=requested_metrics,
    k_cutoffs=[3],
)

# Fail loudly if the evaluation has not finished within 30 seconds.
assert eval_job.wait_for_completion(timeout=30) == EvaluationStatus.DONE

eval_job.metrics

[{'type': 'MRRMetric',
  'parameters': {'label_key': 'wine_recommender'},
  'value': 0.25},
 {'type': 'PrecisionAtKMetric',
  'parameters': {'k': 3, 'annotation_id': 2},
  'value': 0.0,
  'label': {'key': 'wine_recommender', 'value': 'first_recommendation'}},
 {'type': 'RecallAtKMetric',
  'parameters': {'k': 3, 'annotation_id': 2},
  'value': 0.0,
  'label': {'key': 'wine_recommender', 'value': 'first_recommendation'}},
 {'type': 'APAtKMetric',
  'parameters': {'k_cutoffs': [3], 'annotation_id': 2},
  'value': 0.0,
  'label': {'key': 'wine_recommender', 'value': 'first_recommendation'}},
 {'type': 'ARAtKMetric',
  'parameters': {'k_cutoffs': [3], 'annotation_id': 2},
  'value': 0.0,
  'label': {'key': 'wine_recommender', 'value': 'first_recommendation'}},
 {'type': 'mAPAtKMetric',
  'parameters': {'k_cutoffs': [3], 'label_key': 'wine_recommender'},
  'value': 0.0},
 {'type': 'mARAtKMetric',
  'parameters': {'k_cutoffs': [3], 'label_key': 'wine_recommender'},
  'value': 0.0}]

Alternatively, say that we don't know all of the docs that are relevant to our request, but we do know that at least two of them are. We can use embeddings to identify other relevant docs (those whose embeddings are sufficiently similar to a known relevant doc), then pass all of those relevant docs into the `ranking` attribute.

In [5]:
def find_similar_embeddings(relevant_embeddings, other_embeddings, similarity_cutoff=.95):
    """Find all embeddings in other_embeddings that are similar to some known relevant embedding.

    Each candidate in ``other_embeddings`` is scored with ``util.cos_sim`` against
    every embedding in ``relevant_embeddings``; the candidate's score is the
    highest of those values.

    Args:
        relevant_embeddings: iterable of embedding vectors known to be relevant.
        other_embeddings: iterable of candidate embedding vectors to screen.
        similarity_cutoff: minimum best score a candidate needs to be kept.

    Returns:
        A list with the positions (within ``other_embeddings``) of the candidates
        whose best score is ``>= similarity_cutoff``. The list is empty when
        either input is empty.
    """
    # Materialize once: the references are re-scanned for every candidate, which
    # would silently exhaust a generator after the first candidate.
    relevant_embeddings = list(relevant_embeddings)
    if not relevant_embeddings:
        # Nothing to compare against, so nothing can be similar (max() of an
        # empty sequence would otherwise raise ValueError).
        return []

    similar_indices = []
    for index, embedding in enumerate(other_embeddings):
        best_similarity = max(
            util.cos_sim(embedding, relevant_embedding).item()
            for relevant_embedding in relevant_embeddings
        )
        if best_similarity >= similarity_cutoff:
            similar_indices.append(index)
    return similar_indices


In [6]:

# Suppose we know for certain that rows 5:9 are relevant to our query, and we want to
# widen the search to other relevant docs by comparing embeddings.
relevant_docs = df['notes'].iloc[5:9].tolist()

encoder = SentenceTransformer('all-MiniLM-L6-v2')
relevant_doc_embeddings = list(map(encoder.encode, relevant_docs))
other_embeddings = list(map(encoder.encode, df['notes']))

# `similar_embeddings` holds row positions in `df`, not the vectors themselves.
similar_embeddings = find_similar_embeddings(
    relevant_embeddings=relevant_doc_embeddings,
    other_embeddings=other_embeddings,
)
df.iloc[similar_embeddings]




Unnamed: 0,name,region,variety,rating,notes
439,Chateau d'Yquem Sauternes (375ML half-bottle) ...,"Sauternes, Bordeaux, France",Collectible,97.0,Discovering Chateau d'Yquem starts with the bo...
1173,Inglenook Rubicon 2002,"Napa Valley, California",Red Wine,96.0,"""This is the best Rubicon ever..."""
916,Domaine Saint Prefert Chateauneuf-du-Pape Coll...,"Chateauneuf-du-Pape, Rhone, France",Red Wine,98.0,"The tete de cuvee of the domaine, made from th..."
217,Bouchard Pere & Fils Chambertin Clos de Beze G...,"Burgundy, France",Red Wine,96.0,95


In [7]:
dataset = Dataset.create('relevant_notes_dataset')
model = Model.create('relevant_notes_model')

# Ground truth: the notes of every row that the embedding search flagged as relevant.
relevant_notes = df.iloc[similar_embeddings]['notes'].tolist()
groundtruth_annotation = Annotation(
    task_type=TaskType.RANKING,
    labels=[Label(key="wine_recommender", value='second_recommendation')],
    ranking=relevant_notes,
)
dataset.add_groundtruth(
    GroundTruth(datum=Datum(uid="wines"), annotations=[groundtruth_annotation])
)
dataset.finalize()

# Prediction: pretend all notes came back, in frame order, from a recommender system.
recommended_notes = df['notes'].tolist()
prediction_annotation = Annotation(
    task_type=TaskType.RANKING,
    labels=[Label(key="wine_recommender", value='second_recommendation')],
    ranking=recommended_notes,
)
model.add_prediction(
    dataset,
    Prediction(datum=Datum(uid="wines"), annotations=[prediction_annotation]),
)
model.finalize_inferences(dataset)

# Request every ranking metric, evaluated at a cutoff of k=10.
requested_metrics = [
    "MRRMetric",
    "PrecisionAtKMetric",
    'RecallAtKMetric',
    'APAtKMetric',
    'ARAtKMetric',
    'mAPAtKMetric',
    'mARAtKMetric',
]
eval_job = model.evaluate_ranking(
    dataset,
    metrics_to_return=requested_metrics,
    k_cutoffs=[10],
)

# Fail loudly if the evaluation has not finished within 30 seconds.
assert eval_job.wait_for_completion(timeout=30) == EvaluationStatus.DONE

eval_job.metrics

[{'type': 'MRRMetric',
  'parameters': {'label_key': 'wine_recommender'},
  'value': 0.16666666666666666},
 {'type': 'PrecisionAtKMetric',
  'parameters': {'k': 10, 'annotation_id': 4},
  'value': 0.4,
  'label': {'key': 'wine_recommender', 'value': 'second_recommendation'}},
 {'type': 'RecallAtKMetric',
  'parameters': {'k': 10, 'annotation_id': 4},
  'value': 1.0,
  'label': {'key': 'wine_recommender', 'value': 'second_recommendation'}},
 {'type': 'APAtKMetric',
  'parameters': {'k_cutoffs': [10], 'annotation_id': 4},
  'value': 0.4,
  'label': {'key': 'wine_recommender', 'value': 'second_recommendation'}},
 {'type': 'ARAtKMetric',
  'parameters': {'k_cutoffs': [10], 'annotation_id': 4},
  'value': 1.0,
  'label': {'key': 'wine_recommender', 'value': 'second_recommendation'}},
 {'type': 'mAPAtKMetric',
  'parameters': {'k_cutoffs': [10], 'label_key': 'wine_recommender'},
  'value': 0.4},
 {'type': 'mARAtKMetric',
  'parameters': {'k_cutoffs': [10], 'label_key': 'wine_recommender'},
 

## Use Case #2: Evaluating Rankings via Embeddings
NOTE: The code below doesn't run yet as the `embedding` attribute of `Annotation` needs work.

### Create groundtruths and predictions

In [None]:
# create groundtruths using documents that we know are relevant to the question "Where is Capella, and why is it a great region for wines?"

# Names for the Valor dataset/model created by this use case (defined here so the
# cell does not depend on variables from outside the notebook).
DATASET_NAME = 'capella_embeddings_dataset'
MODEL_NAME = 'capella_embeddings_model'

encoder = SentenceTransformer('all-MiniLM-L6-v2')

relevant_docs = df['notes'].iloc[1:3].tolist()
# DataFrame.drop returns a new frame rather than modifying `df`, so keep the result:
# these are the remaining docs, i.e. everything except the known-relevant rows.
# `df` itself is left untouched so the cell can be re-run safely.
candidate_docs = df.drop(df.index[1:3])['notes'].tolist()

dataset = Dataset.create(DATASET_NAME)
model = Model.create(MODEL_NAME)

for i, doc in enumerate(relevant_docs):
    dataset.add_groundtruth(
        GroundTruth(
            datum=Datum(uid="wines"),
            annotations=[
                Annotation(
                    task_type=TaskType.RANKING,
                    labels=[Label(key="docs related to Capella", value=f'doc #{i}')],
                    metadata={'content': doc},
                    embedding=encoder.encode(doc).tolist() # TODO: embedding can't handle nested lists at the moment
                )
            ],
        )
    )

dataset.finalize()

# create predictions for all of our other records
embeddings = [encoder.encode(doc) for doc in candidate_docs] # one embedding vector per remaining doc

# add the other docs as predictions; iterate over the notes themselves
# (iterating over a DataFrame directly yields its column names, not its rows)
for i, (doc, embedding) in enumerate(zip(candidate_docs, embeddings)):
    model.add_prediction(
        dataset,
        Prediction(
            datum=Datum(uid="wines"),
            annotations=[
                Annotation(
                    task_type=TaskType.RANKING,
                    labels=[Label(key="docs related to Capella", value=f'doc #{i}')],
                    metadata={'content': doc},
                    embedding=embedding.tolist() # plain list, matching how the groundtruth embeddings are passed
                )
            ],
        )
    )


model.finalize_inferences(dataset)

### Run evaluations

In [None]:
requested_metrics = [
    "MRRMetric",
    "PrecisionAtKMetric",
    'RecallAtKMetric',
    'APAtKMetric',
    'ARAtKMetric',
    'mAPAtKMetric',
    'mARAtKMetric',
]
eval_job = model.evaluate_ranking(
    dataset,
    metrics_to_return=requested_metrics,
    k_cutoffs=[3],
    # vectors have to be 95% similar to the groundtruth vectors to be considered "relevant"
    similarity_cutoff=.95,
)

# Intended behavior inside Valor for this call:
# - compute the distance between each prediction and both groundtruths (averaging the two distances)
# - decide which predictions are "relevant" based on the cutoff
# - compute the IR metrics (NOTE: assumes the annotations were added in the order in which they were recommended)


# Alternative design:
# - the user passes a nested array of embeddings to `ranking` (note: this would be a pretty large array to store in Valor)
