In [1]:
import ast
import os

import pandas as pd
from dotenv import load_dotenv

# Load variables via python-dotenv before the lookups below
# (presumably from a local `.env` file — confirm its location for this project).
load_dotenv()

# Retrieval service settings. os.getenv returns None when a variable is unset,
# in which case the URL assembled in get_facts would contain the text "None".
RETRIEVE_BASE_URL = os.getenv("RETRIEVE_BASE_URL")
RETRIEVE_ENDPOINT = os.getenv("RETRIEVE_ENDPOINT")
# Bare expression: the notebook shows the base URL as this cell's output.
RETRIEVE_BASE_URL

'http://10.204.100.79:2323'

## Add retrieved facts

In [2]:
import json
import requests
from tqdm import tqdm

# Enable tqdm for pandas: this registers `progress_apply`, which `pipeline`
# calls on the "questions" column to show progress over the per-row requests.
tqdm.pandas()

def get_facts(question):
    """Fetch the facts the retrieval service returns for one question.

    Sends a GET request to ``RETRIEVE_BASE_URL + RETRIEVE_ENDPOINT`` with the
    question passed as the ``q`` query parameter (10 second timeout).

    Args:
        question: The query text sent to the service.

    Returns:
        list[dict]: One dict per entry of the response's ``"results"`` list,
        in the same order, with the keys ``"title"`` (taken from the entry's
        ``"text"`` field), ``"score"`` and ``"id"``.

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful response,
        and ``KeyError`` if an expected field is missing from the payload.
    """
    url = f"{RETRIEVE_BASE_URL}{RETRIEVE_ENDPOINT}"
    response = requests.get(url, params={"q": question}, timeout=10)
    response.raise_for_status()

    facts = []
    for hit in response.json()["results"]:
        # The service calls the fact body "text"; it is stored here as "title".
        facts.append({"title": hit["text"], "score": hit["score"], "id": hit["id"]})
    return facts

## Add citations

In [3]:
import json

def get_citations(row):
    """Return the 1-based positions of retrieved facts that are reference facts.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``"facts"``, the reference facts given either as a string holding
            a Python-literal list of dicts or as the already-parsed list, and
            ``"retrieved_facts"``, a list of dicts. Every dict must have an
            ``"id"`` key.

    Returns:
        list[int]: Positions, counted from 1, of the entries in
        ``retrieved_facts`` whose ``"id"`` also occurs among the reference
        facts, in retrieval order.

    Raises:
        ValueError, SyntaxError: If ``"facts"`` is a string that is not a
            valid Python literal.
        KeyError: If a fact dict has no ``"id"`` key.
    """
    raw_facts = row["facts"]
    # The column comes straight from a CSV file, so parse it strictly as a
    # literal: eval() would execute any expression embedded in the data.
    facts = ast.literal_eval(raw_facts) if isinstance(raw_facts, str) else raw_facts
    fact_ids = {d["id"] for d in facts}

    return [
        position
        for position, retrieved in enumerate(row["retrieved_facts"], start=1)
        if retrieved["id"] in fact_ids
    ]

## Pipeline

In [None]:
from datasets import Dataset, DatasetDict


def pipeline(path):
    """Build one dataset split from a CSV file.

    Reads the CSV at ``path``, adds a ``retrieved_facts`` column by calling
    ``get_facts`` on every value of the ``questions`` column (with a progress
    bar), then adds a ``citations`` column by applying ``get_citations`` to
    each row.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A ``Dataset`` created from the enriched frame, without its index.
    """
    frame = pd.read_csv(path)

    # One retrieval request per question; this is the slow step.
    retrieved = frame["questions"].progress_apply(get_facts)
    frame["retrieved_facts"] = retrieved

    # Row-wise, because get_citations needs both "facts" and "retrieved_facts".
    frame["citations"] = frame.apply(get_citations, axis=1)

    return Dataset.from_pandas(frame, preserve_index=False)


# Run the pipeline once per split (train, validation, test — in that order)
# and collect the results under their split names.
dataset = DatasetDict({
    split_name: pipeline(csv_path)
    for split_name, csv_path in (
        ("train", 'data/30012026_MultiRAG_train_set.csv'),
        ("validation", 'data/30012026_MultiRAG_val_set.csv'),
        ("test", 'data/30012026_MultiRAG_test_set.csv'),
    )
})

# Upload all splits to the Hub as a private dataset repository.
dataset.push_to_hub("weerayut/iq-rag", private=True)

 50%|████▉     | 42188/84752 [1:36:51<1:37:46,  7.25it/s]

In [None]:
# Same upload call as at the end of the previous cell; it relies on `dataset`
# already existing in the kernel, so it can be re-run without the pipeline.
dataset.push_to_hub("weerayut/iq-rag", private=True)