In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

# Load settings from a local .env file (if one exists) into the process environment.
load_dotenv()

# Index os.environ rather than calling os.getenv: a missing setting then fails
# right here with a KeyError naming the variable, instead of becoming None and
# later producing a request to the literal URL "NoneNone".
RETRIEVE_BASE_URL = os.environ["RETRIEVE_BASE_URL"]  # scheme://host:port of the retrieval service
RETRIEVE_ENDPOINT = os.environ["RETRIEVE_ENDPOINT"]  # appended verbatim to the base URL
RETRIEVE_BASE_URL

'http://10.204.100.79:2323'

## Add retrieved facts

In [2]:
import json
import requests
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` is available on
# Series/DataFrame objects; the pipeline uses it to show a progress bar
# while facts are retrieved question by question.
tqdm.pandas()

def get_facts(question):
    """Fetch the facts the retrieval service returns for one question.

    Sends a GET request to ``RETRIEVE_BASE_URL + RETRIEVE_ENDPOINT`` with the
    question as the ``q`` query parameter (10 second timeout).

    Args:
        question: The query text forwarded to the service.

    Returns:
        A list of dicts, one per entry of the response's ``"results"`` array,
        in the order received. Each dict has the keys ``"title"`` (copied
        from the entry's ``"text"`` field), ``"score"`` and ``"id"``.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        KeyError: If the response or one of its entries lacks an expected key.
    """
    url = f"{RETRIEVE_BASE_URL}{RETRIEVE_ENDPOINT}"
    response = requests.get(url, params={"q": question}, timeout=10)
    # Surface 4xx/5xx responses as exceptions instead of parsing an error body.
    response.raise_for_status()

    facts = []
    for hit in response.json()["results"]:
        # The service's "text" field is exposed downstream under the name "title".
        facts.append({"title": hit["text"], "score": hit["score"], "id": hit["id"]})
    return facts

## Add citations

In [3]:
import json

def get_citations(row):
    """Return the 1-based ranks of the retrieved facts that are reference facts.

    Args:
        row: A mapping-like record (for example a DataFrame row) providing
            ``"facts"``, a string holding a Python literal list of dicts that
            each have an ``"id"`` key, and ``"retrieved_facts"``, a list of
            dicts that each have an ``"id"`` key.

    Returns:
        A list of positions, counted from 1, of the entries in
        ``row["retrieved_facts"]`` whose ``"id"`` also occurs in
        ``row["facts"]``. The list is empty when nothing matches.

    Raises:
        ValueError, SyntaxError: If ``row["facts"]`` is not a plain literal.
    """
    # Function-scope import so this definition does not rely on another cell.
    import ast

    # SECURITY: this column is read from a CSV file. It used to be parsed with
    # eval(), which runs any expression stored in the file; literal_eval only
    # accepts literals (lists, dicts, strings, numbers, ...) and rejects the rest.
    reference_ids = {fact["id"] for fact in ast.literal_eval(row["facts"])}

    # Citations are numbered from 1, following the order of the retrieved list.
    return [
        rank
        for rank, hit in enumerate(row["retrieved_facts"], start=1)
        if hit["id"] in reference_ids
    ]

## Pipeline

In [None]:
from datasets import Dataset, DatasetDict


def pipeline(path):
    """Build one dataset split from a CSV file.

    Reads the CSV at ``path``, adds a ``retrieved_facts`` column by calling
    ``get_facts`` on every value of the ``questions`` column (with a progress
    bar), then adds a ``citations`` column by calling ``get_citations`` on
    every row.

    Args:
        path: Location of the CSV file to load.

    Returns:
        A ``Dataset`` created from the enriched frame, without the index.
    """
    enriched = (
        pd.read_csv(path)
        .assign(retrieved_facts=lambda frame: frame["questions"].progress_apply(get_facts))
        # Second assign: get_citations reads the retrieved_facts column added above.
        .assign(citations=lambda frame: frame.apply(get_citations, axis=1))
    )
    return Dataset.from_pandas(enriched, preserve_index=False)


# Run the pipeline once per split, in train -> validation -> test order, and
# collect the results under their split names.
dataset = DatasetDict({
    split_name: pipeline(csv_path)
    for split_name, csv_path in (
        ("train", 'data/30012026_MultiRAG_train_set.csv'),
        ("validation", 'data/30012026_MultiRAG_val_set.csv'),
        ("test", 'data/30012026_MultiRAG_test_set.csv'),
    )
})

In [6]:
# Upload every split in `dataset` to the "weerayut/iq-rag" dataset repository
# on the Hugging Face Hub; private=True asks for the repository to be private.
dataset.push_to_hub("weerayut/iq-rag", private=True)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/911 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/weerayut/iq-rag/commit/f07fcbd346eeb074f3004d5841e36784a4fb1697', commit_message='Upload dataset', commit_description='', oid='f07fcbd346eeb074f3004d5841e36784a4fb1697', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/weerayut/iq-rag', endpoint='https://huggingface.co', repo_type='dataset', repo_id='weerayut/iq-rag'), pr_revision=None, pr_num=None)