# Convert Data to JSON Files

In [2]:
# import requests, zipfile, io
# from pathlib import Path

# zip_url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip"
# out_dir = Path("fiqa")
# out_dir.mkdir(parents=True, exist_ok=True)

# # Download the ZIP into memory (fine for small/medium files)
# resp = requests.get(zip_url, timeout=120)
# resp.raise_for_status()

# # Extract all files
# with zipfile.ZipFile(io.BytesIO(resp.content)) as z:
#     z.extractall(out_dir)

# print(f"Extracted to: {out_dir.resolve()}")


In [None]:
import json

def jsonl_to_json(jsonl_file, json_file):
    """Convert a JSON Lines file into one pretty-printed JSON array file.

    Args:
        jsonl_file: Path of the input file; every non-blank line must hold one
            JSON value.
        json_file: Path of the output file. Missing parent directories are
            created before writing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the input cannot be read or the output cannot be written.
    """
    import os  # local import so this cell does not rely on a later cell's imports

    records = []
    with open(jsonl_file, 'r', encoding='utf-8') as src:
        for line in src:
            # Blank or whitespace-only lines (e.g. a trailing newline at EOF)
            # are not records and would make json.loads raise.
            if line.strip():
                records.append(json.loads(line))

    # open(..., 'w') does not create directories, so make sure the target exists.
    out_dir = os.path.dirname(json_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(json_file, 'w', encoding='utf-8') as dst:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(records, dst, indent=2, ensure_ascii=False)


# Forward slashes resolve on Windows, Linux and macOS alike; a backslash is
# only a path separator on Windows.
jsonl_to_json("fiqa/corpus.jsonl", "data/corpus.json")
jsonl_to_json("fiqa/queries.jsonl", "data/queries.json")

In [None]:
import pandas as pd
import os

In [None]:
def load_corpus(path):
    """Read a corpus file and return it keyed by document id.

    Args:
        path: String path to a ``.json`` or ``.jsonl`` file (extension matched
            case-insensitively). The content may be a mapping of id -> record
            or a sequence of records that each carry an ``"_id"`` key.

    Returns:
        dict mapping ``str(id)`` to ``{"title", "text", "metadata"}``; a missing
        title/text becomes ``""`` and a missing or falsy metadata becomes ``{}``.

    Raises:
        ValueError: If the extension is neither ``.json`` nor ``.jsonl``.
    """
    suffix = os.path.splitext(path.lower())[1]
    with open(path, "r", encoding="utf-8") as handle:
        if suffix == ".json":
            records = json.load(handle)
        elif suffix == ".jsonl":
            records = []
            for row in handle:
                # whitespace-only lines carry no record
                if row.strip():
                    records.append(json.loads(row))
        else:
            raise ValueError(f"Unsupported file extension: {suffix}")

    def _normalise(entry):
        # Keep only the three known fields, filling in defaults.
        return {
            "title": entry.get("title", ""),
            "text": entry.get("text", ""),
            "metadata": entry.get("metadata", {}) or {},
        }

    if isinstance(records, dict):
        pairs = ((key, entry) for key, entry in records.items())
    else:
        pairs = ((entry["_id"], entry) for entry in records)

    return {str(doc_id): _normalise(entry) for doc_id, entry in pairs}


In [5]:
def load_queries(path):
    """Read a queries file and return a mapping of query id to query text.

    Args:
        path: String path to a ``.json`` or ``.jsonl`` file (extension matched
            case-insensitively). The content may be a mapping of id -> text,
            a mapping of id -> record, or a sequence of records that each carry
            an ``"_id"`` key.

    Returns:
        dict mapping ``str(id)`` to the query text; a record without a
        ``"text"`` field yields ``""``.

    Raises:
        ValueError: If the extension is neither ``.json`` nor ``.jsonl``.
    """
    suffix = os.path.splitext(path.lower())[1]
    with open(path, "r", encoding="utf-8") as handle:
        if suffix == ".json":
            payload = json.load(handle)
        elif suffix == ".jsonl":
            # filter(str.strip, ...) drops whitespace-only lines
            payload = list(map(json.loads, filter(str.strip, handle)))
        else:
            raise ValueError(f"Unsupported file extension: {suffix}")

    if not isinstance(payload, dict):
        return {str(item["_id"]): item.get("text", "") for item in payload}

    return {
        str(key): (value if isinstance(value, str) else value.get("text", ""))
        for key, value in payload.items()
    }


In [6]:
def load_qrels_tsv(path):
    """Load relevance judgements from a tab-separated file with a header row.

    Column headers are matched case-insensitively after stripping whitespace.
    Accepted names are ``query-id``/``qid``/``query_id`` for the query,
    ``corpus-id``/``docid``/``doc-id``/``doc_id`` for the document and
    ``score``/``rel``/``relevance`` for the judgement.

    Args:
        path: Location of the TSV file (anything ``pd.read_csv`` accepts).

    Returns:
        dict of ``{query_id: {doc_id: score}}`` with string ids and int scores.
        If a (query, doc) pair occurs more than once, the last row wins.

    Raises:
        ValueError: If a required column is missing, or a score is not an
            integer literal.
    """
    # dtype=str keeps ids exactly as written (no int coercion, no lost zeros).
    df = pd.read_csv(path, sep="\t", dtype=str, header=0)
    cols = {c.lower().strip(): c for c in df.columns}

    def pick(*cands):
        for c in cands:
            if c in cols:
                return cols[c]
        raise ValueError(
            f"Missing columns in {path}: expected one of {list(cands)}, "
            f"found {list(df.columns)}"
        )

    qcol = pick("query-id", "qid", "query_id")
    dcol = pick("corpus-id", "docid", "doc-id", "doc_id")
    scol = pick("score", "rel", "relevance")

    scores = df[scol].astype(int)

    qrels = {}
    # Iterating the columns in lockstep avoids building a Series per row,
    # which is what makes DataFrame.iterrows() slow.
    for qid, did, score in zip(df[qcol], df[dcol], scores):
        qrels.setdefault(str(qid), {})[str(did)] = int(score)
    return qrels


In [7]:
def validate_split(name, corpus, queries, qrels):
    """Print a consistency report for one qrels split.

    Args:
        name: Label printed in square brackets at the start of the report.
        corpus: Mapping whose keys are the known document ids.
        queries: Mapping whose keys are the known query ids.
        qrels: Mapping of query id -> mapping of document id -> score.

    The report lists the number of judged queries and (query, doc) pairs,
    then how many judged query ids / document ids are missing from
    ``queries`` / ``corpus``, each followed by up to five example ids when
    the count is non-zero. Nothing is returned.
    """
    known_docs = set(corpus.keys())
    known_queries = set(queries.keys())
    judged_queries = set(qrels.keys())

    judged_docs = set()
    pair_total = 0
    for doc_scores in qrels.values():
        pair_total += len(doc_scores)
        for doc_id in doc_scores:
            judged_docs.add(doc_id)

    stray_queries = judged_queries - known_queries
    stray_docs = judged_docs - known_docs

    print(f"[{name}] queries in qrels: {len(judged_queries)} | pairs: {pair_total}")
    print(f"  orphan queries: {len(stray_queries)}")
    print(f"  orphan docs:    {len(stray_docs)}")
    # Sample lines come after both counts: queries first, then documents.
    for strays in (stray_queries, stray_docs):
        if strays:
            print("   e.g.", list(strays)[:5])


In [10]:
# Load the converted corpus/queries and the three relevance-judgement splits.
corpus = load_corpus("data/corpus.json")
queries = load_queries("data/queries.json")
qrels_train = load_qrels_tsv("fiqa/qrels/train.tsv")
qrels_dev = load_qrels_tsv("fiqa/qrels/dev.tsv")
qrels_test = load_qrels_tsv("fiqa/qrels/test.tsv")

print(f"Corpus size:  {len(corpus)}")
print(f"Queries size: {len(queries)}")

# Report, per split, how many judged query/doc ids are absent from the loaded data.
validate_split("train", corpus, queries, qrels_train)
validate_split("dev", corpus, queries, qrels_dev)
validate_split("test", corpus, queries, qrels_test)


Corpus size:  57638
Queries size: 6648
[train] queries in qrels: 5500 | pairs: 14166
  orphan queries: 0
  orphan docs:    0
[dev] queries in qrels: 500 | pairs: 1238
  orphan queries: 0
  orphan docs:    0
[test] queries in qrels: 648 | pairs: 1706
  orphan queries: 0
  orphan docs:    0


In [21]:
def merge_qrels(*splits):
    """Merge several qrels mappings into one, keeping the highest score per pair.

    Args:
        *splits: Any number of ``{query_id: {doc_id: score}}`` mappings.

    Returns:
        A new ``{query_id: {doc_id: score}}`` dict. The input mappings are not
        modified. When the same (query, doc) pair appears in more than one
        split, the largest score is kept.
    """
    merged = {}
    for qrels in splits:
        for qid, docs in qrels.items():
            dst = merged.setdefault(qid, {})
            for did, sc in docs.items():
                # Compare only against a score that was actually seen; using 0
                # as the baseline would silently lift negative judgements to 0.
                if did not in dst or sc > dst[did]:
                    dst[did] = sc
    return merged

# Combine the train, dev and test judgements into one mapping and report its size.
qrels_all = merge_qrels(qrels_train, qrels_dev, qrels_test)
print(f"Merged qrels: {len(qrels_all)} queries, {sum(len(v) for v in qrels_all.values())} pairs")


Merged qrels: 6648 queries, 17110 pairs


In [22]:
# Flatten {query_id: {doc_id: score}} into one row per judged pair.
# Passing the nested dict straight to pd.DataFrame would instead build a wide,
# mostly-NaN matrix (one column per query, doc ids only in the index).
# Column names mirror the headers accepted by load_qrels_tsv.
qrels_df = pd.DataFrame(
    [
        {"query-id": qid, "corpus-id": did, "score": score}
        for qid, docs in qrels_all.items()
        for did, score in docs.items()
    ],
    columns=["query-id", "corpus-id", "score"],
)

In [24]:
# Persist the merged judgements as CSV.
# NOTE(review): index=False omits the row labels from the file, so any
# information held only in qrels_df's index is not written — confirm that
# every id needed downstream is stored in a regular column.
qrels_df.to_csv("data/qrels_all.csv", index=False)

In [3]:
# Read the converted corpus back as plain parsed JSON (no normalisation applied).
with open("data/corpus.json", "r", encoding="utf-8") as f:
    documents_raw = json.load(f)

In [5]:
# Helper to load a TSV file into a pandas DataFrame
import pandas as pd
def load_tsv_files(tsv_file):
    """Read a tab-separated file and return its contents as a DataFrame.

    Args:
        tsv_file: Path (or other source accepted by ``pd.read_csv``) of the
            tab-separated file; the first row is used as the header.

    Returns:
        pandas.DataFrame with column dtypes inferred by pandas.
    """
    return pd.read_csv(tsv_file, sep="\t")


In [6]:
# Forward slashes resolve on Windows, Linux and macOS alike; a backslash is
# only a path separator on Windows.
dev_df = load_tsv_files(tsv_file="fiqa/qrels/dev.tsv")
test_df = load_tsv_files(tsv_file="fiqa/qrels/test.tsv")
train_df = load_tsv_files(tsv_file="fiqa/qrels/train.tsv")


In [7]:
# Row counts per split, displayed in the order (dev, test, train).
len(dev_df), len(test_df), len(train_df)

(1238, 1706, 14166)

In [8]:
# Stack the train, dev and test frames into one table. ignore_index=True
# discards the per-split row labels and renumbers the rows from 0.
# Rows are appended as-is; repeated pairs are not de-duplicated here.
qrels = pd.concat([train_df, dev_df, test_df], ignore_index=True)

In [9]:
# Read the converted queries back as plain parsed JSON.
with open("data/queries.json", "r", encoding="utf-8") as f:
    queries_doc = json.load(f)
  