In [23]:
from datasets import load_dataset

# Load a small slice first to test
ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train[:1%]")

print(ds[0])


{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).\n\nHumans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist ideas are found all throughout history, modern anarchism emerged from the Enlightenment. During

In [9]:
import json

fever_unified = []

with open("train.jsonl", "r") as f:
    for line in f:
        item = json.loads(line)
        if item['label'] != "SUPPORTS":
            continue  # skip REFUTES / NOT ENOUGH INFO

        for eg_idx, evidence_group in enumerate(item['evidence']):
            for ev_idx, ev in enumerate(evidence_group):
                article_title = ev[2] if ev[2] is not None else "NA"
                passage_id = f"fever_{item['id']}_{eg_idx}_{ev_idx}"
                entry = {
                    "id": passage_id,
                    "url": f"https://en.wikipedia.org/wiki/{article_title}" if article_title != "NA" else "NA",
                    "title": article_title,
                    "text": item['claim']  # simplest: use claim as text; later can replace with actual evidence text if available
                }
                fever_unified.append(entry)

print(f"Total SUPPORTS entries: {len(fever_unified)}")


Total SUPPORTS entries: 193756


In [10]:
fever_unified

[{'id': 'fever_75397_0_0',
  'url': 'https://en.wikipedia.org/wiki/Nikolaj_Coster-Waldau',
  'title': 'Nikolaj_Coster-Waldau',
  'text': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.'},
 {'id': 'fever_75397_0_1',
  'url': 'https://en.wikipedia.org/wiki/Fox_Broadcasting_Company',
  'title': 'Fox_Broadcasting_Company',
  'text': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.'},
 {'id': 'fever_150448_0_0',
  'url': 'https://en.wikipedia.org/wiki/Roman_Atwood',
  'title': 'Roman_Atwood',
  'text': 'Roman Atwood is a content creator.'},
 {'id': 'fever_150448_1_0',
  'url': 'https://en.wikipedia.org/wiki/Roman_Atwood',
  'title': 'Roman_Atwood',
  'text': 'Roman Atwood is a content creator.'},
 {'id': 'fever_214861_0_0',
  'url': 'https://en.wikipedia.org/wiki/History_of_art',
  'title': 'History_of_art',
  'text': 'History of art includes architecture, dance, sculpture, music, painting, poetry literature, theatre, narrative, film, photography and graphic

In [17]:
import pandas as pd
fev_ds = pd.DataFrame(fever_unified)

In [24]:
ds_wiki = ds.to_pandas()

In [27]:
ds_wiki.head()

Unnamed: 0,id,url,title,text
0,12,https://en.wikipedia.org/wiki/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...
1,39,https://en.wikipedia.org/wiki/Albedo,Albedo,Albedo (; ) is the fraction of sunlight that i...
2,290,https://en.wikipedia.org/wiki/A,A,"A, or a, is the first letter and the first vow..."
3,303,https://en.wikipedia.org/wiki/Alabama,Alabama,Alabama () is a state in the Southeastern regi...
4,305,https://en.wikipedia.org/wiki/Achilles,Achilles,"In Greek mythology, Achilles ( ) or Achilleus ..."


In [31]:
df_ret = pd.concat([ds_wiki, fev_ds])

In [32]:
df_ret

Unnamed: 0,id,url,title,text
0,12,https://en.wikipedia.org/wiki/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...
1,39,https://en.wikipedia.org/wiki/Albedo,Albedo,Albedo (; ) is the fraction of sunlight that i...
2,290,https://en.wikipedia.org/wiki/A,A,"A, or a, is the first letter and the first vow..."
3,303,https://en.wikipedia.org/wiki/Alabama,Alabama,Alabama () is a state in the Southeastern regi...
4,305,https://en.wikipedia.org/wiki/Achilles,Achilles,"In Greek mythology, Achilles ( ) or Achilleus ..."
...,...,...,...,...
193751,fever_13114_0_0,https://en.wikipedia.org/wiki/Gimli_-LRB-Middl...,Gimli_-LRB-Middle-earth-RRB-,J. R. R. Tolkien created Gimli.
193752,fever_13114_1_0,https://en.wikipedia.org/wiki/Gimli_-LRB-Middl...,Gimli_-LRB-Middle-earth-RRB-,J. R. R. Tolkien created Gimli.
193753,fever_152180_0_0,https://en.wikipedia.org/wiki/Susan_Sarandon,Susan_Sarandon,Susan Sarandon is an award winner.
193754,fever_152180_1_0,https://en.wikipedia.org/wiki/Susan_Sarandon,Susan_Sarandon,Susan Sarandon is an award winner.


In [36]:
df_ret_texts_list = df_ret['text'].to_list()

In [37]:
type(df_ret_texts_list)

list