# Download and Inspect the Collection

The dataset was created from the Chronicling America collection — over 21 million digitized newspaper pages (1756–1963) curated by the Library of Congress and NEH. They used 39,330 pages (1800–1920), representing 53 US states, to ensure wide geographic and temporal coverage.

Source: https://dl.acm.org/doi/pdf/10.1145/3626772.3657891

GitHub: https://github.com/DataScienceUIBK/ChroniclingAmericaQA?tab=readme-ov-file

In [1]:
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/test.json?download=true" -o test.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/train.json?download=true" -o train.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/dev.json?download=true" -o validation.json

import json

files = ["train.json", "validation.json", "test.json"]

for path in files:
    print(f"\n===== {path} =====")
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Read a few hundred characters to see what kind of JSON it is
            head = f.read(500)
            print("Preview of first 500 characters:\n")
            print(head[:500])
        # Try to load only part of the file
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            print(f"\nLoaded {len(data)} items (list).")
            print("Dictionary keys:", list(data[0].keys()))
            print(json.dumps(data[0], indent=2)[:600])
        elif isinstance(data, dict):
            print("\nTop-level is a dictionary. Keys:", list(data.keys()))
            for k, v in data.items():
                if isinstance(v, list):
                    print(f"Key '{k}' contains a list of {len(v)} items.")
                    if v:
                        print("First item keys:", list(v[0].keys()))
                        print(json.dumps(v[0], indent=2)[:600])
                        break
        else:
            print(f"Unexpected top-level type: {type(data)}")
    except Exception as e:
        print(f"Could not parse {path} as JSON: {e}")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1352  100  1352    0     0   4471      0 --:--:-- --:--:-- --:--:--  4521

 16 71.5M   16 11.6M    0     0  12.6M      0  0:00:05 --:--:--  0:00:05 12.6M
 60 71.5M   60 43.1M    0     0  22.5M      0  0:00:03  0:00:01  0:00:02 31.5M
 89 71.5M   89 63.8M    0     0  21.8M      0  0:00:03  0:00:02  0:00:01 26.1M
100 71.5M  100 71.5M    0     0  23.0M      0  0:00:03  0:00:03 --:--:-- 27.3M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1356  100  1356    0     0   6352      0 --:--:-- --:--:-- --:--:--  6457

  0 1315M    0 12.1M    0     0  17.1M      0  0


===== train.json =====
Preview of first 500 characters:

[
    {
        "query_id": "train_1",
        "question": "Who is the author of the book, \"Horrors of Slavery, or the American Turf in Tripoli\"?",
        "answer": "WILLIAM RAY",
        "org_answer": "WILLIAM RAY",
        "para_id": "New_Hampshire_18070804_1",
        "context": "Aiscellaneous Repository. From the Albany Register, WAR, OR A PROSPECT OF IT, From recent instances of British Outrage. BY: WILLIAM RAY, Author of the contemplated publication, entitled, \u201cHorrors of Slavery, 

Loaded 439302 items (list).
Dictionary keys: ['query_id', 'question', 'answer', 'org_answer', 'para_id', 'context', 'raw_ocr', 'publication_date', 'trans_que', 'trans_ans', 'url']
{
  "query_id": "train_1",
  "question": "Who is the author of the book, \"Horrors of Slavery, or the American Turf in Tripoli\"?",
  "answer": "WILLIAM RAY",
  "org_answer": "WILLIAM RAY",
  "para_id": "New_Hampshire_18070804_1",
  "context": "Aiscellaneous R

# Create the Document Collection

To do that, we create a new json file that contains the 'para_id', 'context', 'raw_ocr', 'publication_date' keys, for all para_id in the collection.

para_id: is the id of a paragraph of a news paper page.

In [2]:
import json
import os

inputs = ["train.json", "validation.json", "test.json"]
output = "document_collection.json"

def load_list_or_empty(path):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        print(f"Skipping {path} because it is missing or empty")
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            return data
        print(f"Skipping {path} because it is not a list at the top level")
        return []
    except json.JSONDecodeError:
        print(f"Skipping {path} because it is not valid JSON")
        return []

def project(recs):
    out = []
    for r in recs:
        out.append({
            "para_id": r.get("para_id", ""),
            "context": r.get("context", ""),
            "raw_ocr": r.get("raw_ocr", ""),
            "publication_date": r.get("publication_date", "")
        })
    return out

all_recs = []
for p in inputs:
    recs = load_list_or_empty(p)
    print(f"Loaded {len(recs)} records from {p}")
    all_recs.extend(project(recs))

# deduplicate by para_id keeping the first one seen
uniq = {}
for rec in all_recs:
    pid = rec.get("para_id", "")
    if pid and pid not in uniq:
        uniq[pid] = rec

result = list(uniq.values())

with open(output, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(result)} records to {output}")
print(json.dumps(result[:3], indent=2))

Loaded 439302 records from train.json
Loaded 24111 records from validation.json
Loaded 24084 records from test.json
Wrote 131921 records to document_collection.json
[
  {
    "para_id": "New_Hampshire_18070804_1",
    "context": "Aiscellaneous Repository. From the Albany Register, WAR, OR A PROSPECT OF IT, From recent instances of British Outrage. BY: WILLIAM RAY, Author of the contemplated publication, entitled, \u201cHorrors of Slavery, or the American Turf in Tripoli,\u201d VOTARIES of Freedom, arm! The British Lion roars! Legions of Valor, take th\u2019 alarm\u2014; Rash, rush to guard our shores! Behold the horrid deed\u2014 Your brethren gasping lie! Beneath a tyrant\u2019s hand they bleed\u2014 They groan\u2014they faint\u2014they die. Veterans of seventy-six, Awake the slumbering sword;\u2014 Hearts of your murderous foes transfix\u2014 'Tis vengeance gives the word. Remember Lexington, And Bunker\u2019s tragic hill; \u201cThe same who spilt your blood thereon, Your blood again

## You should check that the collection you have matches that of the paper!

# Create the Test Queries Data Structure

We keep the first 10.000 queries due to memory errors in the free colab version.

To be comparable, please keep the top 10.000 queries for evaluation.

In [3]:
import json
import re
import unicodedata
import string

input_file = "test.json"
output_file = "test_queries.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

def clean_question(text):
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(rf"[{re.escape(string.punctuation)}]", " ", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)  # collapse multiple spaces
    return text.strip()

# Extract and clean
queries = [
    {
        "query_id": item.get("query_id", ""),
        "question": clean_question(item.get("question", "")),
    }
    for item in data
]

# Sort by query_id (assuming numeric)
queries = sorted(queries, key=lambda x: int(x["query_id"]) if str(x["query_id"]).isdigit() else x["query_id"])

# Keep only the first 10,000
queries = queries[:10000]

# Save new JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(queries, f, ensure_ascii=False, indent=2)

print(f"Saved {len(queries)} entries to {output_file}")
print(json.dumps(queries[:3], indent=2))

Saved 10000 entries to test_queries.json
[
  {
    "query_id": "test_1",
    "question": "How many lots did Thomas Peirce have"
  },
  {
    "query_id": "test_10",
    "question": "Who gave Hamilton the substance of what he had proposed on the part of General Hamilton"
  },
  {
    "query_id": "test_100",
    "question": "Who informs his FRIENDS and the PUBLIC that he has taken that justly celebrated INN in this city"
  }
]


# Create the Qrels for the test set

In [4]:
input_file = "test.json"
qrels_file = "test_qrels.json"
answers_file = "test_query_answers.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build the qrels file: query_id, iteration=0, para_id, relevance=1
qrels = [
    {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1
    }
    for item in data
]

# Build the query_answers file: same plus answer and org_answer
query_answers = [
    {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1,
        "answer": item.get("answer", ""),
        "org_answer": item.get("org_answer", "")
    }
    for item in data
]

# Save both files
with open(qrels_file, "w", encoding="utf-8") as f:
    json.dump(qrels, f, ensure_ascii=False, indent=2)

with open(answers_file, "w", encoding="utf-8") as f:
    json.dump(query_answers, f, ensure_ascii=False, indent=2)

print(f"Saved {len(qrels)} entries to {qrels_file}")
print(f"Saved {len(query_answers)} entries to {answers_file}")
print("Sample qrels entry:", qrels[0])
print("Sample query_answers entry:", query_answers[0])

Saved 24084 entries to test_qrels.json
Saved 24084 entries to test_query_answers.json
Sample qrels entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1}
Sample query_answers entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1, 'answer': '183', 'org_answer': '183'}


# Retrieval - Good Luck!

# Get the Index of the collection and store the document as a dataframe
inverted index


In [51]:
import os
import pyterrier as pt
import json
import pandas as pd 
import shutil

#path index folder 
folder_name = "Index"
index_file_name = "terrier_inverted_index"
index_path = os.path.abspath(os.path.join(folder_name, index_file_name))

#init pyTer
if not pt.java.started():
    pt.java.init()

#make a data frame from all the documents
#build Data frame and clean the row context field 
def  get_dataFrame():
    json_path = 'document_collection.json'
    parquet_path = 'documents.parquet'
    if os.path.exists(parquet_path):
        df_documents = pd.read_parquet(parquet_path)
    else:
        with open('document_collection.json', 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
        df_documents = pd.DataFrame(raw_data)
        df_documents = df_documents.rename(columns={"para_id": "docno", "context": "text"})[["docno", "text", "publication_date"]]
        df_documents = df_documents.set_index('docno')
        df_documents.to_parquet(parquet_path)
    return df_documents
    

def get_index(df):
    #check if the index exsists
    properties_file = os.path.join(index_path, "data.properties")
    if os.path.exists(properties_file):
        return pt.IndexFactory.of(index_path)
    print("Index is not found, creating a new Index")
    if not os.path.exists(index_path):
        os.makedirs(index_path, exist_ok=True)
   
    
    # Build the index
    # Build the index using the updated IterDictIndexer signature
    # Key parameters now are: meta, text_attrs, meta_reverse, pretokenised, fields, threads
    indexer = pt.IterDictIndexer(
        index_path,
        meta={"docno": 40},            # store docno as metadata (up to 20 characters)
        text_attrs=["text"],           # which field(s) contain the text
        meta_reverse=["docno"],        # enable reverse lookup on docno
        pretokenised=False,
        fields=False,
        threads=1,
    )
    index_ref = indexer.index(df.to_dict(orient="records"))
    return pt.IndexFactory.of(index_ref)
    
df_documents = get_dataFrame()
index = get_index(df_documents)

# Print a simple summary
print("Index location:", index_path)
print("Indexed documents:", index.getCollectionStatistics().getNumberOfDocuments())



Index location: C:\Users\user\Desktop\school\3\Bicoca\InformationRetrival\Project\Index\terrier_inverted_index
Indexed documents: 131921


# stats of the indexing


In [52]:
# Retrieve collection statistics
stats = index.getCollectionStatistics()

print("Terrier Collection Statistics")
print("--------------------------------")
print(f"Indexed documents:        {stats.getNumberOfDocuments()}")
print(f"Unique terms (vocabulary): {stats.getNumberOfUniqueTerms()}")
print(f"Total tokens:             {stats.getNumberOfTokens()}")
print(f"Average document length:  {stats.getAverageDocumentLength():.2f}")

Terrier Collection Statistics
--------------------------------
Indexed documents:        131921
Unique terms (vocabulary): 236646
Total tokens:             15575099
Average document length:  118.06


# querry processing with NER to obtain relevant terms
We use SpaCy 

In [53]:
#download SpaCy
!pip install spacy -q
!python -m spacy download en_core_web_sm -q

[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [54]:
import spacy
import math
nlp = spacy.load("en_core_web_sm", disable=["parser"])

def sortByIDF(tokens_list, index ): #sort a list by the IDF score of the tockens, return a list of tupples by this format: (token , idf score of tokens)
    tokens_with_scores = [(token, get_word_importance(token, index)) for token in tokens_list  ]
    tokens_with_scores.sort(reverse = True, key = lambda tupple: tupple[1])
    tokens_with_scores= [t_s for t_s in tokens_with_scores if t_s[1] > 0]
    return tokens_with_scores

    

#return the IDF score of the word, 0 if didnt find
def get_word_importance(word, index):
    lex = index.getLexicon()

    #Stats of the word from the index
    lex_entry =  lex.getLexiconEntry(word.lower())
    if lex_entry is None : 
        return 0
    #calculating IDF
    
    N = index.getCollectionStatistics().getNumberOfDocuments()
    n = lex_entry.getDocumentFrequency()
    return math.log2(N / n)
    
   
#return a tuple of all the tokens of the query question and all the enteties 
def get_query_tokens_and_enteties(question, index):
    
    doc = nlp(question) # tokkanization of the query, Stop words removal
    entities = list(set([ent.text for ent in doc.ents]))
   
    important_terms = [
        token.lemma_ for token in doc 
        if not token.is_stop and not token.is_punct and not token.i in entities and token.pos_ in ["NOUN", "VERB", "PROPN", "ADJ"]
    ]
    tokens_with_scores = sortByIDF(important_terms,index)
    return (tokens_with_scores, entities)
    


"""
@param: recive a query question 
reconize enteties, removing stop words sorting by idf score. 
@return: 2 lists, the first is a list of enteties, the second is a list of tockens sorted by the idf score. 
"""
def process_query(query_question,index):
    if not index:
        try:
            index = get_index(get_dataFrame())
        except:
            print("error while creating index")
            return
    tockens_and_enteties = get_query_tokens_and_enteties(query_question, index)
    return tockens_and_enteties
    

In [55]:
#checking query process 
query_question = "Who informs his FRIENDS and the PUBLIC that he has taken that justly celebrated INN in this city"
process_query(query_question, index)

([('INN', 8.063870878041314),
  ('inform', 4.512710559843624),
  ('friend', 3.361856287964306),
  ('take', 2.769417891973364),
  ('PUBLIC', 2.605503782633197)],
 ['FRIENDS', 'INN'])

In [56]:
# First stage retriver with BM25 
bm25 = pt.terrier.Retriever(index, wmodel="BM25")


In [57]:
#testing the bm25 engine 
query = "Europe"
results = bm25.search(query)

#show first 10 resaults
print(results.head(10))


  qid   docid                     docno  rank      score   query
0   1  122024      New_York_19190610_27     0  10.233243  Europe
1   1   61883         Texas_18771215_13     1   9.759719  Europe
2   1    9637      Maryland_18401110_20     2   9.730148  Europe
3   1  110750      Nebraska_19020225_26     3   9.715430  Europe
4   1   36539  Rhode_Island_18511030_25     4   9.642500  Europe
5   1   47808      Illinois_18680907_40     5   9.578485  Europe
6   1  100207         Kansas_18970902_5     6   9.570657  Europe
7   1   65964     Tennessee_18740915_14     7   9.556417  Europe
8   1   29371        Hawaii_18520522_33     8   9.509314  Europe
9   1   38017     Tennessee_18590726_13     9   9.499877  Europe


In [58]:
#gettting the text of the document by its "docno"
def get_text_by_docno(docno):
    return df_documents.loc[docno]['text']


# hist word vectors embedding
using hist word embadding to retrive the closest term to the query term for each decade from 1800 to 1920. 


In [71]:
#changing the root directory to hist_words
import sys 
def init_HistWord():
    status = True
    path = os.getcwd()
    home_dir = path.split("\\")[-1]
    if( home_dir == "histwords_master"): 
        os.chdir("..") # changing root directory to the project root folder
        print(f"Working directory set to: {os.getcwd()}")
    project_home = os.path.abspath("histwords_master")
    if project_home not in sys.path:
        sys.path.insert(0, project_home)
    home_dir = path.split("\\")[-1]
    if( home_dir != "histwords_master"): 
        os.chdir(project_home)
        print(f"Working directory set to: {os.getcwd()}")
    try:
        from representations.sequentialembedding import SequentialEmbedding
        fiction_embeddings = SequentialEmbedding.load("embeddings/sgns", list(range(1810, 2000, 10)))
    except:
        status = False
    finally:
        if(status): # if expansion passed 
            print("Successfuly loaded HistWord")
        else:
            print("Failed to load HistWord")
            fiction_embeddings= None
        path = os.getcwd()
        home_dir = path.split("\\")[-1]
        if( home_dir == "histwords_master"): 
            os.chdir("..") # changing root directory to the project root folder 
            print(f"Working directory set to: {os.getcwd()}")
        print(type(fiction_embeddings))
        return fiction_embeddings
    
def getSimilar_words(word, fiction_embeddings):
    similar_words  = fiction_embeddings.get_seq_closest(word,1810, 110 )
    return similar_words

In [13]:
path = os.getcwd()
home_dir = path.split("\\")[-1]
if( home_dir == "histwords_master"): 
    os.chdir("..") # changing root directory to the original    


# query with BM25 foreach word in the resault of the histWord similar words


In [61]:
#recive word and use histWord embedding to get similar words from diffrent times and then create the 1oo most relevant documents 
def bm25_simWords(query_word, fiction_embeddings):
    maximum_length = 100
    results = bm25.search(query_word).head(maximum_length)
    similar_words = getSimilar_words(query_word, fiction_embeddings) #get the relevant similar words to the query word
    for word in similar_words:
        temp_bm25_resaults =  bm25.search(word).head(maximum_length)
        results = pd.concat([results,temp_bm25_resaults] , ignore_index= True).drop_duplicates(subset='docid', keep='first')
        results = results.sort_values(by = "score", ascending=False)
        results = results.head(maximum_length)
    results['rank'] = range(len(results))
    return results
#get the 100 top results of bm25 retrivial
def bm25Search(word):
     return bm25.search(word)


# Query process and first stage retrival
The query process outputs a list of terms sorted by importance and a list of enteties.
We will not use histWord embedding on Enteties but just on the terms of the query question.

In [82]:
query_question = ""
fiction_embeddings = init_HistWord()
def firstStageRetrival(query_question, fiction_embeddings ):   
    terms_and_ent = process_query(query_question,index)
    print(terms_and_ent)
    terms = terms_and_ent[0] # terms of the query sorted by IDF score
    entities = terms_and_ent[1] # entities from the query
    #calculating total IDF score
    total_IDF_score = 0
    for t in terms:
        total_IDF_score += t[1]
    bm25df = pd.DataFrame(columns=['qid', 'docid', 'docno', 'rank', 'score', 'query'])
    #get the 100 top resaults of the bm25 on the entities
    maximum_length = 100
    for ent in entities:
        temp_bm25_resaults =  bm25Search(ent).head(maximum_length)
        temp_bm25_resaults['query'] =[[ent]]* len(temp_bm25_resaults)
        print("concate temp df: \n" , temp_bm25_resaults, "\n with the df: \n " , bm25df)
        bm25df = pd.concat([bm25df,temp_bm25_resaults] , ignore_index= True)
        bm25df = bm25df.groupby(['docid', 'docno'], as_index=False).agg({
    'score': 'sum',
    'qid': lambda x: list({q for sublist in x for q in sublist}),
    'query': lambda x: list({q for sublist in x for q in sublist})
})# aggrigate score field in already exsists documents in the results table 
        bm25df = bm25df.sort_values(by = "score", ascending=False)
        bm25df = bm25df.head(maximum_length)
    bm25df['rank'] = range(len(bm25df))
    bm25df['score']  = bm25df['score']  *total_IDF_score# multipling all the score of the entities because we want to give more wight to entities than tokens
    
    #now run bm25 search on the tokens, documents that are already in the table are summed score
    for term in terms:
        temp_result =  bm25_simWords(term[0], fiction_embeddings) # get the top 100 relevant documents with bm25 
        temp_result['score'] = temp_result['score'] * term[1] # weight the column by its IDF score
        temp_result['query'] = [[term[0]]] * len(temp_result)  
        for docid in temp_result['docid']:
            if docid in bm25df['docid']:
                print("found!!!!!!!!!!!!!!!!!" , docid)
        print("concate temp df: \n" , temp_result, "\n with the df: \n " , bm25df)
        bm25df = pd.concat([bm25df, temp_result], ignore_index=True)
        bm25df = bm25df.groupby(['docid', 'docno'], as_index=False).agg({
    'score': 'sum',
    'qid': lambda x: list({q for sublist in x for q in sublist}),
    'query': lambda x: list({q for sublist in x for q in sublist})
}) # aggrigate score field in already exsists documents in the results table 
        bm25df = bm25df.sort_values(by = "score", ascending=False)
        bm25df = bm25df.head(maximum_length)
    bm25df['rank'] = range(len(bm25df))
    return bm25df
    
firstStageRetrival(query_question,fiction_embeddings)
    
    
    


Working directory set to: C:\Users\user\Desktop\school\3\Bicoca\InformationRetrival\Project\histwords_master
Successfuly loaded HistWord
Working directory set to: C:\Users\user\Desktop\school\3\Bicoca\InformationRetrival\Project
<class 'representations.sequentialembedding.SequentialEmbedding'>
([('group', 7.797426419873223), ('crew', 6.948618782731673), ('team', 6.332476107945073), ('old', 2.8159433645977296)], [])
concate temp df: 
    qid   docid                   docno  rank       score    query
0    1  117038     Arizona_19100519_12     0  108.573532  [group]
1    1  124690       Kansas_19160913_1     1  106.076879  [group]
2    1    8127       Hawaii_18401024_4     2  105.427590  [group]
3    1  110486    Michigan_19070621_19     3  104.797309  [group]
4    1  101377     Wyoming_19000119_11     4  104.640914  [group]
..  ..     ...                     ...   ...         ...      ...
89   1   72593    Arkansas_18900305_13    95   81.063656  [group]
90   1  130788   Tennessee_1858011

  bm25df = pd.concat([bm25df, temp_result], ignore_index=True)


concate temp df: 
    qid   docid                            docno  rank       score   query
0    1   16609              Vermont_18450606_13     0  103.051406  [crew]
1    1  120117                Oregon_19181111_6     1   96.603792  [crew]
2    1  124879         Connecticut__19140522_11     2   90.279225  [crew]
3    1   98899                Maine_18970626_25     3   89.759318  [crew]
4    1   45689            Tennessee_18610704_15     4   89.662660  [crew]
..  ..     ...                              ...   ...         ...     ...
93   1   33623             Michigan_18600607_14    95   76.132201  [crew]
94   1   99685          Connecticut__18960522_6    96   76.132201  [crew]
95   1  109596  District_of_Columbia_19070324_4    97   76.066205  [crew]
96   1  105766                  Iowa_19071121_6    98   75.946385  [crew]
99   1  119159              Indiana_19160229_16    99   75.884919  [crew]

[100 rows x 6 columns] 
 with the df: 
       docid                   docno       score  qid

Unnamed: 0,docid,docno,score,qid,query,rank
64,51244,Connecticut__18700519_16,172.360847,[1],"[team, group]",0
199,131137,Maryland_18800911_5,171.508636,[1],"[crew, group]",1
95,69440,Kansas_18770504_16,165.896528,[1],"[crew, group]",2
134,95641,Illinois_18951214_6,108.739318,[1],[team],3
173,117038,Arizona_19100519_12,108.573532,[1],[group],4
...,...,...,...,...,...,...
177,119992,North_Dakota_19110719_2,87.490395,[1],[group],95
53,39680,Michigan_18600221_11,87.407979,[1],[team],96
185,124333,California_19120519_13,87.350065,[1],[crew],97
33,17890,Missouri_18490531_6,87.272605,[1],[group],98
