# Changing data files to appropriate format to use for QPP

In [None]:
import pandas as pd

queries_df = pd.read_xml("./datasets/kid-friend-en/queries/queries.xml")
queries_df["metadata"] = [dict(row[["category", "description", "narrative"]]) for _,row in queries_df.iterrows()]
queries_df = queries_df.drop(labels=["category","description","narrative"],axis=1)
queries_df = queries_df.drop_duplicates(subset=["number"])
queries_df.to_csv("./datasets/kid-friend-en/queries/queries.tsv", index=False, header=False, sep="\t")

In [None]:
import pandas as pd

with open("./datasets/kid-friend-en/qrels/qrels-relevance.txt", "r") as f_qrel:
    lines= f_qrel.readlines()

qrels = []
for line in lines:
    qid, q0, docid, score = line.split(" ")
    score = int(score)
    qrels.append([qid, q0, docid, score])

qrels_df = pd.DataFrame(qrels, columns=["qid", "q0", "docid", "score"])
qrels_df = qrels_df.drop_duplicates()

for qid in qrels_df["qid"].unique():
    for docid in qrels_df.loc[qrels_df["qid"]==qid]["docid"].unique():
        if len(qrels_df.loc[(qrels_df["qid"]==qid) & (qrels_df["docid"]==docid)]) > 1:
            i = qrels_df.loc[(qrels_df["qid"]==qid) & (qrels_df["docid"]==docid) & (qrels_df["score"]==1)].index  # id qid-docid pair has more than 1 relevance scores, drop row where score = 1 so that docid is completely irrelevant (score = 0) or completely relevant (score  = 2)
            qrels_df = qrels_df.drop(i)

qrels_df.to_csv("./datasets/kid-friend-en/qrels/qrels-relevance.tsv", index=False, header=False, sep="\t")

In [5]:
import json
import pandas as pd


lines = []
with open(r'datasets/kid-friend-en/raw_data/documents.jsonl/documents.jsonl') as f:
    lines = f.read().splitlines()

line_dicts = [json.loads(line) for line in lines]
df_final = pd.DataFrame(line_dicts)

df_final.head(1)

Unnamed: 0,docno,snippet,title,main_content
0,6e421f1539b1457b853712d81be87743,WEBBTS (also Bangtan Boys; Korean: 방탄소년단 Bangt...,BTS (band) - Wikipedia,BTS (also Bangtan Boys; Korean: 방탄소년단 Bangtan ...


In [6]:
df = pd.read_csv("datasets/kid-friend-en/corpus/documents.jsonl", sep="\t")
df.iloc[0]

{"id":"6e421f1539b1457b853712d81be87743","contents":"BTS (band) - Wikipedia. WEBBTS (also Bangtan Boys; Korean: 방탄소년단 Bangtan Sonyeondan) is a South Korean boy group of the third K-pop generation, consisting of seven members, which was founded in 2010 by Big Hit Entertainment and debuted in 2013.. BTS (also Bangtan Boys; Korean: 방탄소년단 Bangtan Sonyeondan) is a South Korean boy group of the third K-pop generation, consisting of seven members, which was founded in 2010 by Big Hit Entertainment and debuted in 2013.[1][2] The name of the band Bangtan Sonyeondan is a combination of 방탄 (\"bulletproof\") and 소년단 (\"scouts\").[3] In July 2017, the acronym \"BTS\" was also given the meaning Beyond the Scene.[4] Their fans are organized in a fan club called \"A.R.M.Y\" (아미), which stands for \"Adorable Representative M.C for Youth\".[5]\n\nBTS\n\nBTS (2022)\nGeneral information\nOrigin Seoul, South Korea\nGenre(s) K-Pop, Hip-Hop, R&B\nFounded in 2010\nWebsite bts.ibighit.com\nCurrent line-up\nJin

# Making the CLEAR_QPP dataset

## Using [CLEAR](https://www.commonlit.org/blog/introducing-the-clear-corpus-an-open-dataset-to-advance-research-28ff8cfea84a/) corpus

In [None]:
import pandas as pd

clear = pd.read_excel("./datasets/clear_qpp/raw_files/clear/CLEAR Corpus 6.01.xlsx")
clear.head()

Unnamed: 0,ID,Last Changed,Author,Title,Anthology,URL,Source,Pub Year,Category,Location,...,CAREC_M,CARES,CML2RI,firstPlace_pred,secondPlace_pred,thirdPlace_pred,fourthPlace_pred,fifthPlace_pred,sixthPlace_pred,Kaggle split
0,400,,Carolyn Wells,Patty's Suitors,,http://www.gutenberg.org/cache/epub/5631/pg563...,gutenberg,1914.0,Lit,mid,...,0.11952,0.457534,12.097815,-0.383831,-0.283604,-0.346879,-0.28162,-0.247767,-0.289945,Train
1,401,,Carolyn Wells,Two Little Women on a Holiday,,http://www.gutenberg.org/cache/epub/5893/pg589...,gutenberg,1917.0,Lit,mid,...,0.04921,0.46251,22.550179,-0.260307,-0.20996,-0.061565,-0.234231,-0.201347,-0.156156,Train
2,402,,Carolyn Wells,Patty Blossom,,http://www.gutenberg.org/cache/epub/20945/pg20...,gutenberg,1917.0,Lit,mid,...,0.09724,0.369259,18.125279,-0.615037,-0.5306,-0.527847,-0.55018,-0.565762,-0.538852,Train
3,403,,CHARLES KINGSLEY,THE WATER-BABIES\nA Fairy Tale for a Land-Baby,,http://www.gutenberg.org/files/25564/25564-h/2...,gutenberg,1863.0,Lit,mid,...,0.08856,0.390759,10.95946,-1.528806,-1.525546,-1.471455,-1.265776,-1.422547,-1.393155,Test
4,404,,Charles Kingsley,HOW THE ARGONAUTS WERE DRIVEN INTO THE UNKNOWN...,The Heroes\n or Greek Fairy Tales for my...,http://www.gutenberg.org/files/677/677-h/677-h...,gutenberg,1889.0,Lit,mid,...,0.08798,0.389226,3.19596,-1.335586,-1.321922,-1.163985,-1.122501,-1.185518,-1.271324,Train


In [3]:
from spacheallen.formula import spache_allen
from tqdm.notebook import tqdm

clear_df = clear[["ID", "Title", "Excerpt"]]
spache_allen_scores = []

for _, row in tqdm(clear_df.iterrows(), total=len(clear_df)):
    spache_allen_scores.append(int(spache_allen(row["Excerpt"])))

clear_df["spache_allen"] = spache_allen_scores


  0%|          | 0/4724 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clear_df["spache_allen"] = spache_allen_scores


In [65]:
clear_df.head()

Unnamed: 0,ID,Title,Excerpt,spache_allen
0,400,Patty's Suitors,When the young people returned to the ballroom...,4
1,401,Two Little Women on a Holiday,"All through dinner time, Mrs. Fayre was somewh...",3
2,402,Patty Blossom,"As Roger had predicted, the snow departed as q...",4
3,403,THE WATER-BABIES\nA Fairy Tale for a Land-Baby,Mr. Grimes was to come up next morning to Sir ...,9
4,404,HOW THE ARGONAUTS WERE DRIVEN INTO THE UNKNOWN...,And outside before the palace a great garden w...,6


In [None]:
clear_df_value_counts = clear_df.Title.value_counts().rename_axis('unique_values').reset_index(name='counts')
print(len(clear_df_value_counts.loc[clear_df_value_counts["counts"]>1]))

clear_df_value_counts.loc[clear_df_value_counts["counts"]>1] # number of article titles with more than one excerpt in dataset

46


Unnamed: 0,unique_values,counts
0,Invention and Discovery,13
1,?,8
2,Current History,4
3,LITTLE MISCHIEF,3
4,Bacteria,2
5,THE FAIRY GODMOTHERS,2
6,Protestant_Reformation,2
7,Fungus,2
8,Monitress Merle,2
9,Metabolism,2


In [None]:
clear_df_k5 = clear_df.drop_duplicates(subset=["Title"]).loc[clear_df["spache_allen"]<=5]
clear_df_not_k5 = clear_df.drop_duplicates(subset=["Title"]).loc[clear_df["spache_allen"]>5]

# for query in clear_df_k5["Title"].unique():
#     if len(clear_df_not_k5.loc[clear_df_not_k5["Title"]==query]) > 0: # check if same article title has excerpts suitable for grade <=5 and for grade > 5
#         print(query)

clear_df_k5.head()

Unnamed: 0,ID,Title,Excerpt,spache_allen
0,400,Patty's Suitors,When the young people returned to the ballroom...,4
1,401,Two Little Women on a Holiday,"All through dinner time, Mrs. Fayre was somewh...",3
2,402,Patty Blossom,"As Roger had predicted, the snow departed as q...",4
5,405,The Three Little Bears,Once upon a time there were Three Bears who li...,5
8,408,The Boy Allies At Verdun,"On the twenty-second of February, 1916, an aut...",5


In [81]:
len(clear_df), len(clear_df_k5), len(clear_df_not_k5)

(4724, 3257, 1401)

In [25]:
len(clear_df), len(clear_df_k5), len(clear_df_not_k5)

(4724, 3302, 1422)

In [89]:
clear_queries = []

for _, row in clear_df_k5.sample(500, random_state=123).iterrows():
    id = "qcl"+str(row["ID"])
    query = row["Title"]
    clear_queries.append([id, query])

for _, row in clear_df_not_k5.sample(500, random_state=123).iterrows():
    id = "qcl"+str(row["ID"])
    query = row["Title"]
    clear_queries.append([id, query])

clear_queries_df = pd.DataFrame(clear_queries, columns=["id", "query"])
clear_queries_df.to_csv("./datasets/clear_qpp/raw_files/clear/clear_queries.tsv", sep='\t', index=False)
clear_queries_df

Unnamed: 0,id,query
0,qcl3658,All About Coffee
1,qcl3389,Bhujar
2,qcl5749,WHAT I SAW AT THE SEASHORE
3,qcl6351,Shakespeare's Christmas
4,qcl3412,Lion and Warthog
...,...,...
995,qcl4484,KNIGHTS OF ART\nSTORIES OF THE ITALIAN PAINTERS
996,qcl6066,PRINCE CHERRY
997,qcl5126,"DETERMINATION OF NITROGEN IN HAIR, WOOL, DRIED..."
998,qcl1525,Ringrose And His Buccaneers


In [90]:
clear_queries_df.columns = ["id", "query"]

len(clear_queries_df), len(clear_queries_df["query"].unique())

(1000, 1000)

In [91]:
clear_docs = []

for _, row in clear_df.iterrows():
    id = "dcl"+str(row["ID"])
    contents = row["Excerpt"]
    clear_docs.append([id,contents])

clear_docs_df = pd.DataFrame(clear_docs, columns=["id", "contents"])
clear_docs_df.to_csv("./datasets/clear_qpp/raw_files/clear/clear_corpus.tsv", sep="\t", index=False)
clear_docs_df

Unnamed: 0,id,contents
0,dcl400,When the young people returned to the ballroom...
1,dcl401,"All through dinner time, Mrs. Fayre was somewh..."
2,dcl402,"As Roger had predicted, the snow departed as q..."
3,dcl403,Mr. Grimes was to come up next morning to Sir ...
4,dcl404,And outside before the palace a great garden w...
...,...,...
4719,dcl8027,The name Monarch means “king”. An adult Monarc...
4720,dcl8028,"Walking Sticks are long, thin, and slow-moving..."
4721,dcl8029,A Black Widow is a shiny black spider. It has ...
4722,dcl8030,Solids are shapes that you can actually touch....


In [92]:
len(clear_docs_df["contents"].unique()), len(clear_docs_df)

(4724, 4724)

In [117]:
clear_qrels = [] # qid, q0, docid, score

for _, row in clear_queries_df.iterrows():
    qid = row["id"]
    q0 = 0
    query = row["query"]
    relevant_excerpts = clear_df.loc[clear_df["Title"]==query] # finding the excerpts associated with the title to serve as the relevant text sample(s)
    for _, row in relevant_excerpts.iterrows():
        docid = "dcl"+ str(row["ID"])
        if row["spache_allen"] <= 5:
            score = 3
        elif row["spache_allen"] > 5:
            score = 2
        clear_qrels.append([qid, q0, docid, score])

    random_clear_df_row = clear_df.loc[clear_df["Title"]!=query].sample(1, random_state=123).iloc[0] # picking a random excerpt as the irrelevant text sample
    docid = "dcl" + str(random_clear_df_row["ID"])
    score = 0
    clear_qrels.append([qid, q0, docid, score])


clear_qrels_df = pd.DataFrame(clear_qrels, columns=["qid", "q0", "docid", "score"])
clear_qrels_df.to_csv("./datasets/clear_qpp/raw_files/clear/clear_qrels.tsv", index=False, sep="\t")
clear_qrels_df

Unnamed: 0,qid,q0,docid,score
0,qcl3658,0,dcl3658,3
1,qcl3658,0,dcl5323,0
2,qcl3389,0,dcl3389,3
3,qcl3389,0,dcl5323,0
4,qcl5749,0,dcl5749,3
...,...,...,...,...
2005,qcl5126,0,dcl5323,0
2006,qcl1525,0,dcl1525,2
2007,qcl1525,0,dcl5323,0
2008,qcl6206,0,dcl6206,2


In [118]:
# qrel stats

len(clear_qrels_df.qid.unique()), clear_qrels_df.qid.value_counts().mean(), clear_qrels_df.qid.value_counts().max(), clear_qrels_df.qid.value_counts().min()

(1000, 2.01, 3, 2)

## Using the [SQuAD](https://huggingface.co/datasets/rajpurkar/squad/tree/main) dataset

In [119]:
squad_train = pd.read_parquet('./datasets/clear_qpp/raw_files/squad/train.parquet', engine='pyarrow')
squad_valid = pd.read_parquet('./datasets/clear_qpp/raw_files/squad/validation.parquet', engine='pyarrow')
squad = pd.concat([squad_train, squad_valid], ignore_index=True)
print(len(squad))
squad.head()

98169


Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [132]:
squad_corpus_df = squad.drop_duplicates(subset=["context"], ignore_index=True)[["id", "context", "title"]]
squad_corpus_df["id"] = "dsq" + squad_corpus_df["id"]
squad_corpus_df.columns = ["id", "contents", "title"]

squad_corpus_df_final = squad_corpus_df[["id", "contents"]]
squad_corpus_df_final.to_csv("./datasets/clear_qpp/raw_files/squad/squad_corpus.tsv", index=False, sep='\t')
squad_corpus_df_final

Unnamed: 0,id,contents
0,dsq5733be284776f41900661182,"Architecturally, the school has a Catholic cha..."
1,dsq5733bf84d058e614000b61be,"As at most other universities, Notre Dame's st..."
2,dsq5733bed24776f41900661188,The university is the major seat of the Congre...
3,dsq5733a6424776f41900660f51,The College of Engineering was established in ...
4,dsq5733a70c4776f41900660f64,All of Notre Dame's undergraduate students are...
...,...,...
20953,dsq5737a5931c456719005744e7,"where is the mass of the object, is the velo..."
20954,dsq5737a7351c456719005744f1,A conservative force that acts on a closed sys...
20955,dsq5737a84dc3c5551400e51f59,"For certain physical scenarios, it is impossib..."
20956,dsq5737a9afc3c5551400e51f61,The connection between macroscopic nonconserva...


In [133]:
squad_corpus_df

Unnamed: 0,id,contents,title
0,dsq5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame
1,dsq5733bf84d058e614000b61be,"As at most other universities, Notre Dame's st...",University_of_Notre_Dame
2,dsq5733bed24776f41900661188,The university is the major seat of the Congre...,University_of_Notre_Dame
3,dsq5733a6424776f41900660f51,The College of Engineering was established in ...,University_of_Notre_Dame
4,dsq5733a70c4776f41900660f64,All of Notre Dame's undergraduate students are...,University_of_Notre_Dame
...,...,...,...
20953,dsq5737a5931c456719005744e7,"where is the mass of the object, is the velo...",Force
20954,dsq5737a7351c456719005744f1,A conservative force that acts on a closed sys...,Force
20955,dsq5737a84dc3c5551400e51f59,"For certain physical scenarios, it is impossib...",Force
20956,dsq5737a9afc3c5551400e51f61,The connection between macroscopic nonconserva...,Force


In [130]:
len(squad_corpus_df), len(squad_corpus_df["contents"].unique())

(20958, 20958)

In [121]:
squad_queries_df = squad.drop_duplicates(subset=["question"], ignore_index=True).sample(500, random_state=123)[["id", "question"]]
squad_queries_df["id"] = "qsq" + squad_queries_df["id"]
squad_queries_df.columns = ["id", "query"]
squad_queries_df.to_csv("./datasets/clear_qpp/raw_files/squad/squad_queries.tsv", index=False, sep="\t")
squad_queries_df

Unnamed: 0,id,query
72640,qsq572edf54dfa6aa1500f8d48b,What year did the terror bombing policy become...
22467,qsq56fa88b8f34c681400b0c0e9,Which service contains 10 HD channels and fini...
95270,qsq57283dbeff5b5019007d9fca,What Doctor Who episode won a Hugo Award in 2010?
87840,qsq56d71bd80d65d2140019835e,When is the game's media day usually held?
84263,qsq5731ddefb9d445190005e5d8,What is the profession of Philip Hamburger?
...,...,...
82330,qsq5731360e497a881900248c4e,What declaration solidified Confucian values?
31153,qsq570cc952b3d812140066d276,Which Latin father described the belief that J...
29851,qsq570b8145ec8fbc190045ba3f,"how much prarie land does the ""Buffalo Commons..."
63958,qsq5728a8e0ff5b5019007da3c4,What major event happened in Alaska on March 2...


In [129]:
len(squad_queries_df), len(squad_queries_df["query"].unique())

(500, 500)

In [123]:
from tqdm.notebook import tqdm

squad_qrels = [] # qid, q0, docid, score

for _, row in tqdm(squad_queries_df.iterrows(), total=len(squad_queries_df)):
    qid = row["id"]
    q0 = 0
    query = row["query"]
    relevant_content = squad.loc[squad["question"]==query].iloc[0]["context"]
    docid = squad_corpus_df.loc[squad_corpus_df["contents"]==relevant_content].iloc[0]["id"]
    spache_allen_score = int(spache_allen(relevant_content))
    if spache_allen_score <= 5:
        score = 2
    elif spache_allen_score > 5:
        score = 1
    squad_qrels.append([qid, q0, docid, score])

    relevant_content_title = squad_corpus_df.loc[squad_corpus_df["contents"]==relevant_content].iloc[0]["title"]
    irrelevant_content_row = squad_corpus_df.loc[squad_corpus_df["title"]!=relevant_content_title].sample(1, random_state = 123).iloc[0]
    docid = irrelevant_content_row["id"]
    score = 0
    squad_qrels.append([qid, q0, docid, score])

squad_qrels_df = pd.DataFrame(squad_qrels, columns=["qid", "q0", "docid", "score"])
squad_qrels_df.to_csv("./datasets/clear_qpp/raw_files/squad/squad_qrels.tsv", index=False, sep="\t")
squad_qrels_df

  0%|          | 0/500 [00:00<?, ?it/s]

Unnamed: 0,qid,q0,docid,score
0,qsq572edf54dfa6aa1500f8d48b,0,dsq572edf54dfa6aa1500f8d489,1
1,qsq572edf54dfa6aa1500f8d48b,0,dsq572e8c9003f989190075676b,0
2,qsq56fa88b8f34c681400b0c0e9,0,dsq56fa88b8f34c681400b0c0e7,1
3,qsq56fa88b8f34c681400b0c0e9,0,dsq5726f50add62a815002e9630,0
4,qsq57283dbeff5b5019007d9fca,0,dsq57283dbeff5b5019007d9fc6,1
...,...,...,...,...
995,qsq570b8145ec8fbc190045ba3f,0,dsq56ce7ba4aab44d1400b887ed,0
996,qsq5728a8e0ff5b5019007da3c4,0,dsq5728a8e0ff5b5019007da3c4,1
997,qsq5728a8e0ff5b5019007da3c4,0,dsq57268d7ef1498d1400e8e392,0
998,qsq572a14abaf94a219006aa7bc,0,dsq572a14abaf94a219006aa7bb,1


In [126]:
squad_qrels_df.loc[squad_qrels_df["score"]==2]

Unnamed: 0,qid,q0,docid,score
10,qsq5711623e50c2381900b54abb,0,dsq5711623e50c2381900b54ab9,2
20,qsq5731dc25b9d445190005e5ca,0,dsq5731dc25b9d445190005e5c7,2
30,qsq57309634069b5314008321b3,0,dsq57309634069b5314008321af,2
42,qsq56df680f8bc80c19004e4bdd,0,dsq56df680f8bc80c19004e4bdd,2
56,qsq56df9d3c4a1a83140091eb91,0,dsq56df9d3c4a1a83140091eb90,2
...,...,...,...,...
962,qsq56bfb676a10cfb1400551266,0,dsq56beb50f3aeaaa14008c926f,2
964,qsq57279bd5dd62a815002ea1dc,0,dsq57279bd5dd62a815002ea1dc,2
968,qsq572922ae1d0469140077909e,0,dsq5726fee55951b619008f8445,2
982,qsq571c3a685efbb31900334db4,0,dsq571a484210f8ca1400304fbd,2


In [124]:
# qrel stats

len(squad_qrels_df.qid.unique()), squad_qrels_df.qid.value_counts().mean(), squad_qrels_df.qid.value_counts().max(), squad_qrels_df.qid.value_counts().min()

(500, 2.0, 2, 2)

## Combining squad and clear to create clear_qpp

In [137]:
clear_qpp_corpus = pd.concat([clear_docs_df, squad_corpus_df_final], ignore_index=True)
clear_qpp_corpus.to_csv("./datasets/clear_qpp/raw_files/clear_qpp_raw/clear_qpp_corpus.tsv", index=False, sep="\t")
clear_qpp_corpus

Unnamed: 0,id,contents
0,dcl400,When the young people returned to the ballroom...
1,dcl401,"All through dinner time, Mrs. Fayre was somewh..."
2,dcl402,"As Roger had predicted, the snow departed as q..."
3,dcl403,Mr. Grimes was to come up next morning to Sir ...
4,dcl404,And outside before the palace a great garden w...
...,...,...
25677,dsq5737a5931c456719005744e7,"where is the mass of the object, is the velo..."
25678,dsq5737a7351c456719005744f1,A conservative force that acts on a closed sys...
25679,dsq5737a84dc3c5551400e51f59,"For certain physical scenarios, it is impossib..."
25680,dsq5737a9afc3c5551400e51f61,The connection between macroscopic nonconserva...


In [142]:
len(clear_qpp_corpus), len(clear_qpp_corpus["contents"].unique()), len(clear_qpp_corpus["id"].unique())

(25682, 25682, 25682)

In [138]:
clear_qpp_queries = pd.concat([clear_queries_df, squad_queries_df], ignore_index=True)
clear_qpp_queries.to_csv("./datasets/clear_qpp/raw_files/clear_qpp_raw/clear_qpp_queries.tsv", index=False, sep="\t")
clear_qpp_queries

Unnamed: 0,id,query
0,qcl3658,All About Coffee
1,qcl3389,Bhujar
2,qcl5749,WHAT I SAW AT THE SEASHORE
3,qcl6351,Shakespeare's Christmas
4,qcl3412,Lion and Warthog
...,...,...
1495,qsq5731360e497a881900248c4e,What declaration solidified Confucian values?
1496,qsq570cc952b3d812140066d276,Which Latin father described the belief that J...
1497,qsq570b8145ec8fbc190045ba3f,"how much prarie land does the ""Buffalo Commons..."
1498,qsq5728a8e0ff5b5019007da3c4,What major event happened in Alaska on March 2...


In [143]:
len(clear_qpp_queries), len(clear_qpp_queries["query"].unique()), len(clear_qpp_queries["id"].unique())

(1500, 1500, 1500)

In [139]:
clear_qpp_qrels = pd.concat([clear_qrels_df, squad_qrels_df], ignore_index=True)
clear_qpp_qrels.to_csv("./datasets/clear_qpp/raw_files/clear_qpp_raw/clear_qpp_qrels.tsv", index=False, sep="\t")
clear_qpp_qrels

Unnamed: 0,qid,q0,docid,score
0,qcl3658,0,dcl3658,3
1,qcl3658,0,dcl5323,0
2,qcl3389,0,dcl3389,3
3,qcl3389,0,dcl5323,0
4,qcl5749,0,dcl5749,3
...,...,...,...,...
3005,qsq570b8145ec8fbc190045ba3f,0,dsq56ce7ba4aab44d1400b887ed,0
3006,qsq5728a8e0ff5b5019007da3c4,0,dsq5728a8e0ff5b5019007da3c4,1
3007,qsq5728a8e0ff5b5019007da3c4,0,dsq57268d7ef1498d1400e8e392,0
3008,qsq572a14abaf94a219006aa7bc,0,dsq572a14abaf94a219006aa7bb,1


In [145]:
len(clear_qpp_qrels.qid.unique()), clear_qpp_qrels.qid.value_counts().mean(), clear_qpp_qrels.qid.value_counts().max(), clear_qpp_qrels.qid.value_counts().min()

(1500, 2.006666666666667, 3, 2)

In [149]:
clear_qpp_qrels.score.value_counts()

score
0    1500
2     616
3     506
1     388
Name: count, dtype: int64

In [151]:
clear_qpp_qrels.score.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

score
0    49.8%
2    20.5%
3    16.8%
1    12.9%
Name: proportion, dtype: object

In [1]:
## creating folds for cross-validation

import pandas as pd

df = pd.read_csv("./datasets/clear_qpp/queries/queries.tsv", sep="\t", header=None).sample(frac=1, random_state=123).reset_index(drop=True)
df.columns = ["qid", "qtext"]
foldid2qid={1: [],
         2: [],
         3: [],
         4: [],
         5: []}

counter = 0
for _, row in df.iterrows():
    foldid2qid[(counter%5)+1].append(row["qid"])
    counter+=1

foldid2qid

{1: ['qcl5310',
  'qcl7019',
  'qcl4360',
  'qcl5935',
  'qsq570e1f340dc6ce1900204dcb',
  'qcl3263',
  'qsq5726e3c4dd62a815002e9408',
  'qcl7276',
  'qcl2683',
  'qsq570b66566b8089140040f92f',
  'qcl3669',
  'qcl7394',
  'qcl1990',
  'qcl2840',
  'qsq56dfba0f231d4119001abd20',
  'qcl5772',
  'qcl7179',
  'qsq5725077c0ba9f01400d97c37',
  'qsq572e9a4edfa6aa1500f8d1e4',
  'qsq573058df069b531400832097',
  'qsq56e427248c00841900fbaeff',
  'qsq56e75cd337bdd419002c3eec',
  'qcl5018',
  'qcl7404',
  'qcl5492',
  'qcl5052',
  'qsq56dc63c214d3a41400c26845',
  'qcl5566',
  'qsq572959183f37b31900478298',
  'qcl730',
  'qsq5728349dff5b5019007d9efe',
  'qcl1246',
  'qsq570ce1a4b3d812140066d2d8',
  'qcl4969',
  'qsq570c50afb3d812140066d0c8',
  'qsq57306cfb2461fd1900a9ce04',
  'qcl6563',
  'qcl2075',
  'qcl6188',
  'qsq5726e06df1498d1400e8ee55',
  'qcl2215',
  'qsq5726712af1498d1400e8dfc2',
  'qsq570e7b8c0b85d914000d7f37',
  'qcl7263',
  'qsq56df5a5d96943c1400a5d407',
  'qsq572994ff1d0469140077955e',


# Generating qrels

## Binarising topical relevance

In [1]:
import pandas as pd

qrel_relevance = pd.read_csv("./datasets/kid-friend-en/qrels/qrels-relevance.txt", header = None, sep=" ")
qrel_relevance.columns = ["qnum", "q0", "docid", "relevance"]
qrel_relevance["relevance"] = [int(row["relevance"]>0) for _, row in qrel_relevance.iterrows()]
qrel_relevance.drop_duplicates(["qnum","docid"],inplace=True)
qrel_relevance.to_csv("./datasets/kid-friend-en/qrels/qrels-topical-relevance.tsv", header=None, sep="\t", index=None)

## Readability: Spache-allen

In [7]:
import json
import pandas as pd
import textstat
from spacheallen.formula import spache_allen

lines = []
corpus_file_path = './datasets/kid-friend-en/corpus/documents.jsonl'
with open(corpus_file_path) as f:
    lines = f.read().splitlines()

line_dicts = [json.loads(line) for line in lines]
corpus_file = pd.DataFrame(line_dicts)
# corpus_file["dale_chall"] = [round(textstat.dale_chall_readability_score(text),2) for text in corpus_file["contents"]]
# corpus_file["fkgl"] = [round(textstat.flesch_kincaid_grade(text),2) for text in corpus_file["contents"]]
# corpus_file["readability_consensus"] = [textstat.text_standard(text,float_output=False) for text in corpus_file["contents"]]
corpus_file["spache-allen"] = [int(spache_allen(text)) for text in corpus_file["contents"]]

output_path = "./playground_results/kid-friend-en-corpus.jsonl"
print(len(line_dicts), len(corpus_file))
with open(output_path, "w") as f:
    f.write(corpus_file.to_json(orient='records', lines=True, force_ascii=False))

corpus_file.to_csv("./playground_results/kid-friend-en-corpus.tsv", index=False, sep='\t')
corpus_file.head(2)

2385 2385


Unnamed: 0,id,contents,spache-allen
0,6e421f1539b1457b853712d81be87743,BTS (band) - Wikipedia. WEBBTS (also Bangtan B...,6
1,1539268e3f1d41c9abcaa277b23f51d9,Before you go on to YouTube. WEBMusic video by...,6


In [None]:
import pandas as pd

corpus_file = pd.read_csv("./playground_results/kid-friend-en-corpus.tsv", sep='\t')
corpus_file.head()

In [None]:
# print("dale chall: ", corpus_file["dale_chall"].median())
# print("fkgl: ", corpus_file["fkgl"].median())
print("spache-allen: ", corpus_file["spache-allen"].median())
# corpus_file["readability_consensus"].value_counts()

len(corpus_file), len(corpus_file.loc[corpus_file["spache-allen"]<5])

In [None]:
corpus_file.columns

In [None]:
int(corpus_file.loc[corpus_file["id"]=="77cdd0f6c3b04b5bbe79fac37c2ae63b"]["spache-allen"].values[0] < 5)

In [None]:
import pandas as pd

qrel_relevance = pd.read_csv("./datasets/kid-friend-en/qrels/qrels-relevance.txt", header = None, sep=" ")
qrel_relevance["readability"] = [int(corpus_file.loc[corpus_file["id"]==docid]["spache-allen"].values[0]<=5) for docid in qrel_relevance.docid]

temp = []
for name, group in qrel_relevance.groupby(by="qnum"):
    temp.append([name, group["relevance"].value_counts().sort_index().to_dict()])

df_old = pd.DataFrame(temp, columns=["qnum", "relevant_doc_distribution"])

extended_rel = []

for _, row in qrel_relevance.iterrows():
    if row["relevance"] > 0 and row["readability"] == 1:
        extended_rel.append(2)
    elif row["relevance"] > 0 and row["readability"] == 0:
        extended_rel.append(1)
    else:
        extended_rel.append(0)

qrel_relevance["readability_relevance"] = extended_rel


len(qrel_relevance.loc[qrel_relevance["readability_relevance"]==1])

qrel_relevance.drop(columns=["relevance","readability"], inplace=True)
qrel_relevance.rename(columns={"readability_relevance": "relevance"}, inplace=True)
qrel_relevance.drop_duplicates(["qnum","docid"],inplace=True)

temp = []
for name, group in qrel_relevance.groupby(by="qnum"):
    temp.append([name, group["relevance"].value_counts().sort_index().to_dict()])

qrel_relevance.to_csv("./datasets/kid-friend-en/qrels/qrels-readability-relevance.tsv", header=None, sep="\t", index=None)

In [None]:
df = pd.DataFrame(temp, columns=["qnum", "readability_relevant_doc_distribution"])
df = df_old.merge(df, on='qnum')
df

## Educational content: FineWeb-edu

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/fineweb-edu-classifier")
model = AutoModelForSequenceClassification.from_pretrained("HuggingFaceTB/fineweb-edu-classifier")

def edu_value(text):
  # text = "This is a test sentence."
  inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
  outputs = model(**inputs)
  logits = outputs.logits.squeeze(-1).float().detach().numpy()
  score = logits.item()
  result = {
      "text": text,
      "score": score,
      "int_score": int(round(max(0, min(score, 5)))),
  }

  return int(result["int_score"]>=3)

In [None]:
from tqdm.notebook import tqdm
import pandas as pd

df = pd.read_csv("./playground_results/kid-friend-en-corpus.tsv", sep="\t")

edu_value_results = []

for _, row in tqdm(df.iterrows() , total=len(df)):
  edu_val = edu_value(row["contents"])
  edu_value_results.append(edu_val)

df["edu_val"] = edu_value_results
df.head(5)

In [None]:
df.to_csv("./playground_results/kid-friend-en-corpus-extended.tsv", sep="\t", index=None)

In [None]:
import pandas as pd

corpus = pd.read_csv("./playground_results/kid-friend-en-corpus-extended.tsv", sep="\t")
qrel_topical_rel = pd.read_csv("./datasets/kid-friend-en/qrels/qrels-topical-relevance.tsv", sep="\t", header=None)
qrel_topical_rel.columns = ["qnum", "q0", "docid", "relevance"]

qrel_topical_rel["edu_rel"] = [corpus.loc[corpus["id"]==docid]["edu_val"].values[0] for docid in qrel_topical_rel.docid]
qrel_topical_rel.head(3)

Unnamed: 0,qnum,q0,docid,relevance,edu_rel
0,1,0,77cdd0f6c3b04b5bbe79fac37c2ae63b,0,0
1,1,0,c8a2aa042145402a8c180ca6f467c575,1,0
2,1,0,d82e5e4f531841d49bcd6fe7bd8cdf8f,0,0


In [32]:
temp = []
for name, group in qrel_topical_rel.groupby(by="qnum"):
    temp.append([name, group["relevance"].value_counts().sort_index().to_dict()])

df_old = pd.DataFrame(temp, columns=["qnum", "relevant_doc_distribution"])

In [33]:
extended_rel = []

for _, row in qrel_topical_rel.iterrows():
    if row["edu_rel"] == 1 and row["relevance"] == 1:
        extended_rel.append(2)
    elif row["relevance"] == 1:
        extended_rel.append(1)
    else:
        extended_rel.append(0)
    
qrel_topical_rel["edu_relevance"] = extended_rel


len(qrel_topical_rel.loc[qrel_topical_rel["edu_relevance"]==1])

qrel_topical_rel.drop(columns=["relevance","edu_rel"], inplace=True)
qrel_topical_rel.rename(columns={"edu_relevance": "relevance"}, inplace=True)
qrel_topical_rel.drop_duplicates(["qnum","docid"],inplace=True)

temp = []
for name, group in qrel_topical_rel.groupby(by="qnum"):
    temp.append([name, group["relevance"].value_counts().sort_index().to_dict()])

qrel_topical_rel.to_csv("./datasets/kid-friend-en/qrels/qrels-eduval-relevance.tsv", header=None, sep="\t", index=None)
qrel_topical_rel.head(3)

Unnamed: 0,qnum,q0,docid,relevance
0,1,0,77cdd0f6c3b04b5bbe79fac37c2ae63b,0
1,1,0,c8a2aa042145402a8c180ca6f467c575,1
2,1,0,d82e5e4f531841d49bcd6fe7bd8cdf8f,0


In [34]:
temp = []
for name, group in qrel_topical_rel.groupby(by="qnum"):
    temp.append([name, group["relevance"].value_counts().sort_index().to_dict()])

df = pd.DataFrame(temp, columns=["qnum", "eduval_relevant_doc_distribution"])
df = df_old.merge(df, on='qnum')
df

Unnamed: 0,qnum,relevant_doc_distribution,eduval_relevant_doc_distribution
0,1,"{0: 13, 1: 23}","{0: 13, 1: 22, 2: 1}"
1,2,"{0: 20, 1: 3}","{0: 20, 1: 3}"
2,3,"{0: 13, 1: 12}","{0: 13, 1: 12}"
3,4,"{0: 13, 1: 12}","{0: 13, 1: 12}"
4,5,"{0: 13, 1: 35}","{0: 13, 1: 33, 2: 2}"
5,6,"{0: 2, 1: 8}","{0: 2, 1: 8}"
6,7,"{0: 22, 1: 16}","{0: 22, 1: 14, 2: 2}"
7,8,"{0: 12, 1: 8}","{0: 12, 1: 8}"
8,9,"{0: 20, 1: 37}","{0: 20, 1: 35, 2: 2}"
9,10,"{0: 15, 1: 13}","{0: 15, 1: 12, 2: 1}"


## Extended relevance (topical + readability + educational content)

In [None]:
import pandas as pd

corpus = pd.read_csv("./playground_results/kid-friend-en-corpus-extended.tsv", sep="\t")
qrel_topical_rel = pd.read_csv("./datasets/kid-friend-en/qrels/qrels-topical-relevance.tsv", sep="\t", header=None)
qrel_topical_rel.columns = ["qnum", "q0", "docid", "relevance"]

qrel_topical_rel["edu_rel"] = [corpus.loc[corpus["id"]==docid]["edu_val"].values[0] for docid in qrel_topical_rel.docid]
qrel_topical_rel["readability"] = [int(corpus.loc[corpus["id"]==docid]["spache-allen"].values[0]<=5) for docid in qrel_topical_rel.docid]
qrel_topical_rel.head(3)

Unnamed: 0,qnum,q0,docid,relevance,edu_rel,readability
0,1,0,77cdd0f6c3b04b5bbe79fac37c2ae63b,0,0,1
1,1,0,c8a2aa042145402a8c180ca6f467c575,1,0,0
2,1,0,d82e5e4f531841d49bcd6fe7bd8cdf8f,0,0,1


In [36]:
temp = []
for name, group in qrel_topical_rel.groupby(by="qnum"):
    temp.append([name, group["relevance"].value_counts().sort_index().to_dict()])

df_old = pd.DataFrame(temp, columns=["qnum", "relevant_doc_distribution"])

In [37]:
extended_rel = []

for _, row in qrel_topical_rel.iterrows():
    if row["edu_rel"] == 1 and row["relevance"] == 1 and row["readability"]==1:
        extended_rel.append(3)
    elif (row["edu_rel"] == 1 and row["relevance"] == 1 and row["readability"] == 0) or (row["edu_rel"] == 1 and row["relevance"] == 0 and row["readability"] == 1):
        extended_rel.append(2)
    elif row["relevance"] == 1:
        extended_rel.append(1)
    else:
        extended_rel.append(0)
    
qrel_topical_rel["extended_relevance"] = extended_rel


len(qrel_topical_rel.loc[qrel_topical_rel["extended_relevance"]==1])

qrel_topical_rel.drop(columns=["relevance","edu_rel", "readability"], inplace=True)
qrel_topical_rel.rename(columns={"extended_relevance": "relevance"}, inplace=True)
qrel_topical_rel.drop_duplicates(["qnum","docid"],inplace=True)

temp = []
for name, group in qrel_topical_rel.groupby(by="qnum"):
    temp.append([name, group["relevance"].value_counts().sort_index().to_dict()])

qrel_topical_rel.to_csv("./datasets/kid-friend-en/qrels/qrels-extended-relevance.tsv", header=None, sep="\t", index=None)
qrel_topical_rel.head(3)

Unnamed: 0,qnum,q0,docid,relevance
0,1,0,77cdd0f6c3b04b5bbe79fac37c2ae63b,0
1,1,0,c8a2aa042145402a8c180ca6f467c575,1
2,1,0,d82e5e4f531841d49bcd6fe7bd8cdf8f,0


In [38]:
temp = []
for name, group in qrel_topical_rel.groupby(by="qnum"):
    temp.append([name, group["relevance"].value_counts().sort_index().to_dict()])

df = pd.DataFrame(temp, columns=["qnum", "extended_relevant_doc_distribution"])
df = df_old.merge(df, on='qnum')
df

Unnamed: 0,qnum,relevant_doc_distribution,extended_relevant_doc_distribution
0,1,"{0: 13, 1: 23}","{0: 13, 1: 22, 2: 1}"
1,2,"{0: 20, 1: 3}","{0: 20, 1: 3}"
2,3,"{0: 13, 1: 12}","{0: 13, 1: 12}"
3,4,"{0: 13, 1: 12}","{0: 13, 1: 12}"
4,5,"{0: 13, 1: 35}","{0: 13, 1: 33, 2: 1, 3: 1}"
5,6,"{0: 2, 1: 8}","{0: 2, 1: 8}"
6,7,"{0: 22, 1: 16}","{0: 22, 1: 14, 2: 1, 3: 1}"
7,8,"{0: 12, 1: 8}","{0: 12, 1: 8}"
8,9,"{0: 20, 1: 37}","{0: 20, 1: 35, 2: 1, 3: 1}"
9,10,"{0: 15, 1: 13}","{0: 15, 1: 12, 3: 1}"
