In [1]:
import json
import pandas as pd

In [2]:
# parse "evidence" column
import jsonlines


df = pd.DataFrame(columns = ['id', 'claim', 'evidence'])
with open('data/claims.jsonl', 'r', encoding = 'utf-8') as f:
    for item in jsonlines.Reader(f):
        df.loc[len(df)] = item
    
df = df.set_index('id')

In [3]:
# 279 claims in total
df[df['evidence'] == {}]
# 73 claims without evidence

df['num_evidence'] = df['evidence'].map(lambda x: len(x.keys()))
df[df['num_evidence'] == 1]
# 125 claims with 1 evidence

df[df['num_evidence'] == 2].count()
# 36 claims with 2 evidences

df[df['num_evidence'] == 3].count()
# 12 claims with 3 evidences

df[df['num_evidence'] == 4].count()
# 11 claims with 4 evidences

df[df['num_evidence'] == 5].count()
# 8 claims with 5 evidences

df[df['num_evidence'] == 6].count()
# 5 claims with 6 evidences

df[df['num_evidence'] > 6]
# 9 claims with more than 6 evidences

Unnamed: 0_level_0,claim,evidence,num_evidence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
170,Bariatric surgery reduces resolution of diabetes.,"{'5824985': {'provenance': 'citation', 'label'...",9
207,CHEK2 has a significant role in breast cancer,"{'13519661': {'provenance': 'citation', 'label...",9
215,CRY proteins mediate light signal transduction.,"{'5035851': {'provenance': 'citation', 'label'...",7
451,Gene expression can be highly variable across ...,"{'12804937': {'provenance': 'citation', 'label...",11
534,Hypertension can be accurately diagnosed with ...,"{'25515907': {'provenance': 'citation', 'label...",8
599,Incidence rates of cervical cancer have increa...,"{'12779444': {'provenance': 'citation', 'label...",9
794,Mitochondria are uninvolved in processes relat...,"{'8551160': {'provenance': 'citation', 'label'...",20
872,Obesity is determined in part by genetic factors.,"{'1180972': {'provenance': 'citation', 'label'...",24
1296,There is no relation between lupus erythematos...,"{'27466734': {'provenance': 'citation', 'label...",16


In [4]:
flatten_df = pd.DataFrame(columns= ['claim', 'doc_id', 'provenance', 'label', 'sentences', 'model_ranks'])

# NEI not included
for row in df.index:
    for key in df.loc[row]['evidence']:
        flatten_df.loc[len(flatten_df)] = {'claim': df.loc[row]['claim'], 'doc_id': key, 'provenance': df.loc[row]['evidence'][key]['provenance'], 
        'label': df.loc[row]['evidence'][key]['label'], 'sentences': df.loc[row]['evidence'][key]['sentences'], 'model_ranks': df.loc[row]['evidence'][key]['model_ranks']}

flatten_df
# 460 ECAPs

Unnamed: 0,claim,doc_id,provenance,label,sentences,model_ranks
0,10-20% of people with severe mental disorder r...,6490571,citation,CONTRADICT,[7],
1,A breast cancer patient's capacity to metaboli...,24341590,citation,CONTRADICT,"[10, 11, 12, 13]",
2,A breast cancer patient's capacity to metaboli...,8553718,pooling,CONTRADICT,[2],"{'multivers_10': 1020, 'multivers_20': 3474, '..."
3,A breast cancer patient's capacity to metaboli...,24313685,pooling,CONTRADICT,[3],"{'multivers_10': 3764, 'multivers_20': 6319, '..."
4,A breast cancer patient's capacity to metaboli...,30328289,pooling,CONTRADICT,[4],"{'multivers_10': 10499, 'multivers_20': 5079, ..."
...,...,...,...,...,...,...
455,c-MYC is important for maintaining pluripotent...,2002182,pooling,SUPPORT,[],"{'multivers_10': 838, 'multivers_20': 246, 'pa..."
456,c-MYC is important for maintaining pluripotent...,21271817,pooling,SUPPORT,[],"{'multivers_10': 1219, 'multivers_20': 224, 'p..."
457,c-MYC is important for maintaining pluripotent...,27394882,pooling,SUPPORT,[],"{'multivers_10': 680, 'multivers_20': 231, 'pa..."
458,c-MYC is important for maintaining pluripotent...,11900693,pooling,SUPPORT,[],"{'multivers_10': 656, 'multivers_20': 241, 'pa..."


In [5]:
# 209 ECAPs from SciFact-Orig, 251 from pooling S2ORC
flatten_df[flatten_df['provenance'] == 'citation']



Unnamed: 0,claim,doc_id,provenance,label,sentences,model_ranks
0,10-20% of people with severe mental disorder r...,6490571,citation,CONTRADICT,[7],
1,A breast cancer patient's capacity to metaboli...,24341590,citation,CONTRADICT,"[10, 11, 12, 13]",
6,A country's Vaccine Alliance (GAVI) eligibilit...,12428497,citation,SUPPORT,[8],
7,A deficiency of folate decreases blood levels ...,11705328,citation,CONTRADICT,[4],
11,APOE4 expression in iPSC-derived neurons resul...,4709641,citation,SUPPORT,[1],
...,...,...,...,...,...,...
446,Vitamin D deficiency has no effect on the term...,2425364,citation,CONTRADICT,"[9, 10, 11, 12]",
448,Whole brain radiotherapy reduces the occurrenc...,3944632,citation,SUPPORT,"[6, 10]",
453,c-MYC is important for maintaining pluripotent...,22843616,citation,SUPPORT,[4],
454,c-MYC is important for maintaining pluripotent...,25300426,citation,SUPPORT,[4],


In [6]:
pooling_df = flatten_df[flatten_df['provenance'] == 'pooling']

pooling_df['multivers_10'] = pooling_df['model_ranks'].map(lambda x: x['multivers_10'] if 'multivers_10' in x else None)
pooling_df['multivers_20'] = pooling_df['model_ranks'].map(lambda x: x['multivers_20'] if 'multivers_20' in x else None)
pooling_df['paragraph_joint'] = pooling_df['model_ranks'].map(lambda x: x['paragraph_joint'] if 'paragraph_joint' in x else None)
pooling_df['vert5erini'] = pooling_df['model_ranks'].map(lambda x: x['vert5erini'] if 'vert5erini' in x else None)
pooling_df.fillna(10000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pooling_df['multivers_10'] = pooling_df['model_ranks'].map(lambda x: x['multivers_10'] if 'multivers_10' in x else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pooling_df['multivers_20'] = pooling_df['model_ranks'].map(lambda x: x['multivers_20'] if 'multivers_20' in x else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

Unnamed: 0,claim,doc_id,provenance,label,sentences,model_ranks,multivers_10,multivers_20,paragraph_joint,vert5erini
2,A breast cancer patient's capacity to metaboli...,8553718,pooling,CONTRADICT,[2],"{'multivers_10': 1020, 'multivers_20': 3474, '...",1020,3474,1250,84.0
3,A breast cancer patient's capacity to metaboli...,24313685,pooling,CONTRADICT,[3],"{'multivers_10': 3764, 'multivers_20': 6319, '...",3764,6319,4631,85.0
4,A breast cancer patient's capacity to metaboli...,30328289,pooling,CONTRADICT,[4],"{'multivers_10': 10499, 'multivers_20': 5079, ...",10499,5079,6199,203.0
5,A breast cancer patient's capacity to metaboli...,23838305,pooling,CONTRADICT,[13],"{'multivers_10': 767, 'multivers_20': 1410, 'p...",767,1410,12229,193.0
8,A deficiency of folate decreases blood levels ...,23526574,pooling,CONTRADICT,[6],"{'multivers_10': 3494, 'multivers_20': 4704, '...",3494,4704,412,108.0
...,...,...,...,...,...,...,...,...,...,...
452,Whole brain radiotherapy reduces the occurrenc...,1254092,pooling,SUPPORT,"[0, 1]","{'multivers_10': 464, 'multivers_20': 1025, 'p...",464,1025,88,511.0
455,c-MYC is important for maintaining pluripotent...,2002182,pooling,SUPPORT,[],"{'multivers_10': 838, 'multivers_20': 246, 'pa...",838,246,1262,10000.0
456,c-MYC is important for maintaining pluripotent...,21271817,pooling,SUPPORT,[],"{'multivers_10': 1219, 'multivers_20': 224, 'p...",1219,224,9433,10000.0
457,c-MYC is important for maintaining pluripotent...,27394882,pooling,SUPPORT,[],"{'multivers_10': 680, 'multivers_20': 231, 'pa...",680,231,3120,10000.0


In [7]:
# MultiVerS_10 boosts 96 ECAPs
# 46 with rank <= 100
pooling_df[pooling_df['multivers_10'] <= 100].count()

# MultiVerS_20 boosts 92 ECAPs
# 49 with rank <= 100
# pooling_df[pooling_df['multivers_20'] <= 100].count()

# paragraphjoint boosts 106 ECAPs
# 51 with rank <= 100
# pooling_df[pooling_df['paragraph_joint'] <= 100].count()

# vert5erini boosts 86 ECAPs
# 37 with rank <= 100
# pooling_df[pooling_df['vert5erini'] <= 100].count()

claim              46
doc_id             46
provenance         46
label              46
sentences          46
model_ranks        46
multivers_10       46
multivers_20       46
paragraph_joint    46
vert5erini         33
dtype: int64

In [8]:
meta_df = pd.DataFrame(columns = ['id', 'source_doc_id', 'source_metadata'])
with open('data/claims_metadata.jsonl', 'r', encoding = 'utf-8') as f:
    for item in jsonlines.Reader(f):
        meta_df.loc[len(meta_df)] = item

meta_df['arxiv_id'] = meta_df['source_metadata'].map(lambda x: x['arxiv_id'])
meta_df['pmc_id'] = meta_df['source_metadata'].map(lambda x: x['pmc_id'])
meta_df['pubmed_id'] = meta_df['source_metadata'].map(lambda x: x['pubmed_id'])
meta_df['venue'] = meta_df['source_metadata'].map(lambda x: x['venue'])
meta_df['journal'] = meta_df['source_metadata'].map(lambda x: x['journal'])

# None has arxiv id --> None published on Arxiv?
# no preprint included --> more rigorous?
meta_df[~meta_df['arxiv_id'].isnull()]

# #  documents published on PubMed
# meta_df[meta_df['pubmed_id'].isnull()].count()

# #  documents published on PMC
# meta_df[meta_df['pmc_id'].isnull()].count()

Unnamed: 0,id,source_doc_id,source_metadata,arxiv_id,pmc_id,pubmed_id,venue,journal


In [9]:

grouped_df = (meta_df.groupby('source_doc_id').count())

# reused documents (probably used to support/refute conflict claims)
len(grouped_df[grouped_df['id'] > 1])



44

In [14]:
df.to_csv("claims.csv", na_rep = 'NA')