In [11]:
import ir_datasets
import pandas as pd
import os
from tqdm import tqdm
import shutil
import time,json,jsonlines
import xml.etree.ElementTree as ET

In [2]:
# Location of the original WashingtonPost.v4.tar.gz archive.
# Replace the value below with your own path, e.g.:
#WapoV4targzPath = "Your path"
WapoV4targzPath = "/home/jovyan/shared/Datasets/WashingtonPostData/WashingtonPost.v4.tar.gz"

In [5]:
def rename(WapoV4targzPath):
    """Repackage the WaPo v4 archive under v2 names and install it for ir_datasets.

    Steps, all performed below ``./Data`` (created if missing):

    1. copy ``WapoV4targzPath`` to ``./Data/WashingtonPost.v4.tar.gz``;
    2. extract it with the system ``tar``;
    3. rename ``TREC_Washington_Post_collection.v4.jl`` to ``...v2.jl`` and the
       ``WashingtonPost.v4`` folder to ``WashingtonPost.v2``;
    4. re-pack the folder as ``./Data/WashingtonPost.v2.tar.gz``;
    5. remove the scratch folder and the working copy of the v4 archive;
    6. copy the v2 archive to ``~/.ir_datasets/wapo/WashingtonPost.v2.tar.gz``,
       the location ir_datasets expects.

    No command goes through a shell, so paths containing spaces or shell
    metacharacters are handled safely.

    Args:
        WapoV4targzPath: path to the original ``WashingtonPost.v4.tar.gz``.

    Returns:
        None. Progress and failures are reported with ``print``; the function
        returns early after the first failing step.
    """
    # Local import: the notebook's import cell does not provide subprocess.
    import subprocess

    data_dir = "./Data"
    v4_name = "WashingtonPost.v4"
    v2_name = "WashingtonPost.v2"
    v4_archive = os.path.join(data_dir, v4_name + ".tar.gz")
    v2_archive = os.path.join(data_dir, v2_name + ".tar.gz")
    v4_dir = os.path.join(data_dir, v4_name)
    v2_dir = os.path.join(data_dir, v2_name)

    def run_tar(args, cwd=None):
        """Run ``tar`` with an argument list (no shell); return its exit status."""
        try:
            return subprocess.run(["tar"] + args, cwd=cwd).returncode
        except OSError as e:
            print("Error: could not run tar - %s." % e.strerror)
            return 1

    os.makedirs(data_dir, exist_ok=True)

    # Copy under a fixed name so the later steps do not depend on the source basename.
    made_copy = True
    try:
        shutil.copy(WapoV4targzPath, v4_archive)
    except shutil.SameFileError:
        # The source already is the working archive: use it, but never delete it.
        made_copy = False
    except OSError as e:
        print("Something Wrong of copy: %s." % e)
        return

    # Leftovers of an interrupted run would otherwise end up inside the new archive.
    for stale_dir in (v4_dir, v2_dir):
        shutil.rmtree(stale_dir, ignore_errors=True)

    print("Waiting for the untar...")
    if run_tar(["xf", v4_archive, "-C", data_dir]) != 0:
        print("Something Wrong of untar.")
        return

    jl_dir = os.path.join(v4_dir, "data")
    try:
        os.rename(
            os.path.join(jl_dir, "TREC_Washington_Post_collection.v4.jl"),
            os.path.join(jl_dir, "TREC_Washington_Post_collection.v2.jl"),
        )
    except OSError:
        print("Something Wrong of mv rename jsonl.")
        return
    try:
        os.rename(v4_dir, v2_dir)
    except OSError:
        print("Something Wrong of mv rename folder.")
        return

    print("Waiting for the rename...")
    # Run inside ./Data so the archive members start with "WashingtonPost.v2/".
    if run_tar(["-czf", v2_name + ".tar.gz", v2_name], cwd=data_dir) != 0:
        print("Something Wrong of re-tar.")
        return
    try:
        shutil.rmtree(v2_dir)
    except OSError as e:
        print("Error: %s - %s." % (e.filename, e.strerror))
    if made_copy:
        try:
            os.remove(v4_archive)
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))
    print("File renamed successfully")

    # The source file required is WashingtonPost.v2.tar.gz.
    # ir_datasets expects the above file to be copied/linked under ~/.ir_datasets/wapo/WashingtonPost.v2.tar.gz.
    target_dir = os.path.join(os.path.expanduser("~"), ".ir_datasets", "wapo")
    try:
        os.makedirs(target_dir, exist_ok=True)
        shutil.copy(v2_archive, os.path.join(target_dir, v2_name + ".tar.gz"))
    except OSError as e:
        print("Something Wrong of copy to ir-dataset directory: %s." % e)
        return
    print("File has been added in ir-dataset directory")

# The current ir_datasets library does not support WaPo v4.
# The most convenient workaround is to rename the v4 files to the v2 names it expects.
# https://ir-datasets.com/wapo.html
rename(WapoV4targzPath)

Waiting for the untar...
Waiting for the rename...
File renamed successfully
File has been added in ir-dataset directory


In [8]:
def consdict(WapoTuple):
    """Turn a positional WaPo document record into a plain dict.

    The first nine positions of ``WapoTuple`` are mapped, in order, to the keys
    listed in ``field_names``. A record with fewer than nine items raises
    ``IndexError``; any items past the ninth are ignored.
    """
    field_names = (
        'doc_id',
        'url',
        'title',
        'author',
        'published_date',
        'kicker',
        'body',
        'body_paras_html',
        'body_media',
    )
    return {name: WapoTuple[position] for position, name in enumerate(field_names)}

In [9]:
# Destination prefix for the final CSV parts. ConvertToCsv appends the file name
# by string concatenation, so keep the trailing slash.
outputpath = './Data/WapoV4_irdataset/'
# Load dataset
# "wapo/v2" is the ir_datasets id; rename() above installs the v4 data under the v2 file name.
dataset = ir_datasets.load("wapo/v2")

def ConvertToCsv(outputpath,dataset,chunk_size=200):
    """Export the non-opinion documents of ``dataset`` as CSV part files.

    Every document from ``dataset.docs_iter()`` is converted with ``consdict``.
    Documents whose ``kicker`` is one of the opinion-style labels in ``dumplist``
    are dropped; the rest are written in chunks of ``chunk_size`` rows to
    ``./Data/tmp/Part<n>.csv``. Each part is then re-read, its ``doc_id`` column
    is renamed to ``id``, and it is written to ``outputpath + 'Part<n>.csv'``.
    The temporary directory is removed at the end.

    Args:
        outputpath: prefix the part file name is appended to (normally a
            directory path ending with a slash). Its directory is created if missing.
        dataset: object exposing ``docs_iter()``.
        chunk_size: number of documents per part file (default 200).

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    tmppath = "./Data/tmp/"
    dumplist = ("Opinion","Opinions","Letters to the Editor","The Post’s View")

    # Start from an empty scratch directory so a re-run after a failure neither
    # crashes on mkdir nor picks up stale parts.
    shutil.rmtree(tmppath, ignore_errors=True)
    os.makedirs(tmppath, exist_ok=True)
    outdir = os.path.dirname(outputpath)
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    DocList = []
    DocCount = 0
    FileCount = 0
    for doc in dataset.docs_iter():
        DocDic = consdict(doc)
        if DocDic['kicker'] in dumplist:
            continue
        DocList.append(DocDic)
        DocCount += 1
        # Flush only right after a document was added. Testing DocCount % chunk_size
        # for skipped documents too would rewrite the previous part with an empty frame.
        if len(DocList) == chunk_size:
            FileCount += 1
            pd.DataFrame(DocList).to_csv(tmppath+'Part'+str(FileCount)+'.csv', index=False)
            DocList = []
    if DocList:
        # Remainder; skipped when empty so no zero-row part is created or counted.
        FileCount += 1
        pd.DataFrame(DocList).to_csv(tmppath+'Part'+str(FileCount)+'.csv', index=False)
    print("We got %s files, %s documents!"%(str(FileCount),str(DocCount)))

    files = [name for name in os.listdir(tmppath) if name != '.ipynb_checkpoints']

    # The CSV round trip is kept on purpose so the output matches earlier runs
    # (read_csv applies pandas' default missing-value parsing).
    for file in tqdm(files):
        try:
            temp = pd.read_csv(tmppath+file)
        except pd.errors.EmptyDataError:
            continue
        temp = temp.rename(columns={"doc_id":"id"})
        temp.to_csv(outputpath+file,index=False)
    shutil.rmtree(tmppath)
    print("Precessing succeeded!")

In [10]:
# Write the filtered collection as CSV parts under outputpath.
ConvertToCsv(outputpath,dataset)

[INFO] [starting] building docstore
docs_iter: 728626doc [16:37, 730.43doc/s] 
[INFO] [finished] docs_iter: [16:37] [728626doc] [730.43doc/s]
[INFO] [finished] building docstore [16:38]
  0%|          | 2/3371 [00:00<03:32, 15.83it/s]

We got 3371 files, 674050 documents!


100%|██████████| 3371/3371 [03:27<00:00, 16.22it/s]

Precessing succeeded!





In [1]:
# Input for the v3 conversion. Despite the variable name, this points at the
# extracted JSON-lines file (.jl), not at a tar.gz archive.
# Replace the value below with your own path, e.g.:
#WapoV3targzPath = "Your path"
WapoV3targzPath = '/home/jovyan/shared/Datasets/WashingtonPostData/WashingtonPost.v3/data/TREC_Washington_Post_collection.v3.jl'
# Destination prefix for the v3 CSV parts; keep the trailing slash because
# file names are appended by string concatenation.
v3outputpath='./Data/WapoV3/'

In [3]:
def ConsDic_V3(jsonfile, dictionary):
    """Copy the flattened fields of one WaPo JSON record into ``dictionary``.

    Keys are written in this order: ``id``, ``title``, ``author`` (required in
    ``jsonfile``; a missing one raises ``KeyError``), then ``url`` (taken from
    ``article_url``), ``published_date`` (taken from ``published_date`` or, failing
    that, ``publish_date``), ``type`` and ``source`` (each ``None`` when absent),
    and finally ``content``: the ``str()`` of every ``'content'`` value found in
    ``jsonfile['contents']``, joined without a separator. ``None`` entries and
    entries lacking a ``'content'`` key are skipped.

    Returns:
        The same ``dictionary`` object that was passed in.
    """
    for required in ('id', 'title', 'author'):
        dictionary[required] = jsonfile[required]

    dictionary['url'] = jsonfile['article_url'] if 'article_url' in jsonfile else None

    # 'published_date' takes precedence over the alternative spelling.
    date_key = next(
        (key for key in ('published_date', 'publish_date') if key in jsonfile),
        None,
    )
    dictionary['published_date'] = None if date_key is None else jsonfile[date_key]

    for optional in ('type', 'source'):
        dictionary[optional] = jsonfile[optional] if optional in jsonfile else None

    pieces = []
    for block in jsonfile['contents']:
        if block is None:
            continue
        try:
            pieces.append(str(block['content']))
        except KeyError:
            # Entry without a 'content' field: contributes nothing.
            pass
    dictionary['content'] = ''.join(pieces)

    return dictionary

In [10]:
def ConvertToCsv_v3(WapoV3targzPath,v3outputpath,chunk_size=2000):
    """Convert a WaPo v3 JSON-lines file into CSV part files.

    Each non-blank line of the input is parsed as JSON, flattened with
    ``ConsDic_V3`` and collected; every ``chunk_size`` documents are written to
    ``v3outputpath + 'Part<n>.csv'`` (n starts at 1).

    Args:
        WapoV3targzPath: path to the JSON-lines (``.jl``) collection file, read as UTF-8.
        v3outputpath: prefix the part file name is appended to (normally a
            directory path ending with a slash). Its directory is created if missing.
        chunk_size: number of documents per part file (default 2000).

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    outdir = os.path.dirname(v3outputpath)
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    def write_part(records, part_number):
        pd.DataFrame(records).to_csv(v3outputpath+'Part'+str(part_number)+'.csv', index=False)

    DocList = []
    DocCount = 0
    FileCount = 0
    with open(WapoV3targzPath,'r',encoding='UTF-8') as f:
        for line in tqdm(f):
            if not line.strip():
                # A blank line (e.g. trailing newline) is not a document.
                continue
            DocList.append(ConsDic_V3(json.loads(line), {}))
            DocCount += 1
            if len(DocList) == chunk_size:
                FileCount += 1
                write_part(DocList, FileCount)
                DocList = []
    if DocList:
        # Remainder; skipped when empty so no zero-row part is created or counted.
        FileCount += 1
        write_part(DocList, FileCount)
    print("We got %s files, %s documents!"%(str(FileCount),str(DocCount)))
        
# Convert the v3 JSON-lines collection into CSV parts under v3outputpath.
ConvertToCsv_v3(WapoV3targzPath,v3outputpath)

671947it [03:48, 2945.86it/s]


We got 336 files, 671947 documents!


In [51]:
# Reformat the topic files and append document information to each topic.
# This step should run after document expansion, because it reads the
# expanded CSV files under ./Data/WapoV4_expanded/.

# TREC News track topic files for the 2021 and 2020 editions.
path_to_topic2021 = './Data/Topic/trec2021/newsir21-topics.txt'
path_to_topic2020 = './Data/Topic/trec2020/newsir20-topics.txt'

def _parse_topics(path_to_topic, version):
    """Read a topic file into a DataFrame with one row per topic."""
    with open(path_to_topic, "r") as f:
        content = f.read()
    # Wrap the topics in a single <data> root in memory so the text parses as one
    # XML document. Any wrapper left by an earlier run is stripped first. The file
    # on disk is not modified.
    content = content.replace("<data>", "").replace("</data>", "")
    root = ET.fromstring("<data>\n" + content + "\n</data>")
    print("There are ",len(root)," topics.")

    # Children are read by position: num, docid, url (+ title, desc, narr for 2021).
    columns = ['num', 'docid', 'url']
    if version == "2021":
        columns += ['title', 'desc', 'narr']
    rows = []
    for topic in root:
        row = {name: topic[position].text.strip() for position, name in enumerate(columns)}
        if version == "2021":
            # The seventh child holds the subtopics, one per sub-element.
            row['subtopics'] = [sub.text.strip() for sub in topic[6]]
        rows.append(row)
    if version == "2021":
        columns = columns + ['subtopics']
    return pd.DataFrame(rows, columns=columns)


def _load_topic_documents(washingtonpath, docids):
    """Return the flattened collection records whose id is one of ``docids``."""
    wanted = set(docids)  # O(1) membership instead of scanning an array per line
    matched = []
    with open(os.path.expanduser(washingtonpath),'r',encoding='UTF-8') as f:
        for line in tqdm(f):
            if not line.strip():
                continue
            doc = json.loads(line)
            if doc['id'] not in wanted:
                continue
            matched.append(ConsDic_V3(doc, {}))
    return matched


def _attach_expansions(query_df, expanded_dir):
    """Fill the Query / Key_Words columns of ``query_df`` from the expanded CSV files."""
    query_df['Query'] = ''
    query_df['Key_Words'] = ''
    topic_ids = set(query_df['docid'])
    # Skip hidden entries such as '.ipynb_checkpoints'.
    files = sorted(name for name in os.listdir(expanded_dir) if not name.startswith('.'))
    for name in tqdm(files):
        temp = pd.read_csv(os.path.join(expanded_dir, name))
        hits = temp[temp['id'].isin(topic_ids)]
        for doc_id, query, key_words in zip(hits['id'], hits['Query'], hits['Key_Words']):
            mask = query_df['docid'] == doc_id
            # Single .loc call: chained `df[col].loc[mask] = ...` may write to a
            # temporary copy and leave query_df unchanged.
            query_df.loc[mask, 'Query'] = query
            query_df.loc[mask, 'Key_Words'] = key_words
    return query_df


def mergedoc_queries(path_to_topic,version,
                     washingtonpath='~/TREC_Washington_Post_collection.v4.jl',
                     expanded_dir='./Data/WapoV4_expanded/'):
    """Join the topics with their source documents and expanded queries.

    The topic file is parsed, each topic's ``docid`` is looked up in the
    JSON-lines collection, and the ``Query`` / ``Key_Words`` values from the CSV
    files in ``expanded_dir`` are attached. The result is written to
    ``./Data/Topic/trec<version>/topicquery.csv`` (directory created if missing).
    For version "2021" the document ``content`` column is renamed to ``qcontent``.

    Args:
        path_to_topic: path to the topic file; it is only read, never rewritten.
        version: "2021" or "2020"; selects which topic fields are read.
        washingtonpath: path to the collection ``.jl`` file (``~`` is expanded).
            Replace with your jsonl file.
        expanded_dir: directory holding the expanded CSV parts, each with
            ``id``, ``Query`` and ``Key_Words`` columns.

    Raises:
        ValueError: if ``version`` is neither "2021" nor "2020".
    """
    if version not in ("2021", "2020"):
        raise ValueError("Unsupported version %r; expected '2021' or '2020'." % (version,))

    df = _parse_topics(path_to_topic, version)

    DocList = _load_topic_documents(washingtonpath, df['docid'])
    print("We merged %s topics."%len(DocList))

    renames = {'id': 'docid'}
    if version == "2021":
        renames['content'] = 'qcontent'
    df2 = pd.DataFrame(DocList).rename(columns=renames)
    query_df = pd.merge(df,df2,on='docid')

    query_df = _attach_expansions(query_df, expanded_dir)
    print("expand extracted keywords and predicted queries to topic.")

    output_dir = "./Data/Topic/trec%s" % version
    os.makedirs(output_dir, exist_ok=True)
    query_df.to_csv(output_dir + "/topicquery.csv",index=False)
    print("Done")

In [53]:
# Build ./Data/Topic/trec2021/topicquery.csv from the 2021 topics.
mergedoc_queries(path_to_topic2021,"2021")

735it [00:00, 7348.52it/s]

There are  51  topics.


728626it [01:33, 7797.33it/s] 
  0%|          | 7/3218 [00:00<00:50, 63.76it/s]

We merged 51 topics.


100%|██████████| 3218/3218 [00:47<00:00, 67.86it/s]

expand extracted keywords and predicted queries to topic.
Done





In [52]:
# Build ./Data/Topic/trec2020/topicquery.csv from the 2020 topics.
mergedoc_queries(path_to_topic2020,"2020")

There are  50  topics.


  0%|          | 7/3218 [00:00<00:50, 62.97it/s]

We merged 50 topics.


100%|██████████| 3218/3218 [00:47<00:00, 68.09it/s]

expand extracted keywords and predicted queries to topic.
Done



