# Preprocess Comparison Data

In [None]:
import pandas as pd
import os
import spacy
nlp = spacy.load("en_core_web_md")

# Locations of the raw comparison corpus (CSV with a 'headline_text' column)
# and of the tokenized file that is written further below.
comp_data_dir = "../../data/trec_covid_topic_modelling"
f_comp_path = os.path.join(comp_data_dir, "abcnews-date-text.csv")
f_comp_out_path = os.path.join(comp_data_dir, "abcnews-date-text.csv.tokenized.txt")

# Malformed rows are skipped with a warning instead of aborting the load.
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv(f_comp_path, on_bad_lines="warn")
# Copy the column selection so that adding a column below modifies an
# independent frame and does not raise a SettingWithCopyWarning.
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index
documents = data_text

def preprocess_simple(texts):
    """Tokenize and normalize each text with the module-level spaCy pipeline.

    Every text is run through ``nlp.pipe``. A token is kept only if it is
    neither a stop word nor punctuation and its surface text is longer than
    three characters. Kept tokens that look like numbers are replaced by the
    placeholder ``"<NUM>"``; all others are replaced by their lowercased lemma.

    Args:
        texts: Iterable of strings accepted by ``nlp.pipe``.

    Returns:
        A list with one entry per input text, each entry being the list of
        normalized token strings for that text (possibly empty).
    """
    def _is_kept(tok):
        # Drop stop words, punctuation and very short tokens.
        return not (tok.is_stop or tok.is_punct) and len(tok.text) > 3

    def _normalized(tok):
        # Collapse anything number-like into a single placeholder symbol.
        return "<NUM>" if tok.like_num else tok.lemma_.lower()

    return [
        [_normalized(tok) for tok in parsed if _is_kept(tok)]
        for parsed in nlp.pipe(texts)
    ]

# Tokenize every headline; the result is one list of tokens per headline.
processed_docs_news = preprocess_simple(documents['headline_text'])

# Write one `<row number>,"<space-joined tokens>"` line per headline.
with open(f_comp_out_path, "w") as fp:
    for idx, doc in enumerate(processed_docs_news):
        # Double any embedded quote characters so that the quoted field stays
        # well-formed for CSV readers (the csv module's default `doublequote`
        # convention); rows without quotes are written exactly as before.
        doc_text = " ".join(doc).replace('"', '""')
        fp.write(f'{idx},"{doc_text}"\n')

# Create DTC Evolving Dataset Split

In [2]:
import pickle
import csv
import os
import sys
# Raise the csv module's per-field size limit to sys.maxsize so that very long
# text fields can be parsed.
csv.field_size_limit(sys.maxsize)

# Input and output locations, all below one data directory.
data_dir = "/home/tfink/data/kodicare/trec-covid/"
f_out_path = os.path.join(data_dir, "dtc_evolving")
f_trec_covid_path = os.path.join(data_dir, "TREC-COVID_complete_content.csv.tokenized.txt")
f_dtc_uid_path = os.path.join(data_dir, "TREC-COVID_dtc_evolving_corduids.pkl")

# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
with open(f_dtc_uid_path, "rb") as fp:
    split_datasets = pickle.load(fp)

# Sanity check: sizes of three of the splits and how strongly the first split
# overlaps with its direct successor and with the split at index 10.
s0 = set(split_datasets[0])
s1 = set(split_datasets[1])
s_x = set(split_datasets[10])
print(*map(len, (s0, s1, s_x, s0 & s1, s0 & s_x)), sep="\n")

29488
29488
29488
26540
8


In [3]:
# Map each cord_uid to its document text. When a cord_uid occurs more than
# once, the longest text is kept (on equal length the later row wins).
documents = {}
# newline="" leaves line-ending handling to the csv module, as its
# documentation recommends for file objects passed to csv.reader.
with open(f_trec_covid_path, "r", newline="") as fp:
    reader = csv.reader(fp, delimiter=",", quotechar='"')
    for line in reader:
        if not line:
            # csv.reader yields an empty list for blank lines; skip them
            # instead of failing on the two-value unpacking below.
            continue
        # Rows are expected to hold exactly two fields; anything else still
        # raises ValueError so that malformed input is not silently accepted.
        cord_uid, doc_text = line
        previous = documents.get(cord_uid)
        if previous is not None and len(previous) > len(doc_text):
            continue
        documents[cord_uid] = doc_text

In [7]:
# Make sure the output directory exists before any split file is opened.
os.makedirs(f_out_path, exist_ok=True)

# IDs of the most recently saved split; a split is written only if it shares
# no ID with that one.
last_ids = set()

# for now, only save datasets that do not overlap at all
for idx, ds in enumerate(split_datasets):
    ds_ids = set(ds)  # built once, used for both the overlap test and the update
    if last_ids & ds_ids:
        print(f"Skipping ds {idx}")
        continue
    print(f"Saving ds {idx}")
    last_ids = ds_ids
    with open(os.path.join(f_out_path, f"{idx}.csv"), "w") as fp:
        for cord_uid in ds:
            # Double embedded quote characters so the quoted field remains
            # parseable by csv readers; texts without quotes are written
            # exactly as before.
            doc_text = documents[cord_uid].replace('"', '""')
            fp.write(f'{cord_uid},"{doc_text}"\n')

Saving ds 0
Skipping ds 1
Skipping ds 2
Skipping ds 3
Skipping ds 4
Skipping ds 5
Skipping ds 6
Skipping ds 7
Skipping ds 8
Skipping ds 9
Skipping ds 10
Saving ds 11
Skipping ds 12
Skipping ds 13
Skipping ds 14
Skipping ds 15
Skipping ds 16
Skipping ds 17
Skipping ds 18
Skipping ds 19
Skipping ds 20
Skipping ds 21
Saving ds 22
Skipping ds 23
Skipping ds 24
Skipping ds 25
Skipping ds 26
Skipping ds 27
Skipping ds 28
Skipping ds 29
Skipping ds 30
Skipping ds 31
Skipping ds 32
Saving ds 33
Skipping ds 34
Skipping ds 35
Skipping ds 36
Skipping ds 37
Skipping ds 38
Skipping ds 39
Skipping ds 40
