# Preprocess Comparison Data

In [1]:
import pandas as pd
import os
import spacy
import re
nlp = spacy.load("en_core_web_md")

# Input CSV with the comparison headlines and the tokenized text file that
# this notebook section writes next to it.
comp_data_dir = "../../data/trec_covid_topic_modelling"
f_comp_path = os.path.join(comp_data_dir, "abcnews-date-text.csv")
f_comp_out_path = os.path.join(comp_data_dir, "abcnews-date-text.csv.tokenized_quick.txt")

# `error_bad_lines=False` is deprecated (and removed in pandas 2.0).
# `on_bad_lines="warn"` keeps the same behaviour: malformed rows are dropped
# and a warning is emitted for each of them.
data = pd.read_csv(f_comp_path, on_bad_lines="warn")

# Work on an explicit copy: adding a column to a slice of `data` is the
# pattern that triggers pandas' SettingWithCopyWarning.
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index
documents = data_text

def preprocess_simple(texts):
    """Tokenize ``texts`` with the module-level spaCy pipeline ``nlp``.

    For every document produced by ``nlp.pipe(texts)`` a list of strings is
    built from its tokens:

    * stop words, punctuation and tokens whose text has 3 characters or
      fewer are dropped;
    * number-like tokens are replaced by the placeholder ``"<NUM>"``;
    * every other token contributes its lower-cased lemma.

    Returns a list with one token list per input document.
    """
    tokenized_docs = []
    for parsed in nlp.pipe(texts):
        # First decide which tokens survive the filter ...
        kept = [
            tok for tok in parsed
            if not (tok.is_stop or tok.is_punct) and len(tok.text) > 3
        ]
        # ... then map each survivor to its output form.
        tokenized_docs.append(
            ["<NUM>" if tok.like_num else tok.lemma_.lower() for tok in kept]
        )
    return tokenized_docs

# Matches either a run of characters that are neither word characters nor
# whitespace, or a run of digits. Every match is replaced by a single space.
clean_p = re.compile(r'[^\w\s]+|\d+', re.UNICODE)


def preprocess_quick(texts):
    """Regex-only tokenizer: strip punctuation/digits, lower-case, split.

    Each string in ``texts`` has every ``clean_p`` match replaced by a space,
    is lower-cased and then split on whitespace.

    Returns a list with one token list per input string.
    """
    return [clean_p.sub(' ', text).lower().split() for text in texts]

# Alternative (spaCy-based) tokenizer, kept for reference:
#processed_docs_news = preprocess_simple(documents['headline_text'])
processed_docs_news = preprocess_quick(documents['headline_text'])

# Write one document per line, tokens joined by single spaces.
# (CSV-style alternative that was tried: fp.write(f'{idx},"{doc_text}"\n'))
with open(f_comp_out_path, "w") as fp:
    fp.writelines(" ".join(tokens) + "\n" for tokens in processed_docs_news)



  data = pd.read_csv(f_comp_path, error_bad_lines=False)


# Create DTC Evolving Dataset Split

In [4]:
import pickle
import csv
import os
import sys
# Raise the csv module's per-field size limit to sys.maxsize
# (presumably because single document fields can be very long).
csv.field_size_limit(sys.maxsize)

# Input files (pickled uid splits, tokenized full content, tokenized
# abstracts) and the two output directories, all below `data_dir`.
data_dir = "/home/tfink/data/kodicare/trec-covid/"
f_dtc_uid_path = os.path.join(data_dir, "TREC-COVID_dtc_evolving_corduids.pkl")
f_trec_covid_path = os.path.join(data_dir, "TREC-COVID_complete_content.csv.tokenized_lm.txt")
f_trec_covid_abstracts_path = os.path.join(data_dir, "TREC-COVID_complete_content_abstracts.csv.tokenized_lm.txt")
f_out_path = os.path.join(data_dir, "dtc_evolving_lm")
f_abstracts_out_path = os.path.join(data_dir, "dtc_evolving_abstracts_lm")

# When False, every split is written even if it overlaps the previous one.
skip_overlap = False

# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(f_dtc_uid_path, "rb") as fp:
    split_datasets = pickle.load(fp)

# Quick sanity check: sizes of three splits and how much they overlap.
s0, s1, s_x = (set(split_datasets[i]) for i in (0, 1, 10))
print(len(s0), len(s1), len(s_x), len(s0 & s1), len(s0 & s_x), sep="\n")

29488
29488
29488
26540
8


In [5]:
def load_documents(path):
    """Load one text per ``cord_uid`` from a two-column CSV file.

    Every non-empty row must have the form ``cord_uid,doc_text``
    (comma-delimited, ``"`` as quote character). If a ``cord_uid`` occurs
    more than once, the longest text is kept; when two texts have the same
    length the later row wins.

    Args:
        path: Path of the CSV file to read.

    Returns:
        dict mapping ``cord_uid`` to its ``doc_text``.

    Raises:
        ValueError: If a non-empty row does not have exactly two columns.
    """
    documents = {}
    # newline="" hands raw line endings to the csv module, which is what its
    # documentation requires for quoted fields containing newlines.
    with open(path, "r", newline="") as fp:
        reader = csv.reader(fp, delimiter=",", quotechar='"')
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing on the two-value unpacking below.
                continue
            cord_uid, doc_text = row
            previous = documents.get(cord_uid)
            if previous is not None and len(previous) > len(doc_text):
                # A strictly longer text is already stored for this uid.
                continue
            documents[cord_uid] = doc_text
    return documents


def load_document_sentences(path):
    """Load all texts per ``cord_uid`` from a two-column CSV file.

    Every non-empty row must have the form ``cord_uid,doc_text``
    (comma-delimited, ``"`` as quote character). Texts sharing a
    ``cord_uid`` are collected in file order.

    Args:
        path: Path of the CSV file to read.

    Returns:
        dict mapping ``cord_uid`` to the list of its ``doc_text`` values.

    Raises:
        ValueError: If a non-empty row does not have exactly two columns.
    """
    documents = {}
    # newline="" hands raw line endings to the csv module, which is what its
    # documentation requires for quoted fields containing newlines.
    with open(path, "r", newline="") as fp:
        reader = csv.reader(fp, delimiter=",", quotechar='"')
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing on the two-value unpacking below.
                continue
            cord_uid, doc_text = row
            documents.setdefault(cord_uid, []).append(doc_text)
    return documents


def save_split_documents(out_path, documents, split_datasets):
    """Write non-overlapping dataset splits as ``<idx>.csv`` files.

    Splits are visited in order. A split is skipped when it shares at least
    one id with the most recently *saved* split (not with every split saved
    so far); otherwise it is written to ``out_path/<idx>.csv`` with one
    ``cord_uid,"doc_text"`` row per id. Progress is printed for every split.

    Args:
        out_path: Existing directory the CSV files are written to.
        documents: Mapping ``cord_uid -> doc_text``.
        split_datasets: Sequence of id collections, one per split.

    Raises:
        KeyError: If a split contains an id that is missing in ``documents``.
    """
    last_ids = set()

    for idx, ds in enumerate(split_datasets):
        ds_ids = set(ds)
        if last_ids & ds_ids:
            print(f"Skipping ds {idx}")
            continue
        print(f"Saving ds {idx}")
        last_ids = ds_ids
        with open(os.path.join(out_path, f"{idx}.csv"), "w") as fp:
            for cord_uid in ds:
                # Double embedded quote characters so the quoted field stays
                # valid CSV and can be read back with csv.reader.
                doc_text = str(documents[cord_uid]).replace('"', '""')
                fp.write(f'{cord_uid},"{doc_text}"\n')


def save_split_documents_lm(out_path, documents, split_datasets, skip_overlap=True):
    """Write dataset splits as plain-text ``<idx>.txt`` files, one sentence per line.

    Splits are visited in order. With ``skip_overlap`` enabled, a split that
    shares at least one id with the most recently saved split is skipped;
    with it disabled every split is written. Progress is printed for every
    split.

    Args:
        out_path: Existing directory the text files are written to.
        documents: Mapping ``cord_uid -> iterable of sentences``.
        split_datasets: Sequence of id collections, one per split.
        skip_overlap: Whether to skip splits overlapping the last saved one.
    """
    previous_ids = set()

    for idx, ds in enumerate(split_datasets):
        current_ids = set(ds)
        if skip_overlap and (previous_ids & current_ids):
            print(f"Skipping ds {idx}")
            continue

        print(f"Saving ds {idx}")
        previous_ids = current_ids
        target = os.path.join(out_path, f"{idx}.txt")
        with open(target, "w") as fp:
            # Sentences are emitted in split order, then in document order.
            fp.writelines(
                f'{sent}\n'
                for cord_uid in ds
                for sent in documents[cord_uid]
            )

In [4]:
# Group the tokenized full-content sentences by cord_uid and write one text
# file per split.
# NOTE(review): the output recorded for this cell shows splits being skipped,
# which does not match skip_overlap = False; it looks stale. Re-run to confirm.
documents = load_document_sentences(f_trec_covid_path)
#save_split_documents(out_path=f_out_path, documents=documents, split_datasets=split_datasets)
save_split_documents_lm(
    out_path=f_out_path,
    documents=documents,
    split_datasets=split_datasets,
    skip_overlap=skip_overlap,
)
# Drop the reference so the mapping can be garbage collected.
documents = None

Saving ds 0
Skipping ds 1
Skipping ds 2
Skipping ds 3
Skipping ds 4
Skipping ds 5
Skipping ds 6
Skipping ds 7
Skipping ds 8
Skipping ds 9
Skipping ds 10
Saving ds 11
Skipping ds 12
Skipping ds 13
Skipping ds 14
Skipping ds 15
Skipping ds 16
Skipping ds 17
Skipping ds 18
Skipping ds 19
Skipping ds 20
Skipping ds 21
Saving ds 22
Skipping ds 23
Skipping ds 24
Skipping ds 25
Skipping ds 26
Skipping ds 27
Skipping ds 28
Skipping ds 29
Skipping ds 30
Skipping ds 31
Skipping ds 32
Saving ds 33
Skipping ds 34
Skipping ds 35
Skipping ds 36
Skipping ds 37
Skipping ds 38
Skipping ds 39
Skipping ds 40


In [6]:
# Same export as for the full content, but for the tokenized abstracts file
# and its own output directory.
documents = load_document_sentences(f_trec_covid_abstracts_path)
#save_split_documents(out_path=f_abstracts_out_path, documents=documents, split_datasets=split_datasets)
save_split_documents_lm(
    out_path=f_abstracts_out_path,
    documents=documents,
    split_datasets=split_datasets,
    skip_overlap=skip_overlap,
)
# Drop the reference so the mapping can be garbage collected.
documents = None

Saving ds 0
Saving ds 1
Saving ds 2
Saving ds 3
Saving ds 4
Saving ds 5
Saving ds 6
Saving ds 7
Saving ds 8
Saving ds 9
Saving ds 10
Saving ds 11
Saving ds 12
Saving ds 13
Saving ds 14
Saving ds 15
Saving ds 16
Saving ds 17
Saving ds 18
Saving ds 19
Saving ds 20
Saving ds 21
Saving ds 22
Saving ds 23
Saving ds 24
Saving ds 25
Saving ds 26
Saving ds 27
Saving ds 28
Saving ds 29
Saving ds 30
Saving ds 31
Saving ds 32
Saving ds 33
Saving ds 34
Saving ds 35
Saving ds 36
Saving ds 37
Saving ds 38
Saving ds 39
Saving ds 40
