## read data

In [30]:
import os

In [31]:
# Load every headline file in the directory; each file becomes one list of
# tagged headlines, and `sources` collects those lists.
sources = []
with os.scandir("./data/2022-07/headlines") as directory:
    # scandir yields entries in arbitrary order; sorting by name makes the
    # order of `sources` identical on every run and every machine.
    for entry in sorted(directory, key=lambda e: e.name):
        if not entry.is_file():
            continue
        print(f"reading {entry.name}...")
        # The first character of the file name tags each headline with its
        # source; together with the line index it forms the headline's id.
        source_id = entry.name[0]
        # Explicit encoding so the result does not depend on the platform locale.
        with open(entry.path, encoding="utf-8") as file:
            headlines = [
                f"({source_id}{i}) {line.strip()}"
                for i, line in enumerate(file)
            ]
        sources.append(headlines)

reading at.txt...
reading tf.txt...


## generate pairs

In [32]:
# Build every unordered pair of headlines that share a source. Headlines from
# different sources are never paired with each other. Within a source the
# pairs come out in (earlier line, later line) order.
pairs = [
    (headlines[first], headlines[second])
    for headlines in sources
    for first in range(len(headlines) - 1)
    for second in range(first + 1, len(headlines))
]

In [33]:
import random

In [34]:
# Randomise the order of the pairs reproducibly.
# Sorting first puts the list in a canonical order, so re-running this cell
# (or the whole notebook) always yields the same permutation instead of
# shuffling an already-shuffled list. A dedicated Random instance keeps the
# seed from affecting any other use of the global `random` state.
pairs.sort()
random.Random(0).shuffle(pairs)

## assign labelers

In [35]:
import itertools

In [36]:
# Configuration: number of labelers, and how many different labelers should
# see each pair.
num_labelers = 10
redundancy = 5
assert redundancy <= num_labelers  # otherwise one labeler would have to see the same pair twice

# Every pair is handed out `redundancy` times and the copies are spread evenly
# over the labelers. divmod keeps task_size an exact int rather than a float.
total_assignments = len(pairs) * redundancy
task_size, leftover = divmod(total_assignments, num_labelers)
assert leftover == 0  # if it doesn't divide evenly, need to be more careful with partitioning

# Labeler k receives the k-th consecutive run of `task_size` pairs, walking
# the pair list cyclically (wrapping around to the start when the end is hit).
split_pairs = [
    [pairs[(k * task_size + offset) % len(pairs)] for offset in range(task_size)]
    for k in range(num_labelers)
]

# sanity check: no labeler is given the same pair more than once
for task in split_pairs:
    assert len(set(task)) == len(task)

## write data

In [37]:
# Assumed labeling speed, used only to compute the time estimate below.
labels_per_minute = 2.5
# Estimated minutes to finish one task file; int() truncates, so the figure is rounded down.
expected_time = int(task_size / labels_per_minute)

In [38]:
# Write one plain-text task file per labeler: an instruction header followed
# by that labeler's numbered headline pairs, each with a "Y/N/M" placeholder
# for the answer.
output_dir = "./data/to_label"
os.makedirs(output_dir, exist_ok=True)  # a fresh checkout may not have the directory yet

for i in range(num_labelers):
    header = f"""Suppose an average adult residing in the United States is viewing news headlines.
If the subject views headline A and headline B together,
will their impression of either story likely be different
from what it would have been if the subject had viewed them individually?
I.e., would viewing the headline of one story influence their
opinion on the veracity of the content of the other story or
the causes, effects, or benefits of the events discussed within?

For each question, you may answer "yes", "no", or "maybe".
Please replace "Y/N/M" with the corresponding letter.

There are a total of {len(split_pairs[i])} questions.
The task is estimated to take {expected_time} minutes to complete.

{'-'*20}

"""
    # One entry per pair; joined once at the end instead of growing a string with +=.
    questions = [
        f"#{j+1}\n{line[0]}\n{line[1]}\nY/N/M\n\n{'-'*20}\n\n"
        for j, line in enumerate(split_pairs[i])
    ]

    # Explicit encoding so headlines with non-ASCII characters are written
    # the same way regardless of the platform locale.
    with open(f"{output_dir}/set_{i+1}.txt", "w", encoding="utf-8") as file:
        file.write(header + "".join(questions))