## read data

In [1]:
import os

In [2]:
# Build `sources`: one list of tagged headlines per file in ./data/headlines.
# Each headline is rendered as "(<source_id><line_index>) <text>", where
# <source_id> is the first character of the file name.
sources = []
with os.scandir("./data/headlines") as directory:
    # os.scandir yields entries in arbitrary order; sort by file name so the
    # order of `sources` is the same on every run and on every machine.
    entries = sorted(
        (entry for entry in directory if entry.is_file()),
        key=lambda entry: entry.name,
    )

for entry in entries:
    print(f"reading {entry.name}...")
    source_id = entry.name[0]
    with open(entry.path) as file:
        # Iterate the file lazily instead of materialising readlines().
        # Blank lines are skipped so they never become empty "headlines",
        # but the original line index is kept so IDs of real headlines
        # do not shift.
        headlines = [
            f"({source_id}{i}) {line.strip()}"
            for i, line in enumerate(file)
            if line.strip()
        ]
    sources.append(headlines)

reading ap.txt...
reading cnn.txt...
reading fox.txt...


## generate pairs

In [3]:
# ASYMMETRIC (disabled alternative): emits every ordered pair within a source, i.e. both (h1, h2) and (h2, h1)
# pairs = []
# for s in sources:
#     for h1 in s:
#         for h2 in s:
#             if h1 != h2:
#                 pairs.append((h1,h2))

In [4]:
# SYMMETRIC: every unordered pair of headlines from the same source is
# emitted exactly once, with the earlier headline in first position.
pairs = []
for source_headlines in sources:
    for pos, first in enumerate(source_headlines):
        # Only pair with headlines that come later in the same source.
        for second in source_headlines[pos + 1:]:
            pairs.append((first, second))

In [5]:
import random

In [6]:
# Shuffle in place with a dedicated, seeded generator so that
# Restart Kernel -> Run All reproduces the same ordering (and therefore the
# same labeler files) instead of a fresh random order on every run.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(pairs)

## assign labelers

In [7]:
import itertools

In [8]:
# Distribute every pair to `redundancy` different labelers.
#   num_labelers - number of labeling sets to produce
#   redundancy   - how many labelers see each pair
#   split_pairs  - split_pairs[k] is the list of pairs assigned to labeler k
num_labelers = 5
redundancy = 3
if redundancy > num_labelers:
    raise ValueError("redundancy cannot exceed num_labelers: a pair cannot be given to more distinct labelers than exist")
split_pairs = [[] for _ in range(num_labelers)]

# Deal the copies of each pair to consecutive labelers in round-robin order.
# Cycling over the whole pair list `redundancy` times (the previous approach)
# hands all copies of a pair to the SAME labeler whenever len(pairs) is a
# multiple of num_labelers; dealing copy-by-copy guarantees distinct labelers
# while keeping set sizes within one of each other. It also terminates for
# any redundancy value, including 0.
count = 0
for p in pairs:
    for _ in range(redundancy):
        split_pairs[count % num_labelers].append(p)
        count += 1

## write data

In [11]:
# Rough per-labeler time estimate in whole minutes (fraction truncated),
# based on the average number of labels per labeler.
labels_per_minute = 4
total_labels = len(pairs) * redundancy
labels_per_labeler = total_labels / num_labelers
expected_time = int(labels_per_labeler / labels_per_minute)

In [12]:
# Write one questionnaire file per labeler: an instruction header followed by
# a numbered block for each assigned pair.

# Create the output directory if needed so the cell works on a fresh checkout
# instead of failing with FileNotFoundError.
os.makedirs("./data/to_label", exist_ok=True)

separator = "-" * 20
for i in range(num_labelers):
    assigned = split_pairs[i]
    # Instruction header shown at the top of every set.
    parts = [
        "Suppose an average adult residing in the United States is viewing news headlines.\n"
        "If the subject views headline A then headline B, will headline A influence their opinion of headline B?\n"
        "Pick the option that you think is most likely (Y/N).\n"
        f"There are a total of {len(assigned)} questions.\n"
        f"The task is estimated to take {expected_time} minutes to complete.\n"
        "\n"
        f"{separator}\n"
        "\n"
    ]
    # One block per pair: question number, headline A, headline B, answer slot.
    for j, (headline_a, headline_b) in enumerate(assigned):
        parts.append(f"#{j+1}\n{headline_a}\n{headline_b}\nY/N\n\n{separator}\n\n")

    # Collect the pieces in a list and join once rather than growing a string.
    with open(f"./data/to_label/set_{i+1}.txt", "w") as file:
        file.write("".join(parts))