In [1]:
import xml.etree.ElementTree as xml
from glob import glob
import os
from tqdm import tqdm
import html
import re

In [2]:
# Glob pattern matching every entry directly under the raw-data directory.
raw_data_path = r"../data/raw/*"
# List of matched paths; the Method 1 loop iterates over it and parses each
# entry as XML.
path = glob(raw_data_path)

## Preprocessing functions

In [3]:
def text_preprocessing(text):
    """Clean one raw post.

    Three steps are applied in order:

    1. A space followed by ``#<digits>;`` (e.g. ``" #8217;"``) is replaced by
       the character that ``&#<digits>;`` denotes; the leading space is
       consumed.  Presumably the raw data lost the ``&`` of these character
       references -- confirm against the corpus.
    2. ``&amp;`` is replaced by ``&``.
    3. Every run of whitespace is collapsed to a single space and leading /
       trailing whitespace is removed.

    Args:
        text: Raw post text (``str``).

    Returns:
        The cleaned text.
    """
    # Require at least one digit: with `\d*` a bare " #;" was rewritten to
    # "&#;" (html.unescape leaves that untouched), injecting a stray '&'.
    text = re.sub(r"( #\d+;)",
                  lambda m: html.unescape('&' + m.group(1)[1:]),
                  text)

    # Replace '&amp;' with '&'
    text = text.replace('&amp;', '&')

    # Collapse whitespace runs (including newlines) and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

## Method 1: Combine each user's posts into a single `.txt` file

In [None]:

# Output directory for Method 1 (one combined .txt file per subject).
processed_data_path = '../data/processed'
# makedirs(..., exist_ok=True) is a no-op when the directory already exists and
# also creates missing parent directories, which a bare os.mkdir would not.
os.makedirs(processed_data_path, exist_ok=True)

In [5]:
# Write one "<subject>.txt" per raw XML file, one post per line.
for p in tqdm(path):
    # Derive the output name with os.path so it works with any path separator
    # and any extension length (the old `split('/')` / `[:-4]` assumed POSIX
    # paths and a 4-character ".xml" suffix).
    subject = os.path.splitext(os.path.basename(p))[0] + ".txt"
    subject_path = os.path.join(processed_data_path, subject)

    # Already converted on a previous run: skip.
    if os.path.isfile(subject_path):
        continue

    root = xml.parse(p).getroot()
    # A post without text is written as a single space so that every post
    # still occupies exactly one line.
    posts = [
        (text.text if text.text is not None else " ") + "\n"
        for text in root.findall("WRITING/TEXT")
    ]

    # As before, no file is created when the subject has no posts.
    if not posts:
        continue

    # Open the output once (instead of re-opening it in append mode for every
    # post) and fix the encoding so the result does not depend on the locale.
    with open(subject_path, 'w', encoding='utf-8') as f:
        f.writelines(posts)

100%|██████████| 2348/2348 [00:00<00:00, 6173.94it/s]


In [34]:
# path_labels = '../data/risk_golden_truth.txt'
# labels = {}
# with open(path_labels, 'r') as f:
#     for line in f:
#         subject, label = line.split()
#         labels[subject] = int(label)
# labels


## Method 2: Each user post becomes its own `{subject}_{i}.txt` sample

In [3]:
# Output directory for Method 2 (one .txt file per post).
split_data_path = '../data/split'
# Idempotent directory creation; also creates missing parents, unlike os.mkdir.
os.makedirs(split_data_path, exist_ok=True)

In [4]:
# Load the ground-truth file into `users` as a list of (subject, label) string
# pairs. Each non-blank line must hold exactly two whitespace-separated fields.
path_labels = os.path.join('../data/', "risk_golden_truth.txt")
users = []
with open(path_labels, 'r') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which would otherwise
        # raise ValueError when unpacked below.
        if not line.strip():
            continue
        subject, label = line.split()
        users.append((subject, label))

In [5]:
# Write every post of every subject to its own "<subject>_<i>.txt" file and
# record "<subject>_<i> <label>" for it in the split ground-truth file.
ground_truth_subtexts = os.path.join('../data/', 'risk_golden_truth_split.txt')

# Open the label file once, in write mode: the previous per-post append
# re-opened the file for every post and duplicated every label line each time
# the cell was re-run.
with open(ground_truth_subtexts, 'w', encoding='utf-8') as labels_file:
    for subject, label in tqdm(users):
        user_file = os.path.join('../data', 'raw', subject + '.xml')

        root = xml.parse(user_file).getroot()
        for i, text in enumerate(root.findall("WRITING/TEXT")):
            # Posts without text are skipped but still consume an index, so
            # `i` is the position of the post in the XML file.
            if text.text is None:
                continue

            # xml to txt (explicit encoding: posts may contain non-ASCII text)
            subject_subtext_path = os.path.join(split_data_path, f"{subject}_{i}.txt")
            with open(subject_subtext_path, 'w', encoding='utf-8') as f:
                f.write(text.text)

            # generate labels: every post inherits the label of its subject
            labels_file.write(f'{subject}_{i} {label}\n')


100%|██████████| 2348/2348 [01:42<00:00, 22.88it/s]


## Method 3: Text chunking

In [14]:
# Output directory for Method 3 (one .txt file per chunk).
chunk_data_path = '../data/chunked'
# Idempotent directory creation; also creates missing parents, unlike os.mkdir.
os.makedirs(chunk_data_path, exist_ok=True)

In [15]:
# Maximum number of whitespace-separated words allowed in a single chunk; the
# chunking loop compares accumulated word counts against this limit.
MAX_LEN = 512


In [16]:
# Load (subject, label) string pairs from the ground-truth file. This is the
# same file the Method 2 section reads; it is re-read here into `users`.
path_labels = os.path.join('../data/', "risk_golden_truth.txt")
users = []
with open(path_labels, 'r') as f:
    for line in f:
        # A blank line (e.g. a trailing empty line) cannot be unpacked into
        # two fields and used to raise ValueError; skip it instead.
        if not line.strip():
            continue
        subject, label = line.split()
        users.append((subject, label))

In [17]:
# Class balance over subjects: labels are the strings read from the
# ground-truth file, so they are converted to int before summing.
ones = sum(int(flag) for _, flag in users)
zeros = len(users) - ones
print("1:", ones)
print("0:", zeros)


1: 164
0: 2184


In [18]:
ground_truth_chunks = os.path.join('../data/', 'risk_golden_truth_chunks.txt')

# Sliding-window stride (in posts) per label. A smaller stride makes the window
# advance more slowly, so label '1' subjects produce more, more heavily
# overlapping chunks than label '0' subjects.
# NOTE(review): presumably meant to counter the class imbalance -- confirm.
CHUNK_DELAY_BY_LABEL = {'1': 3, '0': 5}


def _split_oversized(p_text, max_len):
    """Return `p_text` as a list of (text, word_count) pieces of <= max_len words.

    A post that already fits is returned unchanged as a single piece.
    """
    words = p_text.split()
    if len(words) <= max_len:
        return [(p_text, len(words))]
    pieces = []
    for begin in range(0, len(words), max_len):
        part = words[begin:begin + max_len]
        pieces.append((" ".join(part), len(part)))
    return pieces


def _build_chunks(texts, max_len, stride):
    """Greedily pack consecutive posts into overlapping chunks.

    Starting at a window position, posts are concatenated (each prefixed with a
    space) until the next one would push the chunk over `max_len` words or the
    posts run out. The window then advances by ``min(posts_used, stride)``
    posts and the process repeats until no posts are left.

    Args:
        texts: list of (text, word_count) tuples, each with word_count <= max_len.
        max_len: maximum number of words per chunk.
        stride: maximum number of posts the window advances per chunk.

    Returns:
        List of chunk strings.
    """
    chunks = []
    start = 0
    while start < len(texts):
        sub_chunk = ''
        size = 0  # running word count; avoids re-splitting the chunk each step
        end = start
        while end < len(texts) and size + texts[end][1] <= max_len:
            sub_chunk += " " + texts[end][0]
            size += texts[end][1]
            end += 1
        chunks.append(sub_chunk)
        passed = end - start
        # Always advance by at least one post. Previously a post longer than
        # max_len gave passed == 0, the window never moved, and the loop kept
        # appending empty chunks forever.
        start += max(1, min(passed, stride))
    return chunks


# Open the label file once, in write mode: appending per chunk re-opened the
# file for every chunk and duplicated all label lines when the cell was re-run.
with open(ground_truth_chunks, 'w', encoding='utf-8') as labels_file:
    for subject, label in tqdm(users):
        # Fail loudly on an unknown label instead of silently reusing the
        # stride left over from the previous subject.
        if label not in CHUNK_DELAY_BY_LABEL:
            raise ValueError(f"unexpected label {label!r} for {subject}")
        CHUNK_DELAY = CHUNK_DELAY_BY_LABEL[label]

        user_file = os.path.join('../data', 'raw', subject + '.xml')
        root = xml.parse(user_file).getroot()

        # Clean every post that has text; posts longer than MAX_LEN words are
        # split into MAX_LEN-word pieces so each item fits into a chunk.
        texts = []
        for text in root.findall("WRITING/TEXT"):
            if text.text is None:
                continue
            texts.extend(_split_oversized(text_preprocessing(text.text), MAX_LEN))

        chunks = _build_chunks(texts, MAX_LEN, CHUNK_DELAY)

        for i, chunk in enumerate(chunks):
            subject_subtext_path = os.path.join(chunk_data_path, f"{subject}_{i}.txt")
            # Explicit encoding: preprocessing decodes character references, so
            # chunks can contain non-ASCII characters.
            with open(subject_subtext_path, 'w', encoding='utf-8') as f:
                f.write(chunk)
            # generate labels: every chunk inherits the label of its subject
            labels_file.write(f'{subject}_{i} {label}\n')

  0%|          | 0/2348 [00:00<?, ?it/s]

In [None]:
# Read the chunk-level ground truth back as a list of (chunk_id, label) string
# pairs so the class balance after chunking can be inspected.
ground_truth_chunks = os.path.join('../data/', "risk_golden_truth_chunks.txt")
chunk_users = []
with open(ground_truth_chunks, 'r') as f:
    for line in f:
        # Skip blank lines, which cannot be unpacked into two fields and
        # would raise ValueError.
        if not line.strip():
            continue
        subject, label = line.split()
        chunk_users.append((subject, label))

In [None]:
# Class balance over chunks: count the entries labelled 1, the rest are 0.
chunk_labels = [int(flag) for _, flag in chunk_users]
ones = sum(chunk_labels)
print("1:", ones)
print("0:", len(chunk_labels) - ones)


1: 17471
0: 159766
