In [1]:
import xml.etree.ElementTree as xml
from glob import glob
import os
from tqdm import tqdm
import re

In [2]:
# Glob pattern matching every raw input file (one file per subject).
raw_data_path = r"../data/raw/*"
# glob() makes no ordering guarantee; sort so the processing order is the same
# on every run and every machine.
path = sorted(glob(raw_data_path))

## Preprocessing functions

In [None]:
def text_preprocessing(text):
    """Strip @-mentions, decode '&amp;' and normalise whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with no leading/trailing whitespace and every run
        of whitespace collapsed to a single space.
    """
    # Remove '@name'. Matching the '@' plus the following run of
    # non-whitespace (instead of requiring a trailing whitespace character)
    # also removes a mention that sits at the very end of the text.
    text = re.sub(r'@\S*', ' ', text)

    # Replace '&amp;' with '&'
    text = text.replace('&amp;', '&')

    # Collapse runs of whitespace into one space and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text



In [None]:
def remove_whitespace(text):
    """Collapse every run of whitespace in `text` to a single space.

    Leading and trailing whitespace is dropped as a side effect of splitting.
    """
    words = text.split()
    return " ".join(words)

In [None]:
def remove_URLs(text):
    """Return `text` with every URL deleted.

    A URL is anything starting with 'http://', 'https://' or 'www.' and
    running up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
def remove_punctuation(text):
    """Tokenize `text` into runs of word characters, dropping punctuation.

    Args:
        text: Either a single string or an iterable of strings (e.g. a list
            of tokens). An iterable is joined with spaces before tokenizing.

    Returns:
        A list of the `\\w+` tokens found, in order of appearance.
    """
    # A plain string must not go through " ".join(), which would insert a
    # space between every character and yield single-character tokens.
    if not isinstance(text, str):
        text = " ".join(text)
    # re.findall with r'\w+' produces the same tokens as NLTK's
    # RegexpTokenizer(r'\w+'), which this notebook never imported.
    return re.findall(r'\w+', text)

## Method 1: Combine each user's posts into a single `.txt` file

In [None]:

# Output directory for Method 1 (one combined .txt file per subject).
processed_data_path = '../data/processed'
# makedirs also creates missing parent directories, and exist_ok=True makes the
# cell safe to re-run without a separate existence check.
os.makedirs(processed_data_path, exist_ok=True)

In [5]:
# Write one text file per subject containing all of that subject's posts,
# one post per line block. Subjects whose output file already exists are
# skipped, so an interrupted run can be resumed.
for p in tqdm(path):
    # basename/splitext cope with any path separator and any extension
    # length, unlike splitting on '/' and slicing off four characters.
    subject = os.path.splitext(os.path.basename(p))[0] + ".txt"
    subject_path = os.path.join(processed_data_path, subject)
    if os.path.isfile(subject_path):
        continue
    root = xml.parse(p).getroot()
    posts = root.findall("WRITING/TEXT")
    if not posts:
        # No posts: create no output file for this subject.
        continue
    # Open the output once per subject rather than once per post; an explicit
    # encoding keeps non-ASCII post text from failing on platforms whose
    # default encoding is not UTF-8.
    with open(subject_path, 'w', encoding='utf-8') as f:
        for text in posts:
            # An empty <TEXT> element still contributes a placeholder line.
            f.write(text.text if text.text is not None else " ")
            f.write("\n")

100%|██████████| 2348/2348 [00:00<00:00, 6173.94it/s]


In [34]:
# path_labels = '../data/risk_golden_truth.txt'
# labels = {}
# with open(path_labels, 'r') as f:
#     for line in f:
#         subject, label = line.split()
#         labels[subject] = int(label)
# labels


## Method 2: Each of a user's posts becomes its own `{subject}_{i}.txt` file

In [3]:
# Glob pattern matching every raw input file (one file per subject).
raw_data_path = r"../data/raw/*"
# Output directory for Method 2 (one .txt file per individual post).
split_data_path = '../data/split'
# makedirs also creates missing parent directories, and exist_ok=True makes the
# cell safe to re-run without a separate existence check.
os.makedirs(split_data_path, exist_ok=True)

In [4]:
# Load the (subject, label) pairs from the ground-truth file, which holds one
# whitespace-separated "subject label" pair per line. Labels are kept as strings.
path_labels = os.path.join('../data/', "risk_golden_truth.txt")
users = []
with open(path_labels, 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file) instead of failing on the tuple unpack below.
            continue
        # A line with any other number of fields still raises ValueError.
        subject, label = fields
        users.append((subject, label))

In [5]:
# Write every non-empty post to its own "<subject>_<i>.txt" file and record the
# subject's label for each such post in a companion ground-truth file.
ground_truth_subtexts = os.path.join('../data/', 'risk_golden_truth_split.txt')
# Open the label file once, in write mode: re-running the cell then regenerates
# it instead of appending duplicate entries, and the file is no longer reopened
# for every single post.
with open(ground_truth_subtexts, 'w', encoding='utf-8') as labels_file:
    for subject, label in tqdm(users):
        user_file = os.path.join('../data', 'raw', subject+'.xml')

        root = xml.parse(user_file).getroot()
        for i, text in enumerate(root.findall("WRITING/TEXT")):
            # Empty posts get neither a text file nor a label line; `i` still
            # advances, so indices follow the post's position in the XML.
            if text.text is None:
                continue
            subject_subtext_path = os.path.join(split_data_path, f"{subject}_{i}.txt")
            # xml to txt
            with open(subject_subtext_path, 'w', encoding='utf-8') as f:
                f.write(text.text)
            # generate labels
            labels_file.write(f'{subject}_{i} {label}\n')


100%|██████████| 2348/2348 [01:42<00:00, 22.88it/s]


## Method 3: Text chunking