In [4]:
import xml.etree.ElementTree as xml
from glob import glob
import os
from tqdm import tqdm
import html
import re
import time

In [5]:
# Glob pattern matching every entry directly under the raw-data directory.
raw_data_path = r"../data/raw/*"
# NOTE: despite the singular name, `path` is the *list* of matching file paths
# returned by glob (order is whatever glob yields; it is not sorted).
path = glob(raw_data_path)

## Preprocessing functions

In [6]:
def text_preprocessing(text):
    """Normalise one raw post into a single clean line of text.

    Steps, in order:
      1. Strip trailing whitespace, then decode every " #<digits>;" sequence
         (a space followed by a numeric character reference that lacks its
         leading '&') into the character it denotes, e.g. " #233;" -> "é".
         The leading space is consumed by the replacement.
      2. Replace the literal entity '&amp;' with '&'.
      3. Collapse every run of whitespace (including newlines) into a single
         space and strip both ends.

    Args:
        text: Raw post content (str).

    Returns:
        The cleaned, single-line string.
    """
    # Require at least one digit: with `\d*` a bare " #;" would also match and
    # be rewritten to "&#;", silently eating the space and injecting an '&'.
    text = re.sub(r" (#\d+;)",
                  lambda m: html.unescape('&' + m.group(1)),
                  text.rstrip())

    # Replace '&amp;' with '&'
    text = text.replace('&amp;', '&')

    # Collapse whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', text).strip()

## Method 1: Combine each user's posts into a single `.txt` file

In [7]:

# Output directory for Method 1 (one combined .txt file per user).
processed_data_path = '../data/processed'
# makedirs(exist_ok=True) replaces the check-then-mkdir pair: it has no race
# between the check and the creation, and it also creates missing parents.
os.makedirs(processed_data_path, exist_ok=True)

In [8]:
# Convert every raw XML file into one text file holding one post per line.
for p in tqdm(path):
    # Derive "<name>.txt" from the file name portably: basename/splitext work
    # with any path separator and any extension length.
    subject = os.path.splitext(os.path.basename(p))[0] + ".txt"
    subject_path = os.path.join(processed_data_path, subject)
    # Already converted on a previous run: skip.
    if os.path.isfile(subject_path):
        continue
    root = xml.parse(p).getroot()
    # One output line per WRITING/TEXT element; an element without text
    # becomes a single space so the line count still matches the post count.
    lines = [
        text_preprocessing(node.text) if node.text is not None else " "
        for node in root.findall("WRITING/TEXT")
    ]
    # No posts at all: do not create an (empty) file.
    if not lines:
        continue
    # Open the output once per user and write everything in one go. UTF-8 is
    # explicit because preprocessing can decode non-ASCII characters.
    with open(subject_path, 'w', encoding='utf-8') as f:
        f.writelines(line + "\n" for line in lines)

100%|██████████| 2348/2348 [00:49<00:00, 47.21it/s]


## Method 2: Write each user's posts to separate `<subject>_<i>.txt` files

In [9]:
# Output directory for Method 2 (one .txt file per individual post).
split_data_path = '../data/split'
# makedirs(exist_ok=True) replaces the check-then-mkdir pair: it has no race
# between the check and the creation, and it also creates missing parents.
os.makedirs(split_data_path, exist_ok=True)

In [10]:
# Load the per-user labels: each non-empty line holds "<subject> <label>".
path_labels = os.path.join('../data/', "risk_golden_truth.txt")
users = []
with open(path_labels, 'r') as f:
    for line in f:
        # A blank line (e.g. a trailing newline at the end of the file) would
        # make the two-way unpack below raise ValueError, so skip it.
        if not line.strip():
            continue
        subject, label = line.split()
        users.append((subject, label))

In [11]:
# Write every non-empty post of every labelled user to its own file and record
# one "<subject>_<i> <label>" row per written post.
ground_truth_subtexts = os.path.join('../data/', 'risk_golden_truth_split.txt')
# The label file is opened once in 'w' mode: re-running this cell rebuilds it
# from scratch instead of appending a second copy of every row.
with open(ground_truth_subtexts, 'w', encoding='utf-8') as labels_file:
    for subject, label in tqdm(users):
        user_file = os.path.join('../data', 'raw', subject + '.xml')

        root = xml.parse(user_file).getroot()
        # `i` is the post's position among all WRITING/TEXT elements, so
        # skipped (empty) posts leave gaps in the numbering.
        for i, text in enumerate(root.findall("WRITING/TEXT")):
            if text.text is None:
                continue
            subject_subtext_path = os.path.join(split_data_path, f"{subject}_{i}.txt")
            # xml to txt (UTF-8: preprocessing can decode non-ASCII characters)
            with open(subject_subtext_path, 'w', encoding='utf-8') as f:
                f.write(text_preprocessing(text.text))
            # generate labels: the post inherits its author's label
            labels_file.write(f'{subject}_{i} {label}\n')


100%|██████████| 2348/2348 [01:47<00:00, 21.75it/s]


## Method 3: Text chunking

In [12]:
# Load the per-user labels: each non-empty line holds "<subject> <label>".
path_labels = os.path.join('../data/', "risk_golden_truth.txt")
users = []
with open(path_labels, 'r') as f:
    for line in f:
        # A blank line (e.g. a trailing newline at the end of the file) would
        # make the two-way unpack below raise ValueError, so skip it.
        if not line.strip():
            continue
        subject, label = line.split()
        users.append((subject, label))

In [13]:
# Output directory for Method 3 (fixed-size word chunks per user).
chunk_data_path = '../data/chunked'
# makedirs(exist_ok=True) replaces the check-then-mkdir pair: it has no race
# between the check and the creation, and it also creates missing parents.
os.makedirs(chunk_data_path, exist_ok=True)
# Maximum number of whitespace-separated words per chunk.
MAX_LEN = 512


In [14]:
# Concatenate each user's posts into one word list, cut it into chunks of at
# most MAX_LEN words, and write one file plus one label row per chunk.
ground_truth_chunks = os.path.join('../data/', 'risk_golden_truth_chunks.txt')
# Distance (in words) between consecutive chunk starts for label '1' users.
# Being smaller than MAX_LEN it makes their chunks overlap, so that class
# yields more chunks per user than label '0' (which uses a stride of MAX_LEN).
POSITIVE_CHUNK_STRIDE = 100

# The label file is opened once in 'w' mode: re-running this cell rebuilds it
# from scratch instead of appending a second copy of every row.
with open(ground_truth_chunks, 'w', encoding='utf-8') as labels_file:
    for subject, label in tqdm(users):
        if label == '1':
            CHUNK_DELAY = POSITIVE_CHUNK_STRIDE
        elif label == '0':
            CHUNK_DELAY = MAX_LEN
        else:
            # Previously an unexpected label either raised NameError (first
            # user) or silently reused the previous user's stride.
            raise ValueError(f"Unexpected label {label!r} for subject {subject!r}")
        user_file = os.path.join('../data', 'raw', subject + '.xml')

        root = xml.parse(user_file).getroot()
        texts = []
        for text in root.findall("WRITING/TEXT"):
            if text.text is None:
                continue
            texts.extend(text_preprocessing(text.text).split())

        # Chunk starts are 0, CHUNK_DELAY, 2*CHUNK_DELAY, ... while still inside
        # the word list; each chunk holds up to MAX_LEN words. This yields the
        # same chunks as repeatedly slicing the front off the list, without
        # copying the remaining words on every step.
        for i, start in enumerate(range(0, len(texts), CHUNK_DELAY)):
            chunk = texts[start:start + MAX_LEN]
            subject_subtext_path = os.path.join(chunk_data_path, f"{subject}_{i}.txt")
            with open(subject_subtext_path, 'w', encoding='utf-8') as f:
                f.write(" ".join(chunk))
            # generate labels: the chunk inherits its author's label
            labels_file.write(f'{subject}_{i} {label}\n')

100%|██████████| 2348/2348 [00:45<00:00, 52.03it/s] 


In [15]:
# Read the chunk-level label file back as a list of (chunk name, label) pairs.
ground_truth_chunks = os.path.join('../data/', "risk_golden_truth_chunks.txt")
chunk_users = []
with open(ground_truth_chunks, 'r') as labels_file:
    for row in labels_file:
        # Every row must hold exactly two whitespace-separated fields.
        chunk_name, chunk_label = row.split()
        chunk_users.append((chunk_name, chunk_label))

In [9]:
# Class balance of the generated chunks: each label parses as an int, so
# summing the parsed labels counts the chunks labelled 1.
ones = sum(int(chunk_label) for _, chunk_label in chunk_users)
print("1:", ones)
print("0:", len(chunk_users) - ones)


1: 23698
0: 49052


## Train a byte-level BPE tokenizer

In [12]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import Whitespace, Sequence, Digits, Punctuation
from tokenizers.normalizers import Lowercase
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import RobertaProcessing
from tokenizers.decoders import BPEDecoder
from glob import glob
from tokenizers import ByteLevelBPETokenizer
# Training corpus: every file directly under ../data/processed.
files = glob(r"../data/processed/*")
# Directory that will receive the trained tokenizer files.
tokenizer_model_path = r"../models/roberta-tokenizer/"
# makedirs also creates the parent "../models" when it does not exist yet
# (plain os.mkdir raises FileNotFoundError in that case), and exist_ok=True
# makes re-running the cell a no-op.
os.makedirs(tokenizer_model_path, exist_ok=True)

In [3]:
# Train a byte-level BPE tokenizer on the files listed in `files`.
tokenizer = ByteLevelBPETokenizer()
# Lowercase all text before tokenisation.
# NOTE(review): the save_model() call further down reports writing only
# vocab.json and merges.txt, so this normalizer (and the pre-tokenizer and
# post-processor set in this cell) is presumably not persisted and will not be
# applied by the tokenizer reloaded through RobertaTokenizerFast -- confirm.
tokenizer.normalizer = Lowercase()
# NOTE(review): this replaces whatever pre-tokenizer ByteLevelBPETokenizer()
# configured by default; verify that training-time and inference-time
# pre-tokenisation still agree after reloading.
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits(individual_digits=False), Punctuation()])

# No vocab_size is passed, so the library default applies (the reloaded
# tokenizer's repr later in the notebook reports vocab_size=30000).
# The encoding shown at the end of the notebook starts with id 0, ends with
# id 2 and pads with id 1, consistent with the special tokens receiving ids in
# the order listed here.
tokenizer.train(files,
                show_progress=True,
                special_tokens=[
                                "<s>",
                                "<pad>",
                                "</s>",
                                "<unk>",
                                "<mask>"])
# Must come after train(): token_to_id() needs the trained vocabulary to
# resolve the ids of "<s>" and "</s>".
tokenizer.post_processor = RobertaProcessing(
    cls=("<s>", tokenizer.token_to_id("<s>")),
    sep=("</s>", tokenizer.token_to_id("</s>"))
)






In [4]:
# Persist the trained model files into tokenizer_model_path; the displayed
# return value lists the files written (vocab.json and merges.txt).
tokenizer.save_model(tokenizer_model_path)

['../models/roberta-tokenizer/vocab.json',
 '../models/roberta-tokenizer/merges.txt']

In [16]:
from transformers import RobertaTokenizerFast
# Create the tokenizer from the trained vocab/merges files.
# `model_max_length` is the documented keyword for the maximum sequence length;
# `max_len` is only a legacy alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained(r"../models/roberta-tokenizer", model_max_length=512)

file ../models/roberta-tokenizer/config.json not found
file ../models/roberta-tokenizer/config.json not found


In [17]:
# Display the reloaded tokenizer's repr to check vocabulary size, maximum
# length and special tokens.
tokenizer

PreTrainedTokenizerFast(name_or_path='../models/roberta-tokenizer', vocab_size=30000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [10]:
# Smoke test: encode one sentence (with punctuation and an emoji), adding the
# special tokens and padding/truncating the result to exactly 512 ids.
tokenizer.encode_plus(
    "Hello, y'all! How are you 😁 ?",
    truncation=True,
    padding="max_length",
    max_length=512,
    add_special_tokens=True,
    return_token_type_ids=True,
)

{'input_ids': [0, 44, 26906, 16, 226, 93, 11, 6565, 5, 226, 44, 6885, 226, 6542, 226, 6498, 226, 178, 259, 252, 229, 226, 35, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1