Reader for the [VRT](https://www.kielipankki.fi/development/korp/corpus-input-format/#VRT_file_format) (verticalized text) format used e.g. for the Suomi24 data.

In [1]:
import re
# Matches the opening <text ...> tag of a post and captures the discussion
# area (group 1) and the numeric thread id (group 2).
txt_re = re.compile(r'^<text discussionarea="(.*?)".*tid="([0-9]+?)"', re.U)
# Matches opening/closing structural tags, which carry no token data.
ignore_re = re.compile(r'^</?(text|sentence|paragraph)')


def read_vrt(inp):
    """Read the Suomi24 VRT format and group tokens by discussion thread.

    Consecutive posts (``<text>`` elements) that share the same ``tid`` are
    merged into a single document.

    Args:
        inp: Iterable of text lines, e.g. an open file object.

    Yields:
        ``(topic, thread_id, words)`` tuples, where ``topic`` is the
        ``discussionarea`` attribute of the most recent ``<text>`` tag,
        ``thread_id`` is its ``tid`` attribute (as a string), and ``words``
        is the list of lowercased values from the third tab-separated column
        of every accepted token line. Threads with no accepted tokens are
        not yielded. ``topic`` and ``thread_id`` are ``None`` if tokens
        appear before any matching ``<text>`` tag.
    """
    current_topic = None  # topic name
    current_tid = None  # discussion thread number
    words = []  # words collected for the current thread
    for line in inp:
        line = line.strip()
        match = txt_re.match(line)
        if match:  # we have a new post
            # ...and it starts a different thread: flush what we have so far.
            if match.group(2) != current_tid and words:
                yield current_topic, current_tid, words
                words = []
            current_topic = match.group(1)
            current_tid = match.group(2)
        if ignore_re.match(line):
            continue
        columns = line.split("\t")
        # Skip malformed lines: blank lines and lines with too few columns
        # would otherwise raise IndexError, and the data also contains a few
        # token lines whose second column is not a number.
        if len(columns) < 3 or not columns[1].isdigit():
            continue
        words.append(columns[2].lower())
    # Input exhausted: emit the last thread, if it has any words.
    if words:
        yield current_topic, current_tid, words

# Read the whole corpus; each entry is one discussion thread with its words
# joined into a single space-separated string.
with open("s24.vrt", "r", encoding="utf-8") as f:
    threads = [(topic, tid, " ".join(words)) for topic, tid, words in read_vrt(f)]

# Split the per-thread tuples into three parallel lists.
topics = [topic for topic, _, _ in threads]  # list of strings
topic_ids = [tid for _, tid, _ in threads]
texts = [text for _, _, text in threads]  # list of strings

print("Document count:", len(topics))
print("Distinct topics:", ", ".join(set(topics)))

Document count: 12453
Distinct topics: Yhteiskunta, Tori, Urheilu ja kuntoilu, Ryhmät, Ajoneuvot ja liikenne, Muoti ja kauneus, Ruoka ja juoma, Lemmikit, Tiede ja teknologia, MainPage, Paikkakunnat, Viihde ja kulttuuri, Työ ja opiskelu, Talous, Terveys, Harrastukset, Suhteet, Ajanviete, Suomi24, Perhe, Matkailu, Koti ja rakentaminen, Nuoret
