# Reading and cleaning

In [79]:
import re
from tqdm.auto import tqdm
from rich import print, inspect

In [10]:
# Load the raw log into memory: one list element per line, trailing newlines kept.
with open("log.txt", mode='r', encoding='utf-8') as f:
    log = list(f)

In [11]:
# Keep only the lines that contain an "<entity>" tag.
# Lazy: nothing is evaluated until `log` is consumed by a later cell.
log = (line for line in log if "<entity>" in line)

In [12]:
# Keep only the text before the first ":" (drops the canonical named entity), trimmed.
# NOTE(review): a ":" inside the entity text itself would also be cut here — confirm the log format.
log = (line.partition(":")[0].strip() for line in log)

In [13]:
# Materialise the lazy pipeline into a list so entries can be indexed and sliced.
log = [*log]

In [14]:
# Peek at the first ten cleaned entries.
log[0:10]

['вруби <entity>юниктьюнз#singer</entity>',
 'проиграй <entity>турецк гамбит#film</entity>',
 'проиграй <entity>Ancora Vivo#song</entity>',
 'спой <entity>нэж #song</entity>',
 'давай включи <entity>Escape The Fate#singer</entity>',
 'давай автора <entity>мэгадес#singer</entity>',
 'зажги <entity>кошмар на улице вязов#film</entity>',
 'расскажи <entity>бесконечн истори#film</entity>',
 'давай включи <entity>Sweets For My Sweet#song</entity>',
 'пой песню <entity>Cloud Nine#song</entity> из <entity>област тьмы#film</entity>']

In [54]:
# Collect every <entity>...</entity> match in the sample entry log[9].
# Stored as a list rather than the one-shot finditer iterator, so cells that
# read `x` can be re-run without finding it already exhausted.
x = list(re.finditer("<entity>(.+?)</entity>", log[9]))

In [55]:
# Take the first entity match found in the sample entry.
match = [*x][0]

In [57]:
# (start, end) offsets of the whole match followed by each capture group,
# built from the documented Match.span() API.
tuple(match.span(group) for group in range(match.re.groups + 1))

((10, 42), (18, 33))

In [40]:
# Slice the sample entry with the offsets of capture group 1 (the text between
# the tags), taken from the match itself instead of hand-copied numbers.
log[9][match.start(1):match.end(1)]

'Cloud Nine#song'

In [24]:
# Slice the first entry with the span of its first <entity>...</entity> match.
# The match is computed here so the cell does not rely on leftover kernel state
# (`x` holds the finditer result for log[9] and has no .span()).
first_entity = re.search("<entity>(.+?)</entity>", log[0])
log[0][first_entity.start():first_entity.end()]

'<entity>юниктьюнз#singer</entity>'

In [80]:
def log2conll(log_entry: str):
    """Convert one annotated log line into CoNLL-style ``(word, tag)`` pairs.

    Entities are marked inline as ``<entity>text#type</entity>``. Words inside
    an entity are tagged ``B-TYPE`` (first word) or ``I-TYPE`` (following
    words), with the type upper-cased. Every other whitespace-separated word,
    including words after the last entity, is tagged ``O``.

    Args:
        log_entry: A single log line with zero or more inline entity markers.

    Returns:
        A list of ``(word, tag)`` tuples in the order the words appear.

    Raises:
        ValueError: If an entity has no ``#`` separating its text from its type.
    """
    result_words = []
    result_tags = []

    def add_outside(text):
        # Words that are not part of any entity are tagged "O".
        for word in text.split():
            result_words.append(word)
            result_tags.append("O")

    prev_index = 0
    for match in re.finditer("<entity>(.+?)</entity>", log_entry):
        entity_start, entity_end = match.span()
        add_outside(log_entry[prev_index:entity_start])

        # Remove the markers (including any stray nested ones) from the matched span.
        entity = match.group(0).replace("<entity>", "").replace("</entity>", "")
        # Split on the LAST '#' only, so the entity text may itself contain '#'.
        entity_words, entity_tag = entity.rsplit("#", 1)
        # Strip so stray whitespace around the type never leaks into the label.
        entity_tag = entity_tag.strip().upper()
        for i, entity_word in enumerate(entity_words.split()):
            prefix = "B-" if i == 0 else "I-"
            result_words.append(entity_word)
            result_tags.append(prefix + entity_tag)

        prev_index = entity_end

    # Text after the last entity would otherwise be lost.
    add_outside(log_entry[prev_index:])

    return list(zip(result_words, result_tags))

In [81]:
# Show the (word, tag) pairs produced for the sample entry log[9].
print(log2conll(log_entry=log[9]))

In [82]:
# Spot-check a few entries: separator line, raw entry, then its converted form.
for sample_idx in (100, 122, 1222, 9000):
    sample_entry = log[sample_idx]
    print("--" * 30)
    print(sample_entry)
    print(log2conll(sample_entry))

In [83]:
from sklearn.model_selection import train_test_split

In [89]:
# First hold out 20% of the entries as the test set ...
train_dev, test = train_test_split(
    log,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
# ... then hold out 10% of the remainder as the dev set (same seed for reproducibility).
train, dev = train_test_split(
    train_dev,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

In [90]:
# Write the training split: one "word<TAB>tag" line per token, blank line after each entry.
with open("train.conll", mode='w', encoding='utf-8') as f:
    for log_entry in tqdm(train):
        f.writelines(f"{word}\t{tag}\n" for word, tag in log2conll(log_entry))
        f.write("\n")

HBox(children=(FloatProgress(value=0.0, max=6544.0), HTML(value='')))




In [91]:
# Write the dev split: one "word<TAB>tag" line per token, blank line after each entry.
with open("dev.conll", mode='w', encoding='utf-8') as f:
    for log_entry in tqdm(dev):
        f.writelines(f"{word}\t{tag}\n" for word, tag in log2conll(log_entry))
        f.write("\n")

HBox(children=(FloatProgress(value=0.0, max=728.0), HTML(value='')))




In [92]:
# Write the test split: one "word<TAB>tag" line per token, blank line after each entry.
with open("test.conll", mode='w', encoding='utf-8') as f:
    for log_entry in tqdm(test):
        f.writelines(f"{word}\t{tag}\n" for word, tag in log2conll(log_entry))
        f.write("\n")

HBox(children=(FloatProgress(value=0.0, max=1818.0), HTML(value='')))


