In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from main import read_file
import classla
# Fetch the Slovenian ("sl") classla resources into the project-local models
# directory; the same `dir` is passed to classla.Pipeline below.
classla.download("sl", dir="../models/classla_resources")

Downloading https://raw.githubusercontent.com/clarinsi/classla-resources/main/resources_1.0.1.json: 10.3kB [00:00, 2.53MB/s]                   
2023-05-25 16:23:40 INFO: Downloading these customized packages for language: sl (Slovenian)...
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| depparse  | standard |
| ner       | standard |
| pretrain  | standard |

2023-05-25 16:23:42 INFO: File exists: ../models/classla_resources/sl/pos/standard.pt.
2023-05-25 16:23:42 INFO: File exists: ../models/classla_resources/sl/lemma/standard.pt.
2023-05-25 16:23:42 INFO: File exists: ../models/classla_resources/sl/depparse/standard.pt.
2023-05-25 16:23:43 INFO: File exists: ../models/classla_resources/sl/ner/standard.pt.
2023-05-25 16:23:43 INFO: File exists: ../models/classla_resources/sl/pretrain/standard.pt.
2023-05-25 16:23:43 INFO: Finished downloading models and saved to ../models/classla_resources.


In [4]:
# Slovenian preprocessing pipeline used by the rest of the notebook. Only the
# tokenize, pos, lemma and ner processors are requested (depparse is downloaded
# above but not loaded here).
preprocess = classla.Pipeline("sl", dir="../models/classla_resources", processors="tokenize,pos,lemma,ner")

2023-05-25 16:24:03 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| ner       | standard |

2023-05-25 16:24:03 INFO: Use device: cpu
2023-05-25 16:24:03 INFO: Loading: tokenize
2023-05-25 16:24:03 INFO: Loading: pos
2023-05-25 16:24:16 INFO: Loading: lemma
2023-05-25 16:24:33 INFO: Loading: ner
2023-05-25 16:24:34 INFO: Done loading processors!


In [5]:
from collections import Counter

def get_named_entities(classla_doc, thresh_perc=0.25):
    """Count person mentions and frequent nouns (by lemma) in a parsed document.

    Args:
        classla_doc: Object exposing ``sentences``; each sentence exposes
            ``tokens``; each token exposes ``ner`` and ``words``, and the
            first word exposes ``lemma`` and ``xpos``.
        thresh_perc: Fraction of the most frequent noun's count that a noun
            must strictly exceed to be kept.

    Returns:
        A ``(person_counts, noun_counts)`` tuple of plain dicts.
        ``person_counts`` maps the space-joined lemmas of each run of
        ``*-PER`` tokens to its number of occurrences. ``noun_counts`` maps
        lemmas of non-person tokens whose ``xpos`` starts with ``"N"`` to
        their occurrences, restricted to those above the threshold. Both are
        empty for a document without matching tokens.
    """
    named_entities_in_doc = []
    nouns_in_doc = []
    for sentence in classla_doc.sentences:
        mention_lemmas = []
        for token in sentence.tokens:
            word = token.words[0]
            ner_tag = token.ner or ""
            is_person = ner_tag.endswith("-PER")
            # A "B-"/"S-" prefix opens a new entity, so two directly adjacent
            # person mentions must not be glued into a single one.
            opens_new_entity = ner_tag.startswith(("B-", "S-"))
            if mention_lemmas and (not is_person or opens_new_entity):
                named_entities_in_doc.append(" ".join(mention_lemmas))
                # Reset the buffer; otherwise the same mention would be
                # re-emitted for every following non-person token.
                mention_lemmas = []
            if is_person:
                mention_lemmas.append(word.lemma)
            elif (word.xpos or "").startswith("N"):
                nouns_in_doc.append(word.lemma)
        # A mention that runs to the end of the sentence is still pending.
        if mention_lemmas:
            named_entities_in_doc.append(" ".join(mention_lemmas))
    nelf = dict(Counter(named_entities_in_doc))
    nlf = dict(Counter(nouns_in_doc))
    # max() on an empty sequence raises, so only threshold when nouns exist.
    if nlf:
        nlf_lim = max(nlf.values()) * thresh_perc
        nlf = {k: v for k, v in nlf.items() if v > nlf_lim}
    return nelf, nlf

def find_mention(id, mentions):
    """Return the first item of ``mentions`` whose ``mention_id`` equals ``id``.

    Yields ``None`` when no item matches.
    """
    matching = (candidate for candidate in mentions if id == candidate.mention_id)
    return next(matching, None)

def get_relevant_mentions(corefs, ments, ne_candidates):
    """Keep only the coreference entries whose key mention is a known candidate.

    Args:
        corefs: Mapping from a mention id to the value associated with it
            (returned unchanged for the entries that are kept).
        ments: Iterable of objects exposing ``mention_id`` and ``tokens``;
            each token exposes ``lemma``.
        ne_candidates: Container of candidate lemma strings, tested with ``in``.

    Returns:
        A dict with the entries of ``corefs`` whose key mention, rendered as
        its space-joined token lemmas, is in ``ne_candidates``. Keys without a
        corresponding mention in ``ments`` are skipped, since their lemma
        cannot be determined.
    """
    # Index mentions once rather than scanning the list for every key.
    # setdefault keeps the first mention per id, like a front-to-back search.
    first_by_id = {}
    for mention in ments:
        first_by_id.setdefault(mention.mention_id, mention)

    relevant = {}
    for mention_id, linked in corefs.items():
        mention = first_by_id.get(mention_id)
        if mention is None:
            continue
        lemma = " ".join(t.lemma for t in mention.tokens)
        if lemma in ne_candidates:
            relevant[mention_id] = linked
    return relevant

In [6]:
from coref.resolve_text import Resolver

# Shared coreference resolver from the project-local `coref` package; it is
# used by get_most_mentioned_characters via `resolver.coref(...)`.
resolver = Resolver()

INFO:root:Initialized contextual BERT-based model with name cseb_senticoref_suk.
INFO:root:Initialized contextual BERT-based model with name cseb_senticoref_suk.


In [8]:
def get_most_mentioned_characters(text):
    """Count character candidates in ``text``, boosted by coreference links.

    The text is run through the module-level ``preprocess`` pipeline, person
    and noun lemma counts are collected with ``get_named_entities``, and the
    module-level ``resolver`` supplies mentions and coreferences. For every
    coreference entry whose key mention is a candidate, the size of its linked
    collection is added to that candidate's count.

    Args:
        text: Raw document text.

    Returns:
        A ``(lemma_counts, relevant_mentions, mentions)`` tuple: the dict of
        lemma -> count, the filtered coreference dict, and the mentions as
        returned by the resolver.
    """
    doc = preprocess(text)
    person_counts, noun_counts = get_named_entities(doc)

    # Sum counts when a lemma appears both as a person and as a noun; a plain
    # dict merge would silently drop the person count.
    ne_candidates = dict(person_counts)
    for lemma, count in noun_counts.items():
        ne_candidates[lemma] = ne_candidates.get(lemma, 0) + count

    # NOTE(review): 0.5, 10 and 8 are passed positionally to Resolver.coref;
    # their meaning is defined in coref.resolve_text and is not visible here.
    mentions, coreferences = resolver.coref(doc, ne_candidates, 0.5, 10, 8)
    relevant_mentions = get_relevant_mentions(coreferences, mentions, ne_candidates)

    # Copy so the candidate dict handed to the resolver is not mutated through
    # an alias while the totals are built.
    lemma_counts = dict(ne_candidates)
    for mention_id, linked in relevant_mentions.items():
        lemma = " ".join(t.lemma for t in find_mention(mention_id, mentions).tokens)
        # Several coreference entries can share a lemma; accumulate them all
        # instead of keeping only the last one.
        lemma_counts[lemma] = lemma_counts.get(lemma, 0) + len(linked)
    return lemma_counts, relevant_mentions, mentions
    

In [9]:
import os

# Run character extraction over every file in the short-story corpus and
# collect the per-story lemma counts in `outs`.
stories_dir = '../data/slovenian_short_stories/'
outs = []
# os.listdir returns entries in arbitrary order; sort so that the position of
# each story in `outs` is the same on every run.
for f in sorted(os.listdir(stories_dir)):
    raw_text = read_file(f'{stories_dir}{f}')
    counts, corefs, mentions = get_most_mentioned_characters(raw_text)
    outs.append(counts)


TypeError: sequence item 0: expected str instance, Token found

In [21]:
import json

def write_output(out_list, fname, out_dir="../data"):
    """Serialize ``out_list`` to JSON and write it to ``<out_dir>/<fname>.json``.

    Args:
        out_list: JSON-serializable object to store.
        fname: File name without the ``.json`` extension.
        out_dir: Directory to write into; defaults to ``../data``.

    Raises:
        TypeError: If ``out_list`` is not JSON-serializable (from ``json.dumps``).
        OSError: If the target file cannot be opened for writing.
    """
    json_data = json.dumps(out_list)
    # open() defaults to read-only mode, in which write() fails; request "w"
    # explicitly and pin the encoding so the result is platform-independent.
    with open(f"{out_dir}/{fname}.json", "w", encoding="utf-8") as f:
        f.write(json_data)

# Persist the per-story counts collected in `outs` as a JSON file named
# characters_by_stories_slo_short_stories2.json.
write_output(outs, "characters_by_stories_slo_short_stories2")

({'oče': 7,
  'hči': 9,
  'vrag': 21,
  'soba': 10,
  'jabolko': 11,
  'pekel': 9,
  'žena': 10,
  'sestra': 10,
  'koš': 16,
  'hiša': 7},
 {2: {8},
  5: {19, 37},
  56: {57},
  57: {60},
  59: {64},
  67: {68, 71},
  93: {95, 101},
  64: {98},
  95: {113},
  124: {125},
  125: {126},
  129: {134},
  134: {136, 146},
  148: {161},
  151: {171},
  183: {185, 193, 206},
  190: {198},
  202: {212},
  206: {228},
  253: {262, 264},
  273: {279},
  279: {312},
  332: {336},
  336: {344, 346},
  338: {350},
  367: {368},
  390: {391},
  466: {469},
  463: {474},
  474: {479, 511},
  501: {519},
  511: {515},
  519: {520}},
 [
  mention_id: 128
  tokens: [
      token_id: 16-12
      raw_text: Moji
      lemma: moj
      msd: Ps1fsds
      sentence_index: 16
      position_in_sentence: 12
      position_in_document: 262
      gender: f
      number: s
      category: P
              ]
              ,
  
  mention_id: 129
  tokens: [
      token_id: 16-13
      raw_text: ženi
      lemma: žen