In [3]:
%load_ext autoreload
%autoreload 2

In [2]:
from main import read_file, get_sentences
import classla
classla.download("sl", dir="../models/classla_resources")

Downloading https://raw.githubusercontent.com/clarinsi/classla-resources/main/resources_1.0.1.json: 10.3kB [00:00, 5.02MB/s]                   
2023-05-25 09:08:53 INFO: Downloading these customized packages for language: sl (Slovenian)...
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| depparse  | standard |
| ner       | standard |
| pretrain  | standard |

2023-05-25 09:08:54 INFO: File exists: ../models/classla_resources/sl/pos/standard.pt.
2023-05-25 09:08:54 INFO: File exists: ../models/classla_resources/sl/lemma/standard.pt.
2023-05-25 09:08:55 INFO: File exists: ../models/classla_resources/sl/depparse/standard.pt.
2023-05-25 09:08:55 INFO: File exists: ../models/classla_resources/sl/ner/standard.pt.
2023-05-25 09:08:55 INFO: File exists: ../models/classla_resources/sl/pretrain/standard.pt.
2023-05-25 09:08:55 INFO: Finished downloading models and saved to ../models/classla_resources.


In [4]:
text = read_file("../data/slovenian_short_stories/Vrag_se_ženi.txt")

preprocess = classla.Pipeline("sl", dir="../models/classla_resources", processors="tokenize,pos,lemma,ner")

doc = preprocess(text)

2023-05-25 09:08:58 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| ner       | standard |

  return torch._C._cuda_getDeviceCount() > 0
2023-05-25 09:08:59 INFO: Use device: cpu
2023-05-25 09:08:59 INFO: Loading: tokenize
2023-05-25 09:08:59 INFO: Loading: pos
2023-05-25 09:09:09 INFO: Loading: lemma
2023-05-25 09:09:22 INFO: Loading: ner
2023-05-25 09:09:22 INFO: Done loading processors!


In [11]:
from collections import Counter

def get_named_entities(classla_doc, thresh_perc=0.25):
    named_entities_in_doc = []
    nouns_in_doc = []
    for sentence in classla_doc.sentences:
        mention_tokens = []
        for token in sentence.tokens:
            word = token.words[0]
            if len(mention_tokens) > 0 and not token.ner.endswith("-PER"):
                named_entities_in_doc.append(" ".join([w.lemma for t in mention_tokens for w in t.words]))
            if token.ner.endswith('-PER'):
                mention_tokens.append(token)
            elif word.xpos[0] == "N":
                nouns_in_doc.append(" ".join([w.lemma for w in token.words]))
        if len(mention_tokens) > 0:
            named_entities_in_doc.append(" ".join(mention_tokens))
    nelf = dict(Counter(named_entities_in_doc))
    nlf = dict(Counter(nouns_in_doc))
    nlf_lim = max(nlf.values()) * thresh_perc
    nlf = {k: v for k, v in nlf.items() if v > nlf_lim}
    return nelf, nlf

def find_mention(id, mentions):
    for mention in mentions:
        if id == mention.mention_id:
            return mention
    return None

def get_relevant_mentions(corefs, ments, ne_candidates):
    return {k: vals for k, vals in corefs.items() if " ".join([t.lemma for t in find_mention(k, ments).tokens]) in ne_candidates}

In [6]:
from coref.resolve_text import Resolver

resolver = Resolver()

INFO:root:Initialized contextual BERT-based model with name cseb_senticoref_suk.
INFO:root:Initialized contextual BERT-based model with name cseb_senticoref_suk.


In [7]:
def get_most_mentioned_characters(text):
    doc = preprocess(text)
    _1, _2 = get_named_entities(doc)
    ne_candidates = {**_1, **_2}
    coref_output = resolver.coref(doc, ne_candidates, 0.5, 10, 8)
    mentions, coreferences = coref_output
    relevant_mentions = get_relevant_mentions(coreferences, mentions, ne_candidates)
    coref_lemma_counts = {" ".join([t.lemma for t in find_mention(k, mentions).tokens]): len(v) for k, v in relevant_mentions.items()}
    lemma_counts = ne_candidates
    for lemma in coref_lemma_counts:
        lemma_counts[lemma] += coref_lemma_counts[lemma]
    return lemma_counts
    

In [12]:
get_most_mentioned_characters(text)

INFO:root:Evaluating a single document...


0:10


INFO:root:Evaluating a single document...


8:18


INFO:root:Evaluating a single document...


16:26


INFO:root:Evaluating a single document...


24:34


INFO:root:Evaluating a single document...


32:42


INFO:root:Evaluating a single document...


40:50


INFO:root:Evaluating a single document...


48:58


INFO:root:Evaluating a single document...


56:66


INFO:root:Evaluating a single document...


64:67


{'oče': 7,
 'hči': 9,
 'vrag': 21,
 'soba': 10,
 'jabolko': 11,
 'pekel': 9,
 'žena': 10,
 'sestra': 10,
 'koš': 16,
 'hiša': 7}

In [53]:
def get_leading_mention(mentions, cluster, ne_candidates):
    print("=====================")
    print(ne_candidates)
    maxi = -1
    maxcount = 0
    for i in cluster:
        if mentions[i]["ner_type"].endswith('-PER'):
            lemma = mentions[i]["lemmas"][0]
            if lemma in ne_candidates:
                if ne_candidates[lemma] > maxcount:
                    maxi = i
                    maxcount = ne_candidates[lemma]
    if maxi < 0:
        for i in cluster:
            lemma = mentions[i]["lemmas"][0]
            if lemma in ne_candidates:
                if ne_candidates[lemma] > maxcount:
                    maxi = i
                    maxcount = ne_candidates[lemma]

    return maxi if maxi >= 0 else None

mentions, clusters = coref_output
leading_clusters = list(map(lambda x: get_leading_mention(mentions, x, ne_candidates), clusters))
leading_clusters

{'oče': 6, 'hči': 6, 'vrag': 20, 'soba': 9, 'jabolko': 10, 'pekel': 8, 'žena': 8, 'sestra': 9, 'koš': 14, 'hiša': 6}
{'oče': 6, 'hči': 6, 'vrag': 20, 'soba': 9, 'jabolko': 10, 'pekel': 8, 'žena': 8, 'sestra': 9, 'koš': 14, 'hiša': 6}
{'oče': 6, 'hči': 6, 'vrag': 20, 'soba': 9, 'jabolko': 10, 'pekel': 8, 'žena': 8, 'sestra': 9, 'koš': 14, 'hiša': 6}
{'oče': 6, 'hči': 6, 'vrag': 20, 'soba': 9, 'jabolko': 10, 'pekel': 8, 'žena': 8, 'sestra': 9, 'koš': 14, 'hiša': 6}
{'oče': 6, 'hči': 6, 'vrag': 20, 'soba': 9, 'jabolko': 10, 'pekel': 8, 'žena': 8, 'sestra': 9, 'koš': 14, 'hiša': 6}
{'oče': 6, 'hči': 6, 'vrag': 20, 'soba': 9, 'jabolko': 10, 'pekel': 8, 'žena': 8, 'sestra': 9, 'koš': 14, 'hiša': 6}
{'oče': 6, 'hči': 6, 'vrag': 20, 'soba': 9, 'jabolko': 10, 'pekel': 8, 'žena': 8, 'sestra': 9, 'koš': 14, 'hiša': 6}
{'oče': 6, 'hči': 6, 'vrag': 20, 'soba': 9, 'jabolko': 10, 'pekel': 8, 'žena': 8, 'sestra': 9, 'koš': 14, 'hiša': 6}
{'oče': 6, 'hči': 6, 'vrag': 20, 'soba': 9, 'jabolko': 10, 'peke

[129,
 2,
 390,
 519,
 273,
 148,
 151,
 None,
 183,
 56,
 None,
 64,
 67,
 None,
 None,
 466,
 367,
 None,
 253,
 None,
 510,
 511]

In [4]:
from main import get_named_entities
from collections import Counter
from main import read_file, get_sentences
import classla

preprocess = classla.Pipeline("sl", dir="../models/classla_resources", processors="tokenize,pos,lemma,ner")

def get_most_mentioned_characters(raw_text):
    # tokenize, pos, lemma, ner
    doc_ne = preprocess(raw_text)

    # extract only NE with PERSON tag
    named_entities = get_named_entities(doc_ne, lang='sl')
    names = []
    for ne in named_entities:
        for named_entity in ne:
            names.append(named_entity.to_dict()[0]['lemma'])

    # count frequencies of each name and sort them
    name_frequencies = Counter(names)
    name_frequencies = {k:v for (k,v) in sorted(name_frequencies.items(), key=lambda item: item[1], reverse=True)}

    return name_frequencies

2023-04-26 18:16:38 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| ner       | standard |

2023-04-26 18:16:38 INFO: Use device: cpu
2023-04-26 18:16:38 INFO: Loading: tokenize
2023-04-26 18:16:38 INFO: Loading: pos
2023-04-26 18:16:52 INFO: Loading: lemma
2023-04-26 18:17:06 INFO: Loading: ner
2023-04-26 18:17:06 INFO: Done loading processors!


In [28]:
from coref.run_model import coref_resolution, get_slocoref

vrag = read_file("../data/slovenian_short_stories/Vrag_se_ženi.txt")
# doc = preprocess("""Bila sta oče in mati, ki sta imela tri hčere. Nesreča pa je hotela, da se jim ni približal niti en ženin. To je močno jezilo mater, oče pa je rekel: »Hočem jih omožiti, če jih imam dati samemu hudiču!«

# Vrag je bil takoj pripravljen, da ugrabi tri duše. Računal pa je na žensko radovednost. Napravljen kot grof je prišel k očetu in ga zaprosil za starejšo hčer. Z velikim veseljem mu jo je dal, češ: če je ne dam takemu gospodu, komu pak?! Vrag jo je odpeljal v neki navidezen grad in ji rekel: »Drugega opravila nimaš, kot da nosiš iz sobe v sobo tole zlato jabolko, le v dvanajsto sobo ne smeš pogledati!« Ona vzame zlato jabolko ter teka po gradu iz sobe v sobo. Pride do vrat dvanajste sobe; tu postoji in si misli: Kaj neki bi bilo, četudi pogledam? Do zdaj je bila vsaka soba lepša, in kdo ve, kaj je šele v tej? Radovednost jo premaga, odpre in zagleda pekel, kako vragi mučijo uboge duše. Zlato jabolko pa ji je padlo v pekel, kjer je zgorelo. Vsa zmučena zapre vrata in teče proč. Tedaj jo sreča njen mož – vrag. »Kje je jabolko?« zavpije in pahne še njo v peklensko brezno.""")

doc = preprocess("Bila sta oče in mati, ki sta imela tri hčere. Nesreča pa je hotela, da se jim ni približal niti en ženin. To je močno jezilo mater, oče pa je rekel: »Hočem jih omožiti, če jih imam dati samemu hudiču!«")

# Requires "tokenize,pos,lemma,ner" preprocessors
# coref_output = coref_resolution(doc, 0.1, False, window_size=10, window_stride=8)
coref_output = coref_resolution(doc, 0.3, False)

coref_output


INFO:root:Evaluating a single document...


[['0-0', '0-1', '0-2', '0-3', '0-4', '0-5', '0-6', '0-7', '0-8', '0-9', '0-10', '0-11'], ['1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7', '1-8', '1-9', '1-10', '1-11', '1-12', '1-13'], ['2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10', '2-11', '2-12', '2-13', '2-14', '2-15', '2-16', '2-17', '2-18', '2-19', '2-20', '2-21', '2-22', '2-23']]
{1: <coref.data.Mention object at 0x7f3beeb54190>, 2: <coref.data.Mention object at 0x7f3beeb54100>, 3: <coref.data.Mention object at 0x7f3beeb54a60>, 4: <coref.data.Mention object at 0x7f3beeabcfa0>, 5: <coref.data.Mention object at 0x7f3beeabd030>, 6: <coref.data.Mention object at 0x7f3beeabd120>, 7: <coref.data.Mention object at 0x7f3beeabd1e0>, 8: <coref.data.Mention object at 0x7f3beeabd2a0>, 9: <coref.data.Mention object at 0x7f3beeabd420>, 10: <coref.data.Mention object at 0x7f3beeabd4e0>, 11: <coref.data.Mention object at 0x7f3beeabd5d0>, 12: <coref.data.Mention object at 0x7f3beeabd660>, 13: <coref.data.Men

([{'id': 1,
   'start_idx': 0,
   'length': 8,
   'ner_type': 'O',
   'msd': 'Va-p-dm',
   'text': 'Bila sta'},
  {'id': 2,
   'start_idx': 9,
   'length': 3,
   'ner_type': 'O',
   'msd': 'Ncmsn',
   'text': 'oče'},
  {'id': 3,
   'start_idx': 16,
   'length': 4,
   'ner_type': 'O',
   'msd': 'Ncfsn',
   'text': 'mati'},
  {'id': 4,
   'start_idx': 25,
   'length': 9,
   'ner_type': 'O',
   'msd': 'Va-r3d-n',
   'text': 'sta imela'},
  {'id': 5,
   'start_idx': 39,
   'length': 5,
   'ner_type': 'O',
   'msd': 'Ncfpa',
   'text': 'hčere'},
  {'id': 6,
   'start_idx': 46,
   'length': 7,
   'ner_type': 'O',
   'msd': 'Ncfsn',
   'text': 'Nesreča'},
  {'id': 7,
   'start_idx': 57,
   'length': 9,
   'ner_type': 'O',
   'msd': 'Va-r3s-n',
   'text': 'je hotela'},
  {'id': 8,
   'start_idx': 71,
   'length': 6,
   'ner_type': 'O',
   'msd': 'Px------y',
   'text': 'se jim'},
  {'id': 9,
   'start_idx': 78,
   'length': 12,
   'ner_type': 'O',
   'msd': 'Va-r3s-y',
   'text': 'ni približal

In [31]:
mentions = coref_output[0]
clusters = coref_output[1]
for mention in clusters[0]:
    print(mentions[mention - 1])

{'id': 1, 'start_idx': 0, 'length': 8, 'ner_type': 'O', 'msd': 'Va-p-dm', 'text': 'Bila sta'}
{'id': 2, 'start_idx': 9, 'length': 3, 'ner_type': 'O', 'msd': 'Ncmsn', 'text': 'oče'}
{'id': 3, 'start_idx': 16, 'length': 4, 'ner_type': 'O', 'msd': 'Ncfsn', 'text': 'mati'}
{'id': 4, 'start_idx': 25, 'length': 9, 'ner_type': 'O', 'msd': 'Va-r3d-n', 'text': 'sta imela'}
{'id': 6, 'start_idx': 46, 'length': 7, 'ner_type': 'O', 'msd': 'Ncfsn', 'text': 'Nesreča'}
{'id': 7, 'start_idx': 57, 'length': 9, 'ner_type': 'O', 'msd': 'Va-r3s-n', 'text': 'je hotela'}
