In [None]:
# pip install pandas
# pip install tqdm
# pip install transformers
# pip install torch
# pip install morfeusz2
# pip install spacy
# pip install rapidfuzz
# pip install goose3
# pip install stanza
# pip install requests

In [None]:
# Imports and one-time model loading for the whole notebook.
# NOTE(review): goose3/Goose, requests, morfeusz2, subprocess and sys appear to be
# referenced only from commented-out code in this notebook - confirm before removing.
import pandas as pd
import goose3
from tqdm import tqdm
tqdm.pandas()  # registers `progress_apply` on pandas objects (used by the NER cells)
import re
import morfeusz2
import spacy
import subprocess
import sys
# One-off download of the Polish spaCy model; uncomment on a fresh environment.
# subprocess.check_call([sys.executable, "-m", "spacy", "download", "pl_core_news_lg"])
nlp_spacy = spacy.load("pl_core_news_lg")  # spaCy pipeline used by lemmatize_name_spacy

import requests
from goose3 import Goose


from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Token-classification checkpoint loaded through Hugging Face transformers.
model_checkpoint = "pczarnik/herbert-base-ner"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Token-level NER pipeline; extract_entities_herbert reads its 'entity' and 'word' fields.
# NOTE(review): no truncation/stride is configured here - presumably very long
# articles can exceed the model's maximum input length; verify.
nlp_herbert_ner = pipeline("ner", model=model, tokenizer=tokenizer)

import stanza
# Fetches the Polish Stanza resources each time this cell runs
# (presumably cheap when they are already cached - verify).
stanza.download('pl')
# Stanza pipeline restricted to tokenization and NER; consumed by extract_stanza.
nlp_stanza = stanza.Pipeline('pl', processors='tokenize,ner')

from rapidfuzz import fuzz



In [None]:
# FETCH ARTICLES

# df = pd.read_json('data/articles_koryta.jsonl', lines=True)
# g = Goose()

# def fetch_article(url):
#     try:
#         r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=10)
#         r.raise_for_status()
#         r.encoding = r.apparent_encoding
#         article = g.extract(raw_html=r.text)
#         return {
#             "title": article.title,
#             "text": article.cleaned_text,
#             "meta_description": article.meta_description,
#         }
#     except:
#         return {
#             "title": None,
#             "text": None,
#             "meta_description": None
#         }

# df_temp = df['url'].progress_apply(fetch_article)
# df[['title','text','meta_description']] = df_temp.apply(pd.Series)
# df.to_json('data/scraped_articles.json')

In [None]:
## FUNCTIONS
def _append_herbert_entity(entities, entity_type, pieces):
    """Store the sub-word ``pieces`` collected for one entity under ``entity_type``.

    Types that are not keys of ``entities`` are ignored instead of raising
    ``KeyError``, and the whitespace left behind by the final ``</w>`` marker is
    stripped. Empty results are not stored.
    """
    if not pieces or entity_type not in entities:
        return
    joined = ''.join(pieces).strip()
    if joined:
        entities[entity_type].append(joined)


def extract_entities_herbert(text):
    """Run the HerBERT NER pipeline on ``text`` and group its tokens into entities.

    Consecutive ``B-``/``I-`` tagged sub-word tokens of the same type are glued
    back together. The tokenizer's ``</w>`` end-of-word marker is replaced by a
    space so that multi-word entities keep their word boundaries.

    Args:
        text: Text passed unchanged to the module-level ``nlp_herbert_ner``
            pipeline.

    Returns:
        dict with exactly the keys ``'PER'``, ``'LOC'`` and ``'ORG'`` (in that
        order), each mapping to a list of entity strings in order of
        appearance. Entities of any other type are dropped.
    """
    entities = {
        'PER': [],
        'LOC': [],
        'ORG': []
    }

    current_entity = []
    current_type = None

    for token in nlp_herbert_ner(text):
        tag = token['entity']
        word = token['word'].replace('</w>', ' ')

        if tag.startswith('B-'):
            # A new entity begins: close whatever was open first.
            _append_herbert_entity(entities, current_type, current_entity)
            current_type = tag[2:]
            current_entity = [word]

        elif tag.startswith('I-') and current_type == tag[2:]:
            current_entity.append(word)

        else:
            # Any other tag (including an I- tag whose type does not match the
            # open entity) closes the open entity; the token itself is dropped.
            _append_herbert_entity(entities, current_type, current_entity)
            current_entity = []
            current_type = None

    # Flush the entity that was still open when the token stream ended.
    _append_herbert_entity(entities, current_type, current_entity)

    return entities

def fix_spacing_full_names(full_name):
    """Re-glue name fragments that were split apart by stray spaces.

    The string is split into word tokens (``\\w+``). A token starting with an
    uppercase character begins a new word; any other token is appended directly
    to the previous one. Non-word characters (e.g. hyphens) are discarded by
    the tokenization, so ``'Kowalska-Nowak'`` becomes ``'Kowalska Nowak'``.

    Args:
        full_name: Name string, possibly with spaces inside words.

    Returns:
        The repaired name. If the input contains no word tokens at all, the
        input is returned stripped of surrounding whitespace.
    """
    tokens = re.findall(r'\b\w+\b', full_name)

    if not tokens:
        # Previously this branch returned an undefined name and raised
        # NameError for empty or punctuation-only input.
        return full_name.strip()

    parts = [tokens[0]]
    for token in tokens[1:]:
        # Uppercase start => new word; otherwise a continuation fragment.
        parts.append(' ' + token if token[0].isupper() else token)

    return ''.join(parts)

# # CONVERT FULL NAMES TO NOMINATIVE CASE (disabled Morfeusz variant)
# def name_to_nominative_morf(full_name):
#     morfeusz = morfeusz2.Morfeusz()
#     words = full_name.split()
#     nominative_words = []

#     for word in words:
#         analyses = morfeusz.analyse(word)
#         selected_lemma = None

#         for _, _, morph in analyses:
#             lemma = morph[0]
#             meanings = morph[3]  # list of meanings

#             if 'imię' in meanings or 'nazwisko' in meanings:
#                 selected_lemma = lemma
#                 break  # take the first form encountered that is tagged as a first name or surname

#         if selected_lemma:
#             nominative_words.append(selected_lemma)
#         else:
#             # fallback - first lemma regardless of meanings
#             nominative_words.append(analyses[0][2][0])

#     return ' '.join(nominative_words)

def lemmatize_name_spacy(name):
    """Return ``name`` with every token replaced by its lemma.

    The text is processed with the module-level ``nlp_spacy`` pipeline and the
    ``lemma_`` of each resulting token is joined with single spaces.
    """
    return " ".join(tok.lemma_ for tok in nlp_spacy(name))

def extract_stanza(document, pos_type):
    """Collect the entities of one NER type from a Stanza-processed document.

    Tokens are read from ``document.sentences[*].tokens`` and their ``ner``
    attribute is matched against the tags ``B-``/``I-``/``E-`` (multi-token
    entity) and ``S-`` (single-token entity) followed by ``pos_type``.

    Args:
        document: Object exposing ``sentences``; each sentence exposes
            ``tokens`` whose items have ``text`` and ``ner`` attributes.
        pos_type: Entity type suffix to extract, e.g. ``'persName'``,
            ``'orgName'`` or ``'placeName'``.

    Returns:
        List of entity strings in order of appearance; the tokens of a
        multi-token entity are joined with single spaces.
    """
    begin_tag = f'B-{pos_type}'
    inside_tag = f'I-{pos_type}'
    end_tag = f'E-{pos_type}'
    single_tag = f'S-{pos_type}'

    findings = []

    for sentence in document.sentences:
        # Partial spans are not carried over from one sentence to the next.
        current = []
        for token in sentence.tokens:
            ner_tag = token.ner
            text = token.text

            if ner_tag == begin_tag:
                # Start a fresh span; any unfinished one is discarded.
                current = [text]
            elif ner_tag == inside_tag:
                current.append(text)
            elif ner_tag == end_tag:
                current.append(text)
                findings.append(' '.join(current))
                # Reset the buffer. The original reset a different, unused name,
                # so every later entity also contained all earlier tokens.
                current = []
            elif ner_tag == single_tag:
                findings.append(text)
                current = []
            else:
                # Token outside the requested type interrupts any open span.
                current = []

    return findings

def texts_similarity(s1, s2):
    """Return ``fuzz.ratio(s1, s2)`` divided by 100."""
    return fuzz.ratio(s1, s2) / 100.0

def max_with_default_zero(lst):
    """Return the largest element of ``lst``, or 0 when ``lst`` is falsy (e.g. empty)."""
    if not lst:
        return 0
    return max(lst)


# double metaphone 

In [None]:
# Load the scraped articles and keep only rows that actually have text.
df = pd.read_json('data/scraped_articles.json')
has_text = (df['text'] != '') & ~(df['text'].isnull())
df = df[has_text].reset_index(drop=True)

In [None]:
## HERBERT
# Run the HerBERT extractor on every article text.
df['herbert_ner_entities'] = df['text'].progress_apply(extract_entities_herbert)
# Spread the per-article result dicts into one column per entity type.
df[['PER_herbert','LOC_herbert','ORG_herbert']] = df['herbert_ner_entities'].apply(pd.Series)
# Normalise person names: repair spacing first, then lemmatize.
df['PER_herbert'] = df['PER_herbert'].apply(
    lambda names: [lemmatize_name_spacy(fix_spacing_full_names(name)) for name in names]
)
# True when the best similarity between `mentioned_person` and any detected person exceeds 0.8.
df['PER_match_herbert'] = df.apply(
    lambda row: max_with_default_zero(
        [texts_similarity(row['mentioned_person'], name) for name in row['PER_herbert']]
    ),
    axis=1,
) > 0.8

In [None]:
## STANZA
# Annotate every article text with the Stanza pipeline.
df['stanza_entities'] = df['text'].progress_apply(nlp_stanza)
# Persons: extract persName spans, repair spacing, then lemmatize each name.
df['PER_stanza'] = (
    df['stanza_entities']
    .progress_apply(extract_stanza, args=('persName',))
    .apply(lambda names: [lemmatize_name_spacy(fix_spacing_full_names(name)) for name in names])
)
# Organisations and places are kept as extracted.
df['ORG_stanza'] = df['stanza_entities'].progress_apply(extract_stanza, args=('orgName',))
df['LOC_stanza'] = df['stanza_entities'].progress_apply(extract_stanza, args=('placeName',))
# True when the best similarity between `mentioned_person` and any detected person exceeds 0.8.
df['PER_match_stanza'] = df.apply(
    lambda row: max_with_default_zero(
        [texts_similarity(row['mentioned_person'], name) for name in row['PER_stanza']]
    ),
    axis=1,
) > 0.8