In [1]:
from extract import Extractor
from WikiExtractor import collect_pages, decode_open

In [29]:
%cd D:\WorkFolder\hub_vectors

from taxoenrich.models import RuWordNet

thesaurus = RuWordNet(r'D:\WorkFolder\data\models\rwn-2021-05-02')

D:\WorkFolder\hub_vectors


In [3]:
list(thesaurus.senses)[:10]

['рыбный_хозяйство',
 'жуковчанин',
 'наркосбытчица',
 'очамчира',
 'партнер',
 'тепловизионный',
 'попомнить',
 'дирижировать',
 'молиться',
 'радиосвязь']

In [4]:
[s for s in thesaurus.senses if '(' in s]

[]

In [27]:
import pymorphy2
from functools import lru_cache
morph_analizer = pymorphy2.MorphAnalyzer()

@lru_cache(maxsize=200000)
def get_normal_form(word):
    return morph_analizer.parse(word)[0].normal_form

def check_in_thesaurus(title, thesaurus):
    title = title.lower().replace(' ', '_')
    if title in thesaurus.senses:
        return title
    
    if '_(' in title:
        title = title[:title.find('_(')]
    if title in thesaurus.senses:
        return title

    if '(' in title:
        title = title[:title.find('(')]
    if title in thesaurus.senses:
        return title

    title = '_'.join([get_normal_form(w) for w in title.split('_')])
    if title in thesaurus.senses:
        return title

    return ''
    
def check_wiki_synset_in_thesaurus(wiki_synset, thesaurus):
    return len([p.title for p in wiki_synset.synset if len(check_in_thesaurus(p.title, thesaurus)) > 0]) > 0


In [3]:
import re
from tqdm import tqdm

#meanings_regex = re.compile(r'\{\{другие значения\|(.*?)\}\}|\{\{значения\|(.*?)\}\}|\{\{другие значения термина\|(.*?)\}\}')
references_regex = re.compile(r'\[\[(.*?)\]\]')

def check_disambiguation_page(text):
    disambiguation_phrases = ['{{неоднозначность', '{{многозначность', '{{disambig']
    for phrase in disambiguation_phrases:
        if phrase in text:
            return True
    return False

def check_ambiguous_page(text):
    ambiguous_phrases = ['{{значения', '{{другие значения', '{{другое значение']
    for phrase in ambiguous_phrases:
        if phrase in text:
            return True
    return False

def get_references(lines):
    ref2line = {}
    for line in lines:
        references = re.findall(references_regex, line)
        if len(references) > 0:
            title = references[0]
            if '|' in title:
                title = title[:title.find('|')]
            if '#' in title:
                continue
            ref2line[title] = line
    return ref2line

'''
def get_redirects(lines):
    redirect_page = False
    text = ''.join(lines)
    redirect_phrases = ['#перенаправление', '#redirect']
    for phrase in redirect_phrases:
        if phrase in text.lower():
            redirect_page = True
            break

    if not redirect_page:
        return []

    redirect_lines = []
    for line in lines:
        for phrase in redirect_phrases:
            if phrase in line.lower():
                redirect_lines.append(line)

    return get_references(redirect_lines)
'''

"\ndef get_redirects(lines):\n    redirect_page = False\n    text = ''.join(lines)\n    redirect_phrases = ['#перенаправление', '#redirect']\n    for phrase in redirect_phrases:\n        if phrase in text.lower():\n            redirect_page = True\n            break\n\n    if not redirect_page:\n        return []\n\n    redirect_lines = []\n    for line in lines:\n        for phrase in redirect_phrases:\n            if phrase in line.lower():\n                redirect_lines.append(line)\n\n    return get_references(redirect_lines)\n"

In [10]:
class Page:
    def __init__(self, title, page_id, is_amb, is_disamb, disamb_refs, redirect):
        self.title = title
        self.page_id = page_id
        self.is_amb = is_amb
        self.is_disamb = is_disamb
        self.disamb_refs = disamb_refs
        self.redirect = redirect

class WikiSynset():
    def __init__(self, page):
        self.page = page
        self.synset = [page]

    def append(self, redirect_page):
        self.synset.append(redirect_page)

def collect_data(dump_path):
    #title2id = {}
    #redirect_pages = {}
    #ambiguous_pages = set()
    #disamb_pages = set()
    #title2disamb = {}
    pages = []
    file_handler = decode_open(dump_path)
    for i, data in enumerate(tqdm(collect_pages(file_handler))):
        is_amb = False
        is_disamb = False
        references = []

        title = data[2]
        page_id = data[0]
        lines = data[3]
        text = ''.join(data[3])
        text_l = text.lower()
        #title2id[title] = page_id
        redirect_page = data[4]
        #if len(redirect_page) > 0:
        #    redirect_pages[title] = redirect_page

        if check_ambiguous_page(text_l):
            is_amb = True
            #ambiguous_pages.add(title)

        if check_disambiguation_page(text_l):
            is_disamb = True
            #disamb_pages.add(title)
            references = get_references(lines)
            '''
            for ref_title, ref_line in references.items():
                if ref_title in redirect_pages:
                    ref_title = redirect_pages[ref_title]
                if ref_title not in title2disamb:
                    title2disamb[ref_title] = []
                title2disamb[ref_title].append([title, ref_line])
            '''

        pages.append(Page(title, page_id, is_amb, is_disamb, references, redirect_page))

    return pages

In [7]:
dump_path = r'D:\WorkFolder\data\ruwiki-20220701-pages-meta-current.xml.bz2'
pages = collect_data(dump_path)

4432391it [21:08, 3494.21it/s] 


In [9]:
len(pages), len([p for p in pages if p.is_amb]), len([p for p in pages if p.is_disamb]), len([p for p in pages if len(p.redirect) > 0])

(4432391, 297976, 129000, 2606107)

In [11]:
title2page = {p.title: p for p in pages}

In [20]:
wiki_synsets = {p.title: WikiSynset(p) for p in pages if len(p.redirect) == 0}
for p in tqdm(pages):
    if len(p.redirect) > 0 and p.redirect in wiki_synsets:
        wiki_synsets[p.redirect].append(p)

len(wiki_synsets)

100%|██████████| 4432391/4432391 [00:03<00:00, 1111990.03it/s]


1826284

In [22]:
def wiki_synset_is_amb(wiki_synset):
    return len([page for page in wiki_synset.synset if page.is_amb]) > 0

def wiki_synset_is_disamb(wiki_synset):
    return len([page for page in wiki_synset.synset if page.is_disamb]) > 0

In [24]:
wiki_synsets_amb = set([ws for ws in wiki_synsets if wiki_synset_is_amb(wiki_synsets[ws])])
wiki_synsets_disamb = set([ws for ws in wiki_synsets if wiki_synset_is_disamb(wiki_synsets[ws])])
wiki_synsets_single_meaning = set([ws for ws in wiki_synsets if not wiki_synset_is_disamb(wiki_synsets[ws]) and not wiki_synset_is_amb(wiki_synsets[ws])])

In [25]:
len(wiki_synsets_amb), len(wiki_synsets_disamb), len(wiki_synsets_single_meaning)

(297975, 128994, 1401921)

In [31]:
wiki_synsets_in_thes = {title: ws for title, ws in tqdm(wiki_synsets.items()) if check_wiki_synset_in_thesaurus(ws, thesaurus)}
len(wiki_synsets_in_thes)

100%|██████████| 1826284/1826284 [06:01<00:00, 5051.90it/s]


121035

In [34]:
len([t for t in tqdm(title2page) if check_in_thesaurus(t, thesaurus)])

100%|██████████| 4432391/4432391 [06:41<00:00, 11029.80it/s]


164716

In [15]:
p.title, p.redirect

('Русские Американцы', 'Русская диаспора в США')

In [19]:
title2page['Русская диаспора в США'].title, title2page['Русская диаспора в США'].redirect

('Русская диаспора в США', 'Русские американцы')

In [7]:
def collect_titles(dump_path):
    titles = []
    redirect_pages = {}
    file_handler = decode_open(dump_path)
    for i, data in enumerate(tqdm(collect_pages(file_handler))):
        title = data[2]
        doc_word_len = data[6]
        titles.append([title, doc_word_len])
        if data[4] and len(data[5]) > 0:
            redirect_pages[title] = data[5]
    return titles, redirect_pages

def collect_redirects(dump_path, all_titles):
    redirect_pages = {}
    file_handler = decode_open(dump_path)
    for i, data in enumerate(tqdm(collect_pages(file_handler))):
        lines = data[3]
        title = data[2]
        is_redirect = data[-1]
        redirects = get_redirects(lines, all_titles)
        if is_redirect != len(redirects) > 0:
            print(data)
        if len(redirects) > 0:
            redirect_pages[title] = redirects
    return redirect_pages
        

In [None]:
from tqdm import tqdm

def collect
file_handler = decode_open(dump_path)
ambiguous_words = set()
title2id = {}
meanings_pages = []
title2disamb = {}
disamb_pages = set()
for i, data in enumerate(tqdm(collect_pages(file_handler))):
    lines = data[3]
    text = ''.join(lines)
    page_id = data[0]
    title = data[2]

    title2id[title] = page_id
    if check_ambiguous_page(text):
        ambiguous_words.add(title)
    elif check_disambiguation_page(text):
        disamb_pages.add(title)
        references = get_references(lines, all_title_names)
        for ref_title, ref_line in references.items():
            if ref_title in redirect_pages:
                ref_title = redirect_pages[ref_title]
            if ref_title not in title2disamb:
                title2disamb[ref_title] = []
            title2disamb[ref_title].append([title, ref_line])

In [8]:
dump_path = r'D:\WorkFolder\data\ruwiki-20220701-pages-meta-current.xml.bz2'
all_titles, redirect_pages = collect_titles(dump_path)
all_title_names = set([t[0] for t in all_titles])

4432391it [20:17, 3641.98it/s] 


In [8]:
sorted([[t, l] for t, l in all_titles if t not in redirect_pages], key=lambda x: (x[1], x[0]))[:10]

[['Proton Perdana (второе поколение)', 0],
 ['Proton Perdana (первое поколение)', 0],
 ['Бутандиол', 0],
 ['Гомосексуальная порнография', 0],
 ['Кематен', 0],
 ['Немировичи', 0],
 ['Санкт-Леонхард', 0],
 ['Санкт-Марайн', 0],
 ['Черёмушки (усадьба)', 0],
 ['China', 1]]

In [106]:
len([t for t in tqdm(list(all_title_names)[:200000]) if not check_in_thesaurus(t)])

100%|██████████| 200000/200000 [00:00<00:00, 391381.83it/s]


192578

In [107]:
titles_in_thesaurus = set()
for title in tqdm(all_title_names):
    if check_in_thesaurus(title):
        titles_in_thesaurus.add(title)

100%|██████████| 4432391/4432391 [08:17<00:00, 8914.92it/s]


In [9]:
class WikiSynset():
    def __init__(self, title):
        self.title = title
        self.synset = [title]

    def append(self, redirect_title):
        self.synset.append(redirect_title)

In [13]:
len(redirect_pages), len(all_title_names)

(2606107, 4432391)

In [10]:
wiki_synsets = {title: WikiSynset(title) for title in all_title_names if title not in redirect_pages}
for title in tqdm(redirect_pages):
    redirect_to = redirect_pages[title]
    if redirect_to in wiki_synsets:
        wiki_synsets[redirect_to].append(title)

100%|██████████| 2606107/2606107 [00:03<00:00, 784499.73it/s]


In [11]:
len(wiki_synsets)

1826284

In [146]:
wiki_synsets_in_thesaurus = []
for main_title, wiki_synset in wiki_synsets.items():
    for title in wiki_synset.synset:
        if title in titles_in_thesaurus:
            wiki_synsets_in_thesaurus.append(wiki_synset)

In [147]:
len(wiki_synsets_in_thesaurus)

142447

In [139]:
len(wiki_synsets_in_thesaurus)

164645

In [155]:
def check_if_ambi_wikisynset(wiki_synset, title2disamb):
    for title in wiki_synset.synset:
        if title in title2disamb:
            return True

    return False
    

In [157]:
len([s for s in wiki_synsets_in_thesaurus if not check_if_ambi_wikisynset(s, title2disamb)]), len(wiki_synsets_in_thesaurus)

(33626, 142447)

In [159]:
[s.title for s in wiki_synsets_in_thesaurus if not check_if_ambi_wikisynset(s, title2disamb)][:10]

['Карбас (деревня)',
 'Месть (фильм, 1970)',
 'Лага, Март Эрихович',
 'Страж (канонерская лодка)',
 'Дератизация (Секретные материалы)',
 'Устрица',
 'Мерседес (муниципалитет)',
 'Компания с ограниченной ответственностью',
 'Компания с ограниченной ответственностью',
 'Юго-Запад (исторический район)']

In [148]:
[s.synset for s in wiki_synsets_in_thesaurus[:10]]

[['Петропавловка (Красногвардейский район)'],
 ['Лагерь (фильм, 2007)'],
 ['Прогресс (Кугарчинский район)'],
 ['Шахе (посёлок)'],
 ['Карбас (деревня)'],
 ['Месть (фильм, 1970)'],
 ['Октябрьское (Майский район)', 'Сельское поселение Октябрьское'],
 ['Лага, Март Эрихович', 'Март Эрихович Лага', 'Лага Март Эрихович', 'Лага'],
 ['Концертная программа', 'Сет-лист', 'Сет (музыка)'],
 ['Концертная программа', 'Сет-лист', 'Сет (музыка)']]

In [152]:
title2disamb['Прогресс (Кугарчинский район)']

[['Прогресс (значения)',
  '* [[Прогресс (Кугарчинский район)|Прогресс]]\xa0— деревня в Кугарчинском районе Башкортостана.\n']]

In [137]:
len(wiki_synsets)

1826284

In [134]:
title

'Magic The Gathering'

In [135]:
redirect_pages[title]

'Magic: The Gathering'

In [109]:
len(titles_in_thesaurus)

164716

In [111]:
titles_norm_in_thesaurus = set([check_in_thesaurus(t) for t in titles_in_thesaurus])

In [112]:
len(titles_norm_in_thesaurus)

39612

In [119]:
thesaurus.synsets['130542-N']

<taxoenrich.models.SynSet at 0x29c4c09da00>

In [121]:
thesaurus_noun_senses = set()
for synset in thesaurus.synsets.values():
    if synset.synset_type == 'N':
        thesaurus_noun_senses.update(synset.synset_words)

len(thesaurus_noun_senses)

85438

In [125]:
thesaurus.sense2synid['город_родник'][0]

'101008-N'

In [128]:
thesaurus.synsets['101008-N'].synset_words

{'город_родник'}

In [None]:
[t for t in thesaurus_noun_senses if t not in titles_norm_in_thesaurus][:100]

In [5]:
len(redirect_pages)

2606107

In [7]:
len([title for title in all_titles if title not in redirect_pages])

1826284

In [None]:
redirect_pages = collect_redirects(dump_path, all_titles)

In [None]:
len([t for t in all_titles if t not in redirect_pages])

In [None]:
redirect_pages['Петроград']

In [None]:
for rp, redirects in redirect_pages.items():
    if len(redirects) > 1:
        print(rp)
        break

In [None]:
rp

In [None]:
redirects

4432391it [27:42, 2665.96it/s] 


In [28]:
len(title2id)

4432391

In [29]:
len(ambiguous_words)

297976

In [30]:
len(disamb_pages)

126397

In [26]:
len([title for title in title2disamb if title in ambiguous_words])

NameError: name 'title2disamb' is not defined

In [78]:
[title for title in title2disamb if title in ambiguous_words][6950:6960]

['ИРЭ-Полюс',
 'Полюс (космический аппарат)',
 'Полюса (альбом)',
 'Уды',
 'Яр Сухой Донец',
 'Донец (Ленинградская область)',
 'Донец (Орловская область)',
 'Донец (Смоленская область)',
 'Донец (Балаклейский район)',
 'Донец (Змиёвский район)']

In [87]:
title2disamb['Генерал-адмирал Апраксин (броненосец)']

[['Окиносима (значения)',
  '* [[Генерал-адмирал Апраксин (броненосец)|Окиносима]] — название японского броненосца; до этого, в ВМФ России, носил имя «Генерал-адмирал Апраксин».\n']]

In [93]:
d

['Сосново',
 '* [[Сосново (Курганская область)|Сосново]]\xa0— деревня в Мишкинском районе Курганской области.\n']

In [94]:
for t, data in title2disamb.items():
    for d in data:
        if '—' not in d[1]:
            print(d)

['Лебяжий', '* Лебяжий рукав р. [[Волга]] (недалеко от г. Астрахань)\n']
['ГАС', '*[[ГАЗ]]\n']
['Горьковский', '* [[Горьковский автомобильный завод]]\n']
['Имени Молотова', '* [[Горьковский автомобильный завод|Завод имени Молотова]]\n']
['Щербаков', ' |isbn          = 5-89216-001-7 }}&lt;/ref&gt;. Известна с XVII века как дворянская фамилия [[Вятка|Вятки]]&lt;ref&gt;[https://rodnaya-vyatka.ru/families/shcherbakov Щербаков]&lt;/ref&gt;, а также [[Нижегородская губерния|Нижегородской]]&lt;ref&gt;[http://www.gttp.ru/subs/bouquet_2.htm Список дворянских родов Нижегородской губернии]&lt;/ref&gt; и [[Черниговская губерния|Черниговской губернии]]&lt;ref&gt;[http://genealogy-ua.com/%D0%B4%D0%B2%D0%BE%D1%80%D1%8F%D0%BD%D0%B5-%D1%87%D0%B5%D1%80%D0%BD%D0%B8%D0%B3%D0%BE%D0%B2%D1%81%D0%BA%D0%BE%D0%B9-%D0%B3%D1%83%D0%B1%D0%B5%D1%80%D0%BD%D0%B8%D0%B8/ Дворяне Черниговской губернии]&lt;/ref&gt;.\n']
['Кирово', '* [[Киров]]\n']
['Кировский район', '* [[Киров]]\n']
['Кировский', '* [[Киров]]\n']
['Киров

In [90]:
data

[['Волга (значения)',
  "* '''[[Волга]]'''\xa0— река в России, самая длинная река Европы.\n"],
 ['Ра (значения)',
  "* '''Ра''' ({{lang-la|Rha}})\xa0— название реки [[Волга|Волги]] у [[античность|античных]] авторов первых веков [[н. э.]] ([[Клавдий Птолемей]] и [[Аммиан Марцеллин]]).\n"],
 ['Лебяжий', '* Лебяжий рукав р. [[Волга]] (недалеко от г. Астрахань)\n'],
 ['Сухая Самарка',
  "* '''Сухая Самарка'''\xa0— протока, которую образует [[Волга]] вокруг острова Коровий.\n"],
 ['Приволжье',
  "'''Приволжье'''\xa0— местность, связанная с [[Волга|Волгой]] и бассейном этой реки.\n"],
 ['Воложка',
  "{{начало цитаты}}''Во́ложка'' распространённое название рукавов Волги, образующихся чаще всего после половодья, ''поволжск''. По второму полногласию\xa0— из *Вължька «маленькая [[Волга]]»{{конец цитаты|источник=Фасмер М.&lt;ref&gt;{{Фасмер|Воложка|том=1|страницы=341|ref=Фасмер}}&lt;/ref&gt;}}\n"],
 ['Малая Чёрная',
  '* Малая Чёрная — река в Астраханской области, в дельте [[Волга|Волги]].\n'],
 

In [86]:
list(ambiguous_words)[:10]

['Жезл',
 'Paranoid',
 'Генерал-адмирал Апраксин (броненосец)',
 'Измайловка (Краснодарский край)',
 'Амангельды (Енбекшиказахский район)',
 'Инвернесс (графство, Новая Шотландия)',
 'Акжар (Казыгуртский район)',
 'Козельское (Калужская область)',
 'Маппеты (фильм, 1979)',
 'Ивковцы (Черниговская область)']

In [83]:
'Сухой Донец' in title2id

True

In [13]:
redirect_pages[ref_title]

'Горьковский автомобильный завод'

In [None]:
file_handler = decode_open(dump_path)

for i, data in enumerate(tqdm(collect_pages(file_handler))):
    title = data[2]
    if title == 'Урал (телерадиокомпания)':
        break

In [None]:
data

In [None]:
len(ambiguous_words)

In [None]:
len(title2disamb)

In [None]:
len([title for title in ambiguous_words if title in title2id and title not in title2disamb])

In [None]:
len([title for title in title2disamb if title in title2id and title not in ambiguous_words])

In [None]:
redirect_pages['Урал (телерадиокомпания)']

In [None]:
title2disamb['Урал (телерадиокомпания)']

In [None]:
redirect_pages['Урал (телерадиокомпания)']

In [None]:
[title for title in title2disamb if title in title2id and title not in ambiguous_words][910:920]

In [None]:
data_path = r'D:\WorkFolder\data\ruwiki-20220701-pages-meta-current.xml.bz2'
file_handler = decode_open(data_path)
short_docs = []
redirect_docs = []
redirect_pages = {}
for i, data in enumerate(tqdm(collect_pages(file_handler))):
    #lines = data[3]
    #text = ''.join(lines)
    #page_id = data[0]
    title = data[2]
    redirects = get_redirects(data[3])
    if len(redirects) > 0:
        redirect_pages[title] = redirects
    elif len(''.join(data[3])) < 100:
        short_docs.append(data)
        
    if i == 100000:
        break

In [None]:
data

In [None]:
for 
get_redirects(redirect_docs[1][3])

In [None]:
redirect_docs[:10]

In [None]:
len(short_docs)

In [None]:
short_docs[:100]

In [None]:
max([len(''.join(d[3])) for d in redirect_docs])

In [None]:
[d for d in redirect_docs if len(''.join(d[3])) > 200]

In [None]:
data[3]

In [None]:
get_references(data[3])

In [None]:
'Второй бой в заливе Сирт'

In [None]:
len([title for title in ambiguous_words if title in title2id and title not in title2disamb])

In [None]:
len([title for title in title2disamb if title in title2id and title not in ambiguous_words])

In [None]:
[title for title in title2disamb if title in title2id and title not in ambiguous_words]

In [None]:
print(list(title2disamb.keys())[:10])

In [None]:
[w for w in title2disamb if '#' in w]

In [None]:
regex_bracers = re.compile(r'\{\{(.*?)\}\}')
bracers_content_stat = {}
for title, page_data in tqdm(title2id.items()):
    for bracers_content in re.findall(regex_bracers, page_data[1]):
        bracers_content = bracers_content.strip()
        if '|' in bracers_content:
            bracers_content = bracers_content[:bracers_content.find('|')].strip()
        if len(bracers_content) > 0:
            if bracers_content not in bracers_content_stat:
                bracers_content_stat[bracers_content] = 0
            bracers_content_stat[bracers_content] += 1

In [None]:
startswith = ['значения', 'другие значения', 'другое значение', 'значения3', 'значения2']

In [None]:
sorted([[bcontent, count] for bcontent, count in bracers_content_stat.items() if len([sw for sw in startswith if bcontent.lower().startswith(sw)])], key=lambda x: -x[1])[:100]

In [None]:
title2id['Волга'][1]

In [None]:
len(title2disamb), len(ambiguous_words)

In [None]:
title2disamb['Экономика']

In [None]:
title2id['Хор (мифология)']

In [None]:
redirect_phrases = ['#перенаправление']

In [None]:
meanings_regex2 = re.compile(r'\{\{другие значения(.*?)\|(.*?)\}\}|\{\{значения(.*?)\|(.*?)\}\}')

In [None]:
re.search(meanings_regex2, title2id['Малая Волга'][1].lower())

In [None]:
for title, amb in ambiguous_words.items():
    if len(amb[1]) > 1:
        break

In [None]:
title

In [None]:
amb

In [None]:
title2disamb['Гор (бог)']

In [None]:
title2id['Гор'][1]

In [None]:
ambiguous_words

In [None]:
[title for title in title2disamb if title in title2id and title not in ambiguous_words]

In [None]:
len([title for title in title2disamb if title in title2id and title not in ambiguous_words]), len(title2id)

In [None]:
100 * len([title for title in title2disamb if title in title2id and title in ambiguous_words]) / len(title2id)

In [None]:
title2disamb

In [None]:
ambiguous_words

In [None]:
data

In [None]:
def check_disambiguation_page(text):
    disambiguation_phrases = ['{{неоднозначность', '{{Многозначность', '{{Disambig']
    for phrase in disambiguation_phrases:
        if phrase in text:
            return True
    return False

In [None]:
data

In [None]:
len(polysemantic_words)

In [None]:
len(meanings_pages)

In [None]:
len(meanings_pages)

In [None]:
len(meanings_pages)

In [None]:
get_references(meanings_pages[10][3])

In [None]:
references_regex = re.compile(r'\[\[(.*?)\]\]')
re.findall(references_regex, ''.join(meanings_pages[10][3]))

In [None]:
meanings_pages[10]

In [None]:
len(meanings_pages)

In [None]:
meanings_pages[101]

In [None]:
[p for p in meanings_pages if 'Лук' in p[2]]

In [None]:
len(polysemantic_words)

In [None]:
len(polysemantic_words)

In [None]:
polysemantic_words[:100]

In [None]:
len(w2id)

In [None]:
data = meanings_pages[5]

In [None]:
import codecs
Extractor.to_json = True
with codecs.open('temp.txt', 'w', 'utf-8') as out:
    Extractor(data[0], data[1], '', data[2], data[3]).extract(out)

In [None]:
data

In [None]:
import re

meanings_regex = re.compile(r'\{\{другие значения\|(.*?)\}\}')
re.findall(meanings_regex, ''.join(data[3]))

In [None]:
''.join(data[3])[:100]

In [None]:
Extractor(data[0], data[1], '', data[2], data[3]).expandTemplates(''.join(data[3]))

In [None]:


data[3]

In [19]:
from mwsql import Dump
dump = Dump.from_file(r'D:\WorkFolder\data\ruwiki-20220701-redirect.sql.gz')


In [48]:
dump_pagelinks = Dump.from_file(r'D:\WorkFolder\data\ruwiki-20220701-pagelinks.sql.gz')

In [55]:
dump_pagelinks.head(1000)

['pl_from', 'pl_namespace', 'pl_title', 'pl_from_namespace']
['6062007', '0', '!', '0']
['6220905', '0', '!', '0']
['9248814', '0', '!', '0']
['85604', '0', '!!!', '0']
['452225', '0', '!!!', '0']
['902395', '0', '!!!', '0']
['1065760', '0', '!!!', '0']
['2014111', '0', '!!!', '0']
['2494612', '0', '!!!', '0']
['2731717', '0', '!!!', '0']
['4137541', '0', '!!!', '0']
['4708795', '0', '!!!', '0']
['5429598', '0', '!!!', '0']
['6062007', '0', '!!!', '0']
['6704143', '0', '!!!', '0']
['7381698', '0', '!!!!!!!', '0']
['7835247', '0', '!!!!!!!', '0']
['7835428', '0', '!!!!!!!', '0']
['7835429', '0', '!!!!!!!', '0']
['7835431', '0', '!!!!!!!', '0']
['7835432', '0', '!!!!!!!', '0']
['7835439', '0', '!!!!!!!', '0']
['7836246', '0', '!!!!!!!', '0']
['7836290', '0', '!!!!!!!', '0']
['7838350', '0', '!!!!!!!', '0']
['7910256', '0', '!!!!!!!', '0']
['7923392', '0', '!!!!!!!', '0']
['7960044', '0', '!!!!!!!', '0']
['8060753', '0', '!!!!!!!', '0']
['8060922', '0', '!!!!!!!', '0']
['8061226', '0', '!

In [59]:
import mwxml
dump = mwxml.Dump.from_file(decode_open(dump_path))

In [63]:
for p in dump.pages:
    break
p

Page(id=7, title='Литва', namespace=0, redirect=None, restrictions=[])

In [64]:
p.to_json()

{'id': 7, 'title': 'Литва', 'namespace': 0, 'restrictions': []}

In [58]:
dump_path

'D:\\WorkFolder\\data\\ruwiki-20220701-pages-meta-current.xml.bz2'

In [27]:
id2title = {id: title for title, id in title2id.items()}

In [47]:
redirect_titles_new = set()
for row in tqdm(dump.rows()):
    id_from = row[0]
    title_to = row[2].replace('_', ' ')

    if id_from in redirect_titles_new:
        print(1)
        print(row)
        break

    redirect_titles_new.add(id_from)
    if id_from not in id2title:
        print(2)
        print(row)
        break
    if title_to not in title2id:
        print(3)
        print(row)
        break
    if id2title[id_from] not in redirect_pages:
        print(4)
        print(row)
        break


103it [00:00, 4904.67it/s]

2
['1302', '4', 'Проекты', '', '']





In [46]:
title2id['Заглавная страница']

'4401'

In [44]:
id_from

'9787305'

In [43]:
'92' in redirect_titles_new

False

In [30]:
row

['9787479', '0', 'Подкожные_оводы', '', '']

In [34]:
len(redirect_pages), dump.size

(2606107, 35125865)

In [36]:
dump.head(100)

['rd_from', 'rd_namespace', 'rd_title', 'rd_interwiki', 'rd_fragment']
['4', '0', 'Заглавная_страница', '', '']
['92', '0', 'Санкт-Петербург', '', '']
['95', '0', 'Волга', '', '']
['113', '0', 'Содружество_наций', '', '']
['176', '0', 'Инфракрасная_спектроскопия', '', '']
['179', '0', 'Дифракция', '', '']
['181', '0', 'Дифракция_электронов', '', '']
['190', '0', 'Вирусы', '', '']
['199', '0', 'Ричард_III', '', '']
['203', '0', 'Мор,_Томас', '', '']
['205', '0', 'Компьютер', '', '']
['210', '0', 'Герб_России', '', '']
['220', '0', 'Ценная_бумага', '', '']
['257', '0', 'Сталин,_Иосиф_Виссарионович', '', '']
['262', '0', 'Язык_программирования', '', '']
['273', '0', 'Алюмогидрид_лития', '', '']
['282', '0', 'Нитрат_натрия', '', '']
['287', '0', 'Сульфат_железа(III)-аммония', '', '']
['289', '0', 'Сухой_лёд', '', '']
['290', '0', 'Сульфат_железа(III)-калия', '', '']
['293', '0', 'Сульфат_меди(II)', '', '']
['294', '0', 'Селитра', '', '']
['297', '0', 'Генераторный_газ', '', '']
['298', '0'

In [22]:
from ruwordnet import RuWordNet
wn = RuWordNet(filename_or_session=r'D:\WorkFolder\data\models\rwn-2021-05-02\ruwordnet.db')



In [23]:
for sense in wn.get_senses('Герц'):
    for hypernym in sense.synset.hypernyms:
        print(hypernym)
        for sister in hypernym.hyponyms:
            print(sister)

Synset(id="138637-N", title="ЕДИНИЦА ЧАСТОТЫ")
Synset(id="130554-N", title="ГЕРЦ (ЕДИНИЦА ИЗМЕРЕНИЯ)")
Synset(id="138088-N", title="МЕГАГЕРЦ")
Synset(id="138156-N", title="КИЛОГЕРЦ")
Synset(id="165881-N", title="ГИГАГЕРЦ")


In [24]:
print(wn.get_senses("Герц")[5].synset.hypernyms)

IndexError: list index out of range

In [25]:
wn.get_senses("Герц")

[Sense(id="130554-N-181134", name="ГЕРЦ")]