In [2]:
import json
import os
import re
import string
from glob import glob
from subprocess import check_output

import mwxml
import pandas as pd
import nltk
from tqdm import tqdm

In [3]:
xml_path = './shards'
xmlfile = glob(f"{xml_path}/*.xml-*")

xmlsingle = './enwiki-20220301-pages-articles-multistream.xml'
index = './enwiki-20220301-pages-articles-multistream-index.txt'

keywords_file = './keywords.txt'
stemmer = nltk.stem.porter.PorterStemmer()
with open(keywords_file, 'r') as f:
    keywords = eval(f.read())
    stem = [stemmer.stem(x) for x in keywords]
    keywords = set(stem + keywords)
    print(f'{len(keywords)} keywords with stems provided')

keywords_threshold = 0

nthreads = os.cpu_count()-1

76 keywords with stems provided


In [54]:
keywords

{'achiev',
 'achievement',
 'afterclap',
 'aftereffect',
 'aftermath',
 'aftershock',
 'assess',
 'assessment',
 'backwash',
 'blowback',
 'by-product',
 'causal',
 'causality',
 'conclus',
 'conclusion',
 'concuss',
 'concussion',
 'consequ',
 'consequence',
 'corollari',
 'corollary',
 'denouement',
 'develop',
 'development',
 'echo',
 'effect',
 'end',
 'epilogu',
 'epilogue',
 'fallout',
 'fate',
 'final',
 'fruit',
 'histor',
 'historical',
 'impact',
 'imping',
 'impingement',
 'implic',
 'implication',
 'impress',
 'impression',
 'influenc',
 'influence',
 'issu',
 'issue',
 'jounc',
 'jounce',
 'mark',
 'offshoot',
 'outcom',
 'outcome',
 'outgrowth',
 'precipit',
 'precipitate',
 'product',
 'production',
 'ramif',
 'ramification',
 'repercuss',
 'repercussion',
 'result',
 'resultant',
 'rippl',
 'ripple',
 'sequel',
 'sequenc',
 'sequence',
 'side effect',
 'side react',
 'side reaction',
 'signific',
 'significance',
 'spin-off',
 'sway',
 'upshot'}

In [4]:
punc_space = re.compile("[^" + string.punctuation + " \t\n\r\f\v]+")
p = re.compile("{{[mM]ain(?: article)?\|(?P<backlink>[^\<\[\}]*)}}")
h = re.compile("={2,}(?P<heading>[^\[=]*)={2,}")

def wc(filename):
    return int(check_output(["wc", "-l", filename]).split()[0])

def map_main_article(dump, path):
    for page in dump:
        if page.namespace == 0:
            for rev in page:
                links = []
                headings = []
                if rev.text is not None:
                    headings = h.findall(rev.text)
                    words = [w for hd in headings for w in punc_space.findall(hd.lower())] # stemmer.stem(w) will give more hits
                    hit = False
                    if keywords is not None:
                        keywords_match = 0
                        for each in words:
                            if keywords_match > keywords_threshold:
                                hit = True
                                break
                            if each in keywords:
                                keywords_match += 1

                    links = p.findall(rev.text)
                    if len(links) != 0:
                        links = [e.strip() for l in links for e in l.split('|')]

                yield page.id, page.title, links, headings, words, hit

        else:
            yield None, None, None, None, None, None


In [5]:
pbar = tqdm(total=wc(index))

main_articles = []
hits = []
count = 0

for id, title, links, headings, words, hit in mwxml.map(map_main_article, xmlfile, threads=nthreads):
    pbar.update(1)
    
    if id is None:
        continue
    
    if len(links) != 0:
        count += 1
        main_articles.append((id, title, str(links), str(headings)))
    
    if hit:
        hits.append((id, title, str(links), str(headings)))

100%|████████████████████████████████████████████████████████████████████▉| 21895616/21895737 [16:38<00:00, 5274.14it/s]

In [6]:
# put all hits by main-article-trick into dict
pool1 = {}
repeat_pool1 = {}

for each in tqdm(main_articles):
    _, title, links, _ = each
    links = eval(links)
    if title not in pool1:
        pool1[title] = links
    else:
        repeat_pool1[title] = links


  0%|                                                                                        | 0/284117 [00:00<?, ?it/s][A
  4%|██▋                                                                     | 10676/284117 [00:00<00:02, 106754.50it/s][A
  8%|█████▍                                                                   | 21352/284117 [00:00<00:04, 55179.80it/s][A
 12%|████████▍                                                                | 33073/284117 [00:00<00:03, 73962.50it/s][A
 16%|███████████▍                                                             | 44659/284117 [00:00<00:02, 86607.64it/s][A
 20%|██████████████▋                                                          | 56959/284117 [00:00<00:02, 97572.85it/s][A
 25%|█████████████████▊                                                      | 70064/284117 [00:00<00:01, 107645.83it/s][A
 29%|█████████████████████                                                   | 83006/284117 [00:00<00:01, 114191.25it/s][A
 33%|██

In [8]:
# put all hits by keywords into dict
pool2 = {}
repeat_pool2 = {}

for each in tqdm(hits):
    _, title, links, _ = each
    links = eval(links)
    if title not in pool2:
        pool2[title] = links
    else:
        repeat_pool2[title] = links


  0%|                                                                                        | 0/402785 [00:00<?, ?it/s][A
  5%|███▎                                                                    | 18513/402785 [00:00<00:02, 185083.84it/s][A
  9%|██████▊                                                                 | 38153/402785 [00:00<00:01, 191727.19it/s][A
 14%|██████████▏                                                             | 57326/402785 [00:00<00:03, 104457.90it/s][A
 20%|██████████████                                                          | 78766/402785 [00:00<00:02, 133281.88it/s][A
 25%|█████████████████▊                                                     | 100744/402785 [00:00<00:01, 156850.63it/s][A
 31%|█████████████████████▊                                                 | 123441/402785 [00:00<00:01, 176495.69it/s][A
 36%|█████████████████████████▊                                             | 146169/402785 [00:00<00:01, 191011.03it/s][A
 42%|██

In [9]:
len(repeat_pool1)

0

In [31]:
pool1['Frederick the Great']

['First Silesian War',
 'Second Silesian War',
 'Third Silesian War',
 'First Partition of Poland',
 'Prussian Partition',
 'War of the Bavarian Succession']

In [10]:
len(repeat_pool2)

0

In [32]:
pool2['Frederick the Great'] # 关键词筛选还要修一下

KeyError: 'Frederick the Great'

In [42]:
# prune links from pool1: if articles in links also present in pool2, leave it intact
# can have any number of keyword-hit main-article links
pool3 = {}
total = 0
for title, links in tqdm(pool1.items()):
    pruned = []
    for l in links:
        if l in pool2:
            pruned.append(l)
    if len(pruned) > 0:
        total += len(pruned)
        pool3[title] = pruned


  0%|                                                                                        | 0/284117 [00:00<?, ?it/s][A
 24%|████████████████▉                                                       | 66884/284117 [00:00<00:00, 668784.83it/s][A
 54%|██████████████████████████████████████▎                                | 153319/284117 [00:00<00:00, 783803.21it/s][A
100%|███████████████████████████████████████████████████████████████████████| 284117/284117 [00:00<00:00, 817215.24it/s][A


In [43]:
len(pool3)

81823

In [23]:
total

148229

In [55]:
pool3["Julius Caesar"]

['Military campaigns of Julius Caesar',
 'First Triumvirate',
 'Gallic Wars',
 "Caesar's Civil War",
 'Assassination of Julius Caesar',
 'Caesarism']

In [44]:
with open('pages-with-any-main-articles-with-keywords.json', 'w') as f:
    json.dump(pool3, f)

In [45]:
# prune links from pool1: if articles in links are also in pool2, leave it intact
# must have more than 1 keyword-hit main-article links
pool4 = {}
total = 0
for title, links in tqdm(pool1.items()):
    pruned = []
    for l in links:
        if l in pool2:
            pruned.append(l)
    if len(pruned) > 1:
        total += len(pruned)
        pool4[title] = pruned


  0%|                                                                                        | 0/284117 [00:00<?, ?it/s][A
 28%|████████████████████                                                    | 79177/284117 [00:00<00:00, 791734.89it/s][A
 61%|███████████████████████████████████████████▌                           | 174363/284117 [00:00<00:00, 885896.33it/s][A
100%|███████████████████████████████████████████████████████████████████████| 284117/284117 [00:00<00:00, 896698.66it/s][A


In [46]:
with open('pages-with-two-main-articles-with-keywords.json', 'w') as f:
    json.dump(pool4, f)

In [30]:
pool4['Frederick the Great']

['Third Silesian War',
 'First Partition of Poland',
 'War of the Bavarian Succession']

In [47]:
# check article appearance as backlinks
pool5 = {}

for title, links in tqdm(pool3.items()):
    for l in links:
        if l in pool5:
            pool5[l].add(title)
        else:
            pool5[l] = {title}


  0%|                                                                                         | 0/81823 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████████████████████████████████| 81823/81823 [00:00<00:00, 683696.88it/s][A


In [48]:
len(pool5)

70403

In [51]:
more_than_one = 0
pool6 = {}
for title, appear in pool5.items():
    if len(appear) > 1:
        more_than_one += 1
#         print(len(appear), title, appear)
        pool6[title] = list(appear)
print(more_than_one)
len(pool6)

24199


24199

In [53]:
pool6["Bishops' Wars"]

['Anglo-Scottish war (1650–1652)',
 'Scotland in the early modern period',
 'Treaty of Ripon',
 'Battle of Preston (1648)',
 'History of Scotland',
 'Charles I of England',
 'Second English Civil War']

In [52]:
with open('pages-appears-more-than-once.json', 'w') as f:
    json.dump(pool6, f)