# WIKIVITALS Processing

The aim of this notebook is to get all the articles from Wikivitals level 5 using :
- the lists defined in this page () and all the redirections
- a wikidump

In [4]:
import wikipedia
# Note: also install wikipedia_sections (pip install wikipedia_sections)
from bs4 import BeautifulSoup, NavigableString, Tag
import random
from collections import Counter

from bz2 import BZ2File
from lxml import etree as et
import sys
import os
import re
import unidecode
import wikitextparser

In [5]:
extract_from_dump = False

use_vitals_from_web = False

wikivitals_root_page = "Wikipedia:Vital articles/Level/5 - Wikipedia"
wikivitals_root_page_id = 55702953

wikidump_splits_path = "C:/Users/Antoine/Downloads/enwiki-split"

# I used the wikipedia library to find it (the only online call)
page = wikipedia.page(wikivitals_root_page)
print(page.pageid)

22819068


# Quelques utilitaires

In [7]:
# UTILS

# Namespaces defined here: https://en.wikipedia.org/wiki/Wikipedia:Namespace
# 'Image:' added 

namespaces = ['Talk:','User:','User talk:', 'Wikipedia:','Wikipedia talk:',
    'File:','File talk:','MediaWiki:','MediaWiki talk:','Template:','Template talk:',
    'Help:','Help talk:','Category:','Category talk:','Portal:','Portal talk:','Draft:',
    'Draft talk:','TimedText:','TimedText talk:','Module:','Module talk:', 'Image:']
namespaces = namespaces + [i.lower() for i in namespaces]
namespaces = namespaces + [':'+ i for i in namespaces]
# wikiprojects = ['wikt:', 'Wiktionary:', 's:', 'wikisource:', 'w:', 'iarchive:', 'b:']
# wikiprojects = wikiprojects + [i.lower() for i in wikiprojects] 
# others = ['doi:']


def wikilinks_namespace0(wikitext, string_format = True):
    """
    Extract links with namespace 0 in wikitext 

    Input:
    * wikitext: str 

    Output:
    * links: list of str (list of canonical name of pages)
    """
    # Update the prefix used to exclude links that are in another namespaces or links to another wikiprojects
    # tmp = namespaces + wikiprojects + others
    # prefixes = tmp +  [':'+i for i in tmp]
    prefixes = tuple(namespaces)

    links_ = wikitextparser.parse(wikitext).wikilinks
    if string_format:
        links = [s.title.strip() for s in links_ if not s.title.strip().startswith(prefixes)]
        excluded = [s.title.strip() for s in links_ if s.title.strip().startswith(prefixes)]
    else:
        links = [s for s in links_ if not s.title.strip().startswith(prefixes)]
        excluded = [s for s in links_ if s.title.strip().startswith(prefixes)]
    return(links, excluded)

# def wikilinks_images(wikitext, string_format = True):
#     """
#     Extract links with namespace 0 in wikitext 

#     Input:
#     * wikitext: str 

#     Output:
#     * links: list of str (list of canonical name of pages)
#     """
#     links_ = wikitextparser.parse(wikitext).wikilinks
#     if string_format:
#         links = [s.title for s in links_ if s.title.startswith('Image:')]
#     else:
#         links = [s for s in links_ if s.title.startswith('Image:')]
#     return(links)

def normalize_labels(wikitext):
    cleaned_string = wikitextparser.remove_markup(wikitext.title)
    return(cleaned_string.split(' (')[0]) # WHY ?

def get_highest_level_headers(wikitext, highest_level_authorized = 1):
    sections_ = wikitextparser.parse(wikitext).sections

    # Get all headers (all those with level at least equal to h1)
    header_level = [(h, (len(str(h)) - len(str(h).lstrip('=')))) for h in sections_ if not str(h) == '']
    header_level = [(h, length) for h,length in header_level if length >= highest_level_authorized]
    # Get the "highest level" among the headers (h1 > h2 > h3 > h4 > h5 > h6)
    try:
        highest_level = min([i for j,i in header_level])
    except:
        highest_level = 0 #if list empty
    # Keep only the highest level headers 
    # Note that h is wikitext of a Section = its content 
    filtered_headers = [h for h, j in header_level if j==highest_level]
    return filtered_headers

# Extraction of Vital articles

## Vital articles

From Wikipedia:
"Vital articles are lists of subjects for which the English Wikipedia should have corresponding featured-class articles. They serve as centralized watchlists to track the quality status of **Wikipedia's most important articles** and to give editors guidance on which articles to prioritize for improvement."

The Vital articles of level 5 list **about 50,000 articles** (the target value) that have been **manually categorized** by the wikipedia editors. 

A root page contains the links to the different Vital Articles that are in the namespace 'Wikipedia' (number 4). 
- [the namespaces of wikipedia](https://en.wikipedia.org/wiki/Wikipedia:Namespace)
- [the root page](https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5) (be careful, here we use a dump frozen at a certain date to extract the information, this page can be different from the one extracted in this work)

## Extraction

Objectives:
- Collect and store the wiki ids of level 5 Vital articles 
- Extract the wikitext from the Vital level 5 articles and store it

Outputs:
- In the folder './outputs' the file 'wikivitals_pageid.txt' contains the wiki ids and the canonical names of the Vital articles of level 5  
- In the folder './wikivitals-pages-wikitext' are all the wikitext of the Vital articles (32 files + the root page)

In [21]:
import mwxml
import mwtypes
import wikitextparser
# Note: also install wikipedia_sections (pip install wikipedia_sections)


# Main page wikivitals:
wikipedia_root_page_wt = open("./outputs/wikivitals-pages-wikitext/Vital articles-Level-5.wikitxt", 'r', encoding='utf8')
wikitext = wikipedia_root_page_wt.read()
wikipedia_root_page_wt.close()

# Extract links in the page from the wikitext
_, excluded = wikilinks_namespace0(wikitext) # Vital articles are NOT in namespace 0

# All links to Vital articles starts with 'Wikipedia:Vital articles/Level/5/'
# Once find, we store in a file the wiki id of each Vital article for later use
vital_articles_wikiids = []
outfile = open("./outputs/wikivitals_pageid.txt", 'w')
for i in excluded:
    if i.startswith('Wikipedia:Vital articles/Level/5/'):
        print(i)
        wikiid = wikipedia.page(i).pageid
        vital_articles_wikiids.append(int(wikiid))
        outfile.write(f'{wikiid}\t{i}\n')
        
outfile.close()
print('\n')
print(f'{len(vital_articles_wikiids)} Vital articles (level 5) found')
print('Wiki IDs have been stored') 


# Each split of the dump has the same format:
# enwiki-20220401-pages-articles{split_part}.xml-p{start_id}p{end_id}
# where {start_id} and {end_id} # define the interval of wiki id in this 
# split of the dump.
# 
# In order not to explore the whole dump to extract the wikitext content 
# of the Vital articles, we will identify from the ids of these pages the parts 
# of the split to browse
dump_files = os.listdir(wikidump_splits_path)
intervals = [s[:-4].split('.xml-p')[1].split('p') for s in dump_files]
intervals = [(int(i), int(j)) for i,j in intervals]
mask = [False]*len(intervals)
for wikiid in vital_articles_wikiids:
    for k in range(len(intervals)):
        if (intervals[k][0] <= wikiid <= intervals[k][1]):
            mask[k] = True
# files_selected = list of dump parts to explore
files_selected = [wikidump_splits_path + '/' + dump_files[k] for k in range(len(dump_files)) if mask[k]] 



def page_info(dump, path):
    """
    Read the dump page per page
    (can be paralellized a priori - can't make it work)

    Input: 
    * dump: list of Wikidumps (with format .xml.bz2)
    * path: str - path to Wikidump    
    """
    for page in dump:
        tstamp = mwtypes.Timestamp(0)
        if page.id in page_ids:
            for revision in page:
                if revision.timestamp > tstamp:
                    last_revision = revision
                    tstamp = revision.timestamp
                text = last_revision.text
            yield page.id, page.title, page.redirect, text


# Reads each part of the dump that has been identified as containing the 
# wikitext of at least one Vital article in order to extract these wikitexts 
# and save them in the './outputs/wikivitals-pages-wikitext/' folder
for file in files_selected:
    for id, title, redirect, text in mwxml.map(page_info, [file]):
        
        title_ = title.replace('/', '-')
        filename = "./outputs/wikivitals-pages-wikitext/" + title_ + ".wikitxt"
        f = open(filename, "w", encoding = 'utf8')
        f.write(text)
        f.close()

Wikipedia:Vital articles/Level/5/People/Writers and journalists
Wikipedia:Vital articles/Level/5/People/Artists, musicians, and composers
Wikipedia:Vital articles/Level/5/People/Entertainers, directors, producers, and screenwriters
Wikipedia:Vital articles/Level/5/People/Philosophers, historians, political and social scientists
Wikipedia:Vital articles/Level/5/People/Religious figures
Wikipedia:Vital articles/Level/5/People/Politicians and leaders
Wikipedia:Vital articles/Level/5/People/Military personnel, revolutionaries, and activists
Wikipedia:Vital articles/Level/5/People/Scientists, inventors, and mathematicians
Wikipedia:Vital articles/Level/5/People/Sports figures
Wikipedia:Vital articles/Level/5/People/Miscellaneous
Wikipedia:Vital articles/Level/5/History
Wikipedia:Vital articles/Level/5/Geography/Physical
Wikipedia:Vital articles/Level/5/Geography/Countries
Wikipedia:Vital articles/Level/5/Geography/Cities
Wikipedia:Vital articles/Level/5/Arts
Wikipedia:Vital articles/Level/5

## Extract the classes (level 0 and 1)

All links have the same format Wikipedia:Vital articles/Level/5/' + class0 + '/' + class1

Ex: 'Wikipedia:Vital articles/Level/5/People/Entertainers, directors, producers, and screenwriters'
We want to extract {'class0': 'People', 'class1':'Entertainers, directors, producers, and screenwriters'}

In [5]:
# Extract the classes (level 0 and 1)

start_str = 'Wikipedia:Vital articles/Level/5/'

lclasses = dict()
for link in links_str:
    classes = link.strip()[len(start_str):].split('/')
    if len(classes) == 1:
        classes.append(classes[0])
    lclasses[link] = {
        'class0': classes[0], 
        'class1': classes[1]
        }

# Display an example
print(f'{list(lclasses.keys())[0]}:{lclasses[list(lclasses.keys())[0]]}')




Wikipedia:Vital articles/Level/5/People/Artists, musicians, and composers:{'class0': 'People', 'class1': 'Artists, musicians, and composers'}


Extract the Wikivitals pages from the dump

In [25]:
# import mwxml
# import mwtypes
# import wikitextparser
# # Note: also install wikipedia_sections (pip install wikipedia_sections)


# import re

# # # Internal path to the Wikipedia dump 
# # path  = "C:/Users/Antoine/Downloads/" + "enwiki-20220401-pages-articles.xml.bz2"
# # # path =  "C:/Users/Antoine/Downloads/" + "enwiki-20220401-pages-articles24.xml-p55064554p56564553.bz2"


# page_ids = list(pageid_wikivitals.keys())


# # # Matches all h1 titles (h-infinite in reality but no matter)
# # # Title h4 in Wikitext: ==== a h4 title ====
# # h_2_5_title = re.compile(r'\n(===*([^=].+?)=*[==])') # group 1: the header with marks, group 2 the header content


# # # LINKS

# # # Matches links with no nested link
# # # [[a link]] => IT'S A MATCH!
# # # [[This is [[a link]] in a link]] => SORRY, NO MATCH!
# # # Basic structure of a link in wikitext : [[canonical name | name used in context]]
# # internal_link_1_level = re.compile(r"\[\[(([^\[\]|#]*)[#]*[^\[\]|]*[|]*?[^\[\]|]*?)\]\]") # group 1: the content of the link, group 2: the canonical name
# # # Matches 'File' links (without any nested link)
# # file_link_1_level = re.compile(r"\[\[File:[^\[]*?\]\]") 
# # # Matches 'Category' links (without any nested link)
# # category_link_1_level = re.compile(r"\[\[Category:([^\[]*?)\]\]") # group 1: the category name
# # # Matches 'Wikipedia' links (without any nested link) 
# # # Wikipedia links are NOT links to other articles we want to keep
# # wikipedia_link_1_level = re.compile(r"\[\[Wikipedia:[^\[]*?\]\]")
# # # Matches 'Image' links (without any nested link)
# # image_link_1_level = re.compile(r"\[\[Image:[^\[]*?\]\]")


# # def find_internal_links(T):
# #     """
# #     Extract and replace / remove links
# #     Image, Wikipedia (=/= links to other wikipedia pages), and File links are removed
# #     Category links are replaced by their value
# #     Links to other wikipedia articles are replaced by the canonical name of the page

# #     Input: 
# #     * T: a text in wikitext

# #     Output:
# #     * (L, T_): 
# #         L: List of canonical names of pages
# #         T_: Text with links removed or replaced    
# #     """
# #     tmp = T
# #     links = []
# #     # We allow a depth of search for nesting of 10
# #     for i in range(5):
# #         # replace category by the category name
# #         tmp = category_link_1_level.sub(r'\1', tmp)
# #         # remove files
# #         tmp = file_link_1_level.sub(r' ', tmp)
# #         # remove wikipedia links
# #         tmp = wikipedia_link_1_level.sub(r' ', tmp)
# #         # remove image links
# #         tmp = image_link_1_level.sub(r' ', tmp)
# #         # Find all remaining links
# #         links += internal_link_1_level.findall(tmp)
# #         # replace links without nested links in the text by their content
# #         tmp = internal_link_1_level.sub((r'\2'), tmp)
# #     return [l for l, _ in links], tmp




# files = [wikidump_splits_path + '/' + name for name in list(file_2_pages.keys()) ]

# def page_info(dump, path):
#     """
#     Read the dump page per page
#     (can be paralellized a priori - can't make it work)

#     Input: 
#     * dump: list of Wikidumps (with format .xml.bz2)
#     * path: (unused) str - path to Wikidump    
#     """
#     cnt = 0
#     for page in dump:
#         cnt +=1
#         if cnt%10000 == 0:
#             print(cnt)
#         tstamp = mwtypes.Timestamp(0)
#         if page.id in page_ids:
#             for revision in page:
#                 if revision.timestamp > tstamp:
#                     last_revision = revision
#                     tstamp = revision.timestamp
#                 text = last_revision.text
#             yield page.id, page.title, page.redirect, text


# for file in files:
#     for id, title, redirect, text in mwxml.map(page_info, [file]):
        
#         title_ = title.replace('/', '-')
#         filename = "./outputs/wikivitals-pages-wikitext/" + title_ + ".wikitxt"
#         f = open(filename, "w", encoding = 'utf8')
#         f.write(text)
#         # print(text, file=f)
#         f.close()



# Identification of the vital articles

In [96]:
import wikitextparser
import unidecode
import re

count_headers = 0


# Will be used to store the class hierarchy
labels_hierarchy = {'num_articles_expected': 0, 'num_articles': 0}
# Will be used to store the pages canonical names and their labels
articles_classification = dict()


dir = './outputs/wikivitals-pages-wikitext/'
files = os.listdir(dir)
for file in files: # Read each Vital article one by one
    f = open(dir + file, 'r', encoding='utf8')
    wikitext = f.read()
    f.close()

    # Get the "highest level" among the headers (h1 > h2 > h3 > h4 > h5 > h6)
    filtered_headers = get_highest_level_headers(wikitext)

    count_headers += len(filtered_headers)
    

    # Extract class 0 and 1 from file name
    # class 2 elements are highest level header titles
    # and add them to class hierarchy
    filename_split = file[:-8].split('-')
    class0 = filename_split[3]
    if len(filename_split) == 5:
        class1 = filename_split[4]
    else:
        class1 = class0
    classes2 = [normalize_labels(s) for s in filtered_headers]
    if not class0 in labels_hierarchy.keys():
        labels_hierarchy[class0] = {class1: {'num_articles_expected': 0, 'num_articles': 0}, 'num_articles_expected': 0, 'num_articles': 0}
    else:
        labels_hierarchy[class0][class1] = {'num_articles_expected': 0, 'num_articles': 0}
    for class2 in classes2:
        labels_hierarchy[class0][class1][class2] = {'num_articles_expected': 0, 'num_articles': 0}
    

    headers_n_links = [ (normalize_labels(s), wikilinks_namespace0(str(s))[0]) for s in filtered_headers]
    for class2, links in headers_n_links:
        # Update the expected count of articles in the label hierarchy
        labels_hierarchy[class0][class1][class2]['num_articles_expected'] = len(links)
        labels_hierarchy[class0][class1]['num_articles_expected'] += len(links)
        labels_hierarchy[class0]['num_articles_expected'] += len(links)
        labels_hierarchy['num_articles_expected'] += len(links)

        for link in links:
            articles_classification[link] = {
                'class0': class0,
                'class1': class1,
                'class2': class2
            }

print('\n')
print(f'Number of articles in hierarchy: {labels_hierarchy["num_articles_expected"]}')
print(f'Number of articles recovered: {len(list(articles_classification.keys()))}')
print(f'Number of duplicates in the hierarchy: {labels_hierarchy["num_articles_expected"] - len(list(articles_classification.keys()))}')  


# store classes hierarchy
outfile_classification = "./outputs/classification_unfiltered.txt"
f = open(outfile_classification, 'w', encoding='utf8')
for k, v in articles_classification.items():
    f.write(f'{k}\t{v["class0"]}\t{v["class1"]}\t{v["class2"]}\n')
f.close()








Number of articles in hierarchy: 48649
Number of articles recovered: 48596
Number of duplicates in the hierarchy: 53


## Filtering the list of articles found

We'll use a dump to do this


Here we extract the full wikitext of the articles

In [99]:
import mwxml
import mwtypes
import wikitextparser
# Note: also install wikipedia_sections (pip install wikipedia_sections)

import re

# Internal path to the Wikipedia dump split
wikidump_splits_path = "C:/Users/Antoine/Downloads/enwiki-split"
dump_files = os.listdir(wikidump_splits_path)
files = [wikidump_splits_path + '/' + f for f in dump_files]


# Set of all article canonical titles
article_titles = set(list(articles_classification.keys()))



def page_info(dump, path):
    """
    Read the dump page per page. Yield infos (id, title, redirect, and text)
    only for pages in the set of wikivitals articles (global var)
    (can be paralellized, can't make it work)

    Input: 
    * dump: list of Wikidumps (with format .xml.bz2)
    * path: str - path to Wikidump split   
    """
    for page in dump:
        tstamp = mwtypes.Timestamp(0)
        if page.title in article_titles and page.redirect == None and page.namespace == 0:
            for revision in page:
                if revision.timestamp > tstamp:
                    last_revision = revision
                    tstamp = revision.timestamp
                text = last_revision.text
            yield page.id, page.title, page.redirect, text


# If some articles are redirections, we'll exclude them from our set of articles
redirections = []
articles_content = dict()
counter = 0
articles_found = []
articles_wikitexts = []
outdir_wikitexts = './outputs/wikivitals_raw_wikitexts/'
wikiid_title_file = './outputs/wikiid_title.txt'
g = open(wikiid_title_file, 'w', encoding='utf8')

for file in files:
    print(file)
    for id, title, redirect, wikitext in mwxml.map(page_info, [file]):




        # Exclude redirections 
        if not redirect == None:
            redirections.append(title)
        else:
    
            # if (not title in articles_content.keys()) or (articles_content[title]['clean abstract'] == None):
            articles_content[title] = {
                'id': id,
                'title': title,
                'class0': articles_classification[title]['class0'],
                'class1': articles_classification[title]['class1'],
                'class2': articles_classification[title]['class2']
            }
            articles_found.append(title)
            # articles_wikitexts.append(wikitext)
            counter +=1
            if counter%1000 == 0:
                print(f'{counter} articles found')
            filename = outdir_wikitexts + f'{id}.wt' 
            f = open(filename, 'w', encoding='utf8')
            f.write(wikitext)
            f.close()
            g.write(f'{id}\t{title}\n')
g.close()

print(len(list(articles_content.keys())))

C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles1.xml-p1p41242.bz2
1000 articles found
2000 articles found
3000 articles found
4000 articles found
5000 articles found
6000 articles found
7000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles10.xml-p4045403p5399366.bz2
8000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles11.xml-p5399367p6899366.bz2
9000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles11.xml-p6899367p7054859.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles12.xml-p7054860p8554859.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles12.xml-p8554860p9172788.bz2
10000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles13.xml-p10672789p11659682.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles13.xml-p9172789p10672788.bz2
C:/Users/Antoine/Down

Here we extract abstracts, headers, etc.

In [8]:
import mwxml
import mwtypes
import wikitextparser
import mwparserfromhell
from bs4 import BeautifulSoup
# Note: also install wikipedia_sections (pip install wikipedia_sections)

outdir_wikitexts = './outputs/wikivitals_raw_wikitexts/'
# outdir_wikitexts = './outputs/wikivitals_manual_changes/'
wikitext_files = os.listdir(outdir_wikitexts)
files = [outdir_wikitexts + f for f in wikitext_files]
outfile_headers = open('./outputs/__headers.txt', 'w', encoding='utf8')
outfile_links = open('./outputs/__links.txt', 'w',encoding='utf8')
# outfile_abstracts = open('./outputs/__abstracts.txt', 'w', encoding='utf8')

raw_abstract_kept = [] # store the list of articles the complete wikitext has been kept as an abstract

# # adjustment1 = re.compile(r'thumb\|.*?\n')
# adjustment2 = re.compile(r'\{\|.*\|\}', re.DOTALL)
# adjustment3 = re.compile(r'\[\[.*\]\]', re.DOTALL)
# references = re.compile(r'<ref.*?</ref>', re.DOTALL)
# # adjustment4 = re.compile(r'^.*\|.*?\n', re.DOTALL)

# namespaces = ['Talk:','User:','User talk:', 'Wikipedia:','Wikipedia talk:',
#     'File:','File talk:','MediaWiki:','MediaWiki talk:','Template:','Template talk:',
#     'Help:','Help talk:','Category:','Category talk:','Portal:','Portal talk:','Draft:',
#     'Draft talk:','TimedText:','TimedText talk:','Module:','Module talk:', 'Image:']
# namespaces = namespaces + [i.lower() for i in namespaces]
# wikiprojects = ['wikt:', 'Wiktionary:', 's:', 'wikisource:', 'w:', 'iarchive:', 'b:']
# wikiprojects = wikiprojects + [i.lower() for i in wikiprojects]
# languages = ['de:', 'fr:', 'nl:', 'it:', 'da:', 'ja:', 'pl:'] 
# others = ['doi:']
# tmp = namespaces + wikiprojects + languages + others 
# prefixes = tmp + [':' + i for i in tmp]



# def remove_first_root_templates(wikicode):
#     new_wikicode = wikicode.strip()
#     if not new_wikicode.startswith('{'):
#         return new_wikicode
#     if new_wikicode.startswith('{{'):
#         s_start, s_end = '{{', '}}'
#     if node_str.startswith('{|class="wikitable"'):
#         s_start, s_end = '{|class="wikitable"', '|}'
#     length = len(new_wikicode)
#     index = 0
#     counter = 0
#     while index < length:

# def replace_foreign_language(soup):
#     # See the templates here: https://en.wikipedia.org/wiki/Template:Nihongo
#     # {{Nihongo|<english>|<kanji/kana>|<rōmaji>|lead=yes|extra=<extra>|extra2=<extra2>}}
#     psoup = soup
#     tagid = 'xmltemplate'
#     # get the list of all tags ordered by "length"
#     # links are ordered in order to replace first the link inside links
#     # and don't miss one in the list when files are removed
#     ordered_tags = psoup.find_all(tagid)
#     ordered_tags = sorted(ordered_tags, key=lambda x: len(str(x)))

#     for l in ordered_tags:
#         # Japanese words
#         if l.text.startswith(('Nihongo|', 'nihongo|')):
#             try:
#                 tmp = l.text.split('|')
#                 romaji = tmp[1]
#                 if len(tmp) > 3:
#                     tmp = tmp[4:]
#                     tmp = [t.split('=')[-1] for t in tmp]
#                     tmp = [romaji] + tmp
#                 else:
#                     tmp = [romaji]
#                 l.replace_with(' '.join(tmp))
#             except:
#                 l.replace_with(' ')
#         elif l.text.startswith('lang|'):
#             tmp = l.text.split('|')
#             if len(tmp) >= 3:
#                 replacement_text = tmp[2]
#                 # print(f'wtext {l.text} replaced by {replacement_text}')
#                 l.replace_with(replacement_text)
                
#         elif l.text.startswith('lang-'):
#             tmp = l.text.split('|')
#             if len(tmp) >= 2:
#                 replacement_text = tmp[1]
#                 # print(f'wtext {l.text} replaced by {replacement_text}')
#                 l.replace_with(replacement_text)
#         else:
#             None
        
#     return psoup




def removeHTMLTaggedContent(soup, listOfTags = []):
    psoup = soup
    for tagid in listOfTags:
        # get the list of all tags ordered by "length"
        # links are ordered in order to replace first the link inside links
        # and don't miss one in the list when files are removed
        ordered_tags = psoup.find_all(tagid)
        ordered_tags = sorted(ordered_tags, key=lambda x: len(str(x)))

        for l in ordered_tags:
            l.decompose()
    return psoup
        
def updateInternalLinks(soup, use_replacement_text =  True):
    psoup = soup
    # get the list of all internal links ordered by "length"
    # links are ordered in order to replace first the link inside links
    # and don't miss one in the list when files are removed
    count_links_replaced, count_links_removed = 0, 0
    ordered_links = psoup.find_all('internallink')
    ordered_links = sorted(ordered_links, key=lambda x: len(str(x)))
    #Internal links follow this template:
    #<internallink> canonical page name | page name in context </internallink>
    #some links are prefixed with "category:", such prefixes are removed
    for l in ordered_links:
        if l.text.startswith(tuple(namespaces)):
            l.decompose()
            count_links_removed
        else:
            if use_replacement_text:
                canonical_link = l.text.split('|')[-1]
                # print(f'wtext {l.text} replaced by {canonical_link}')
            else:
                canonical_link = l.text.split('|')[0]
                # replace the tag by the canonical form or
                # remove the tag if it refers to a page in another namespace, 
                # another wiki project, or another language
                # if not canonical_link.startswith(tuple(prefixes)):
            l.replace_with(canonical_link) 
            count_links_replaced +=1

    return psoup, count_links_replaced, count_links_removed


# {{short description|Equation of statistical mechanics}}
# {{other uses|Boltzmann's entropy formula|Stefan–Boltzmann law|Maxwell–Boltzmann distribution}}
# {{redirect|BTE}}

# [[File:StairsOfReduction.svg|thumb|The place of the Boltzmann kinetic equation on the stairs of model reduction from microscopic dynamics to macroscopic continuum dynamics (illustration to the content of the book<ref>
# {{cite book |last1=Gorban |first1= Alexander N.|last2= Karlin |first2= Ilya V. |date=2005 |title= Invariant Manifolds for Physical and Chemical Kinetics|url= https://www.academia.edu/17378865| location= Berlin, Heidelberg |publisher= Springer|series= Lecture Notes in Physics (LNP, vol. 660)| isbn= 978-3-540-22684-0|doi= 10.1007/b98103}} [https://archive.org/details/gorban-karlin-lnp-2005 Alt URL]</ref>)]]


def process_abstract(wikitext, first_try = True):
    ptext = wikitext


    # see = False
    # if ptext.startswith('{{short description|Probability distribution and special case of gamma distribution}}'):
    #     see = True



    # removal of math equations before the replacement of 
    # {{ and }} tags (these tags can be found inside some
    # equations and introduce exceptions)
    ptext = ptext.replace('/>', ' />')
    psoup = BeautifulSoup(ptext, 'html.parser')
    psoup = removeHTMLTaggedContent(psoup,['math'])
    ptext = str(psoup)

    # if see:
    #     print(ptext)
    #replacement of some tags by HTML-like tags
    #such tags are handled like any other tags by Beautiful soup
    # starbox_begin = re.search('{{Starbox begin.*?}}', ptext)
    # starbox_end = re.search('{{Starbox end.*?}}', ptext)
    # if not starbox_begin == None and not starbox_end == None:
    #     ptext = ptext.replace(starbox_begin.group(0), '<xmltemplate>')
    #     ptext = ptext.replace(starbox_end.group(0), '</xmltemplate>')
    #     print("starbox detected")
    
    # #### VERY SPECIFIC TO WIKIPEDIA (MANUAL CHECKING)
    # #### IN SOME PLACES, THERE ARE LONELY '}}' SUBSTRINGS. I REMOVE THEM 
    # ptext = ptext.replace('(enter DEATH date then BIRTH date (e.g., ...|1908|31|8|1967|28|2}}', ' ') # for many articles (an example)
    # ptext = ptext.replace('<ref name="sd.news.cn"> url=http://sd.news.cn/news/2022-01/20/c_1128280781.htm}}</ref>', ' ') # for article 105032
    # # <ref name="jswx.gov.cn"> url=https://www.jswx.gov.cn/chuanbo/ping/202201/t20220120_2933372.shtml}}</ref>
    
    # First, let's remove notes and refs BEFORE replacing {{ and }} substrings
    # (in some notes or references, there are }} without any corresponding {{ before)
    ptext = ptext.replace('<!--', '<note>')
    ptext = ptext.replace('-->', '</note>')
    psoup = BeautifulSoup(ptext, 'html.parser')
    psoup = removeHTMLTaggedContent(
        psoup,
        [
        'ref',
        'note',
        'gallery', # galleries of images
        'imagemap'
        ])
    ptext = str(psoup)
    # Note: there are some articles with errors in the wikitext. 
    # For example, article 11049 (FIFA) has a note without ending (<!-- but no -->)
    # These cases have been handled MANUALLY

    # if see:
    #     print(ptext)

    # Second, we handle the templates
    ptext = ptext.replace('{{', '<xmltemplate>')
    ptext = ptext.replace('}}', '</xmltemplate>')
    ptext = ptext.replace('{|', '<xmltemplate>')
    ptext = ptext.replace('|}', '</xmltemplate>')

    ptext = ptext.replace('[[', '<internallink>')
    ptext = ptext.replace(']]', '</internallink>')
    # update_h_tags()


    # soup = BeautifulSoup(ptext, 'html.parser')
    psoup = BeautifulSoup(ptext, 'html.parser')
    # psoup = replace_foreign_language(psoup)
    #From here, only pbsoup is updated
    psoup, _, _ = updateInternalLinks(psoup)
    # print(self.ptext[:5000])
    # print("------------------------")
    # print("------------------------")
    # print("------------------------")
    # print("------------------------")
    # print(self.psoup)
    psoup = removeHTMLTaggedContent(
        psoup,
        [
        'xmltemplate'
        ])

    abstract = psoup.get_text()
    #let's remove URLs
    abstract = re.sub('https{,1}://.*? ', ' ', abstract, flags=re.DOTALL)
    #let's remove all characters except letters, digits, and whitespaces
    abstract = re.sub(r"[^a-zA-Z0-9\u00C0-\u00FF.\s]"," ",abstract) # I keep numbers, letters (accentuated letters included)
    abstract = ' '.join(abstract.split())
    abstract = abstract.strip().replace('\n', ' ').replace('\t', ' ')

    # IF ABSTRACT IS EMPTY, TRY TO SPLIT THE WIKITEXT IN 2
    # OR APPLY A SPECIFIC TREATMENT
    if abstract.strip() == '' and first_try:
        # 1st step: specific treatment for some articles
        # wikitext_mod = wikitext
        # wikitext_mod = wikitext_mod.replace()

        match = re.compile(r"\n\n[^|]*?'''", re.DOTALL)
        a = re.search(match, wikitext)
        if not a == None:
            a = a.group(0)
            wikitext_cut = wikitext.split(a, 1)[-1]
            # parts = [process_abstract(w, False) for w in wikitext_cut]
            # abstract = ' '.join(parts)
            # abstract = wikitextparser.parse(wikitext).plain_text()
            abstract = process_abstract(wikitext_cut, first_try = False)
            print(abstract)

    return abstract
        

counter = 0
for file in files:
    id = file.split('/')[-1].split('.')[0]

    f = open(file, 'r', encoding='utf8')
    wikitext = f.read()
    f.close()

    # Find highest level headers (h2 to h5)
    headers = get_highest_level_headers(wikitext, 2)
    headers_str = [wikitextparser.remove_markup(h.title) for h in headers]
    outfile_headers.write(f'{id}\t{" §§ ".join(headers_str)}\n')

    # Find the internal links
    links, _ = wikilinks_namespace0(wikitext)
    links = list(set(links))
    links = [l.strip() for l in links if not l == '']
    outfile_links.write(f'{id}\t{" §§ ".join(links)}\n')
    

    # # SLOW WAY TO EXTRACT THE HEADING PART
    # wikicode = mwparserfromhell.parse(wikitext)
    # raw_abstract = str(wikicode.get_sections(include_lead=True, include_headings=False)[0])
    # # print(raw_abstract)


    # # Abstract extraction
    # try:
    #     # # Split text at the first header
    #     # size = max(100, len(str(headers[0])))
    #     # first_header = str(headers[0])[:size]
    #     # raw_abstract =  wikitext.split(first_header)[0]
    #     first_header = str(headers[0])
    #     raw_abstract =  wikitext.split(first_header)[0]
    
    # except:
    #     # Keep the raw text if no header
    #     raw_abstract = wikitext
    #     raw_abstract_kept.append(id)
    #     print(id)

    # clean_abstract = process_abstract(raw_abstract)
    # outfile_abstracts.write(f'{id}\t{clean_abstract}\n')

    counter += 1
    if counter%1000 == 0:
        print(f"{counter} articles treated")

outfile_headers.close()
outfile_links.close()
# outfile_abstracts.close()


1000 articles treated
2000 articles treated
3000 articles treated
4000 articles treated
5000 articles treated
6000 articles treated
7000 articles treated
8000 articles treated
9000 articles treated
10000 articles treated
11000 articles treated
12000 articles treated
13000 articles treated
14000 articles treated
15000 articles treated
16000 articles treated
17000 articles treated
18000 articles treated
19000 articles treated
20000 articles treated
21000 articles treated
22000 articles treated
23000 articles treated
24000 articles treated
25000 articles treated
26000 articles treated
27000 articles treated
28000 articles treated
29000 articles treated
30000 articles treated
31000 articles treated
32000 articles treated
33000 articles treated
34000 articles treated
35000 articles treated
36000 articles treated
37000 articles treated
38000 articles treated
39000 articles treated
40000 articles treated
41000 articles treated
42000 articles treated
43000 articles treated
44000 articles treat

Here we'll filter articles based on abstracts and more


Output exceeds the size limit. Open the full output data in a text editor
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles1.xml-p1p41242.bz2
1000 articles found
2000 articles found
3000 articles found
4000 articles found
5000 articles found
6000 articles found
7000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles10.xml-p4045403p5399366.bz2
8000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles11.xml-p5399367p6899366.bz2
9000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles11.xml-p6899367p7054859.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles12.xml-p7054860p8554859.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles12.xml-p8554860p9172788.bz2
10000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles13.xml-p10672789p11659682.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles13.xml-p9172789p10672788.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles14.xml-p11659683p13159682.bz2
11000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles14.xml-p13159683p14324602.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles15.xml-p14324603p15824602.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles15.xml-p15824603p17324602.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles15.xml-p17324603p17460152.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles16.xml-p17460153p18960152.bz2
...
47000 articles found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles9.xml-p2936261p4045402.bz2
48000 articles found
48512


Number of articles in hierarchy: 48649
Number of articles recovered: 48596
Number of duplicates in the hierarchy: 53













Output exceeds the size limit. Open the full output data in a text editor
10630377
1000 articles treated
FIFA''' ( ; ; Spanish: ''Federación Internacional de Fútbol Asociación''; German: ''Internationaler Verband des Association-Fußball''; Russian: ''Международная федерация футбола''; Arabic: ''الاتحاد الدولي لكرة القدم'') is a non-profit organization that describes itself as an international governing body of association football, futsal and beach soccer. It is the highest governing body of association football. FIFA was founded in 1904 to oversee international competition among the national associations of Belgium, Denmark, France, Germany, the Netherlands, Spain, Sweden and Switzerland. Headquartered in Zürich, Switzerland, its membership now comprises 211 national associations; Russia was suspended in 2022. These national associations must each also be members of one of the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America and the Caribbean, Oceania and South America. Today, FIFA outlines a number of objectives in the organizational Statutes, including growing association football internationally, providing efforts to ensure it is accessible to everyone, and advocating for integrity and fair play. FIFA is responsible for the organization and promotion of association football's major international tournaments, notably the World Cup which commenced in 1930 and the Women's World Cup which commenced in 1991. Although FIFA does not solely set the laws of the game, that being the responsibility of the International Football Association Board of which FIFA is a member, it applies and enforces the rules across all FIFA competitions. All FIFA tournaments generate revenue from sponsorship; in 2018, FIFA had revenues of over US $4.6 billion, ending the 2015–2018 cycle with a net positive of US$1.2 billion, and had cash reserves of over US$2.7 billion. Reports by investigative journalists have linked FIFA leadership with corruption, bribery, and vote-rigging related to the election of FIFA president Sepp Blatter and the organization's decision to award the 2018 and 2022 World Cups to Russia and Qatar, respectively. These allegations led to the indictments of nine high-ranking FIFA officials and five corporate executives by the U.S. Department of Justice on charges including racketeering, wire fraud, and money laundering. On 27 May 2015, several of these officials were arrested by Swiss authorities, who were launching a simultaneous but separate criminal investigation into how the organization awarded the 2018 and 2022 World Cups. Those among these officials who were also indicted in the U.S. are expected to be extradited to face charges there as well. Many officials were suspended by FIFA's ethics committee including Sepp Blatter and Michel Platini. In early 2017, reports became public about FIFA president Gianni Infantino attempting to prevent the re-elections of both chairmen of the ethics committee, Cornel Borbély and Hans-Joachim Eckert, during the FIFA congress in May 2017. On 9 May 2017, following Infantino's proposal, FIFA Council decided not to renew the mandates of Borbély and Eckert. Together with the chairmen, 11 of 13 committee members were removed.
chi-squared distribution''' (also '''chi-square''' or ) with degrees of freedom is the distribution of a sum of the squares of independent standard normal random variables. The chi-squared distribution is a special case of the gamma distribution and is one of the most widely used probability distributions in inferential statistics, notably in hypothesis testing and in construction of confidence intervals. This distribution is sometimes called the '''central chi-squared distribution''', a special case of the more general noncentral chi-squared distribution. The chi-squared distribution is used in the common chi-squared tests for goodness of fit of an observed distribution to a theoretical one, the independence of two criteria of classification of qualitative data, and in confidence interval estimation for a population standard deviation of a normal distribution from a sample standard deviation. Many other statistical tests also use this distribution, such as Friedman's analysis of variance by ranks.
2000 articles treated
3000 articles treated
4000 articles treated
5000 articles treated
Abū Muḥammad ʿAlī ibn Aḥmad ibn Saʿīd ibn Ḥazm''' (; also sometimes known as al-Andalusī aẓ-Ẓāhirī; 7 November 994 – 15 August 1064 [456 AH]) was an Andalusian Muslim polymath, historian, jurist, philosopher, and theologian, born in the Caliphate of Córdoba, present-day Spain. Described as one of the strictest hadith interpreters, Ibn Hazm was a leading proponent and codifier of the Zahiri school of Islamic thought and produced a reported 400 works, of which only 40 still survive. In all, his written works amounted to some 80 000 pages. Described as one of the fathers of comparative religion, the ''Encyclopaedia of Islam'' refers to him as having been one of the leading thinkers of the Muslim world.
6000 articles treated
7000 articles treated
8000 articles treated
Alopecia areata''', also known as '''spot baldness''', is a condition in which hair is lost from some or all areas of the body. Often, it results in a few bald spots on the scalp, each about the size of a coin. Psychological stress and illness are possible factors in bringing on alopecia areata in individuals at risk, but in most cases there is no obvious trigger. People are generally otherwise healthy. In a few cases, all the hair on the scalp is lost (''alopecia totalis''), or all body hair is lost (''alopecia universalis''), and loss can be permanent. It is distinctive from pattern hair loss, which is common among males. Alopecia areata is believed to be an autoimmune disease resulting from a breach in the immune privilege of the hair follicles. Risk factors include a family history of the condition. Among identical twins, if one is affected, the other has about a 50% chance of also being affected. The underlying mechanism involves failure by the body to recognize its own cells, with subsequent immune-mediated destruction of the hair follicle. No cure for the condition is known. Efforts may be used to try to speed hair regrowth, such as cortisone injections. Sunscreen, head coverings to protect from cold and sun, and glasses, if the eyelashes are missing, are recommended. In some cases, the hair regrows, and the condition does not reoccur. In others, hair loss and regrowth occurs over years. Among those in whom all body hair is lost, fewer than 10% recover. About 0.15% of people are affected at any one time, and 2% of people are affected at some point in time. Onset is usually in childhood. Males and females have the condition in equal numbers. The condition does not affect a person's life expectancy.
9000 articles treated
1714180
10000 articles treated
11000 articles treated
12000 articles treated
197245
13000 articles treated
2011918
20412550
20412640
14000 articles treated
2053318
...
46000 articles treated
Water vapor''', '''water vapour''' or '''aqueous vapor''' is the gaseous phase of water. It is one state of water within the hydrosphere. Water vapor can be produced from the evaporation or boiling of liquid water or from the sublimation of ice. Water vapor is transparent, like most constituents of the atmosphere. Under typical atmospheric conditions, water vapor is continuously generated by evaporation and removed by condensation. It is less dense than most of the other constituents of air and triggers convection currents that can lead to clouds. Being a component of Earth's hydrosphere and hydrologic cycle, it is particularly abundant in Earth's atmosphere, where it acts as a greenhouse gas and warming feedback, contributing more to total greenhouse effect than non-condensable gases such as carbon dioxide and methane. Use of water vapor, as steam, has been important for cooking, and as a major component in energy production and transport systems since the industrial revolution. Water vapor is a relatively common atmospheric constituent, present even in the solar atmosphere as well as every planet in the Solar System and many astronomical objects including natural satellites, comets and even large asteroids. Likewise the detection of extrasolar water vapor would indicate a similar distribution in other planetary systems. Water vapor is significant in that it can be indirect evidence supporting the presence of extraterrestrial liquid water in the case of some planetary mass objects.
47000 articles treated
48000 articles treated
48512 articles
0 articles with the same name
Abstracts:
* Average abstract size: 1365.9486518799472 (1054.3838368074944)
* 12 empty abstracts
* 26 (0.05%) short abstracts (less than 50 characters, can be considered empty):
  ['Crate', 'Chi-squared distribution', 'Sono Osato', 'Lois Mailou Jones', 'Timeline of zoology', 'Alopecia areata', 'History of Hesse', 'Ahmed Hassan al-Bakr', 'Kim Young-ha', 'Law of Austria', 'Arthropathy', 'History of Saint Vincent and the Grenadines', 'History of the jet engine', 'Kim Dong-in', 'Lists of legislation', 'History of Vanuatu', 'West Bengal', 'Road junction', 'Timeline of railway history', 'History of Benin', 'Belgrade', 'Timeline of geology', 'Law of Denmark', 'Districts of Suriname', 'Planter', 'Water vapor']

Headers:
* 20 articles with no header found:
  ['Stratigraphic section', 'Controller (computing)', 'South East Point', 'Minoo Island', 'Elbazduko Britayev', 'Seka Gadiyev', 'Robert Guérin', 'Glass lizard', 'Authentication (law)', 'Muhammad Imaaduddeen IV', 'Recrystallization (geology)', 'Tayabas Isthmus', 'Trade journalism', 'Distribution function (physics)', 'Chemical law', 'Climate change and agriculture', 'Mark (unit)', 'North–South divide', 'Hum Log', 'Egyptian law']

There are 0 canonical names that startswith a lowercase letter
4 pages with no outgoing edge
2297586 directed edges found
11 classes level 0: [1126, 1408, 2422, 2979, 3148, 3310, 4255, 4290, 4681, 5318, 15575] (48512)
32 classes level 1: [355, 360, 500, 608, 791, 849, 886, 886, 988, 1012, 1108, 1126, 1186, 1191, 1207, 1210, 1231, 1335, 1386, 1408, 1825, 1902, 2030, 2075, 2120, 2310, 2342, 2396, 2452, 2979, 3148, 3310] (48512)
251 classes level 2: [1, 2, 3, 3, 3, 5, 5, 5, 6, 8, 8, 9, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15, 15, 17, 17, 17, 17, 18, 18, 19, 20, 20, 20, 22, 23, 25, 25, 25, 29, 30, 30, 35, 35, 35, 35, 36, 36, 36, 37, 39, 39, 39, 40, 40, 40, 40, 40, 40, 41, 42, 42, 43, 43, 43, 44, 45, 45, 47, 47, 48, 48, 49, 50, 50, 51, 51, 51, 51, 52, 52, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 60, 60, 60, 60, 61, 62, 63, 68, 69, 69, 69, 70, 71, 72, 72, 74, 74, 78, 78, 78, 79, 81, 84, 85, 85, 86, 88, 89, 89, 89, 89, 90, 91, 92, 93, 93, 94, 97, 104, 105, 109, 110, 110, 110, 111, 111, 114, 116, 117, 120, 121, 121, 122, 128, 129, 131, 132, 134, 136, 138, 139, 139, 140, 140, 147, 149, 151, 152, 160, 160, 167, 170, 172, 172, 175, 177, 185, 192, 194, 194, 196, 197, 198, 199, 200, 201, 201, 204, 219, 220, 221, 221, 226, 226, 230, 234, 237, 238, 241, 249, 250, 250, 267, 270, 271, 273, 275, 280, 295, 297, 298, 299, 300, 301, 302, 302, 303, 323, 338, 353, 357, 365, 369, 378, 382, 383, 387, 395, 411, 420, 440, 441, 451, 470, 490, 500, 501, 503, 518, 529, 539, 542, 546, 563, 567, 594, 629, 654, 668, 741, 803, 807, 825, 865, 873, 931, 989, 1146, 1424, 1503, 1725] (48512)
Classes (level 2) with less than 3 nodes: ['People ->- Miscellaneous ->- Micronations', 'Physical sciences ->- Earth science ->- Earth science basics']
Nodes in classes that have less than 3 nodes: ['31749258', '2890783', '20653168']
Wikipedia:Vital articles/Level/5/People/Artists, musicians, and composers:{'class0': 'People', 'class1': 'Artists, musicians, and composers'}
Output exceeds the size limit. Open the full output data in a text editor
Wikipedia:Vital articles/Level/5/People/Writers and journalists
Wikipedia:Vital articles/Level/5/People/Artists, musicians, and composers
Wikipedia:Vital articles/Level/5/People/Entertainers, directors, producers, and screenwriters
Wikipedia:Vital articles/Level/5/People/Philosophers, historians, political and social scientists
Wikipedia:Vital articles/Level/5/People/Religious figures
Wikipedia:Vital articles/Level/5/People/Politicians and leaders
Wikipedia:Vital articles/Level/5/People/Military personnel, revolutionaries, and activists
Wikipedia:Vital articles/Level/5/People/Scientists, inventors, and mathematicians
Wikipedia:Vital articles/Level/5/People/Sports figures
Wikipedia:Vital articles/Level/5/People/Miscellaneous
Wikipedia:Vital articles/Level/5/History
Wikipedia:Vital articles/Level/5/Geography/Physical
Wikipedia:Vital articles/Level/5/Geography/Countries
Wikipedia:Vital articles/Level/5/Geography/Cities
Wikipedia:Vital articles/Level/5/Arts
Wikipedia:Vital articles/Level/5/Philosophy and religion
Wikipedia:Vital articles/Level/5/Everyday life
Wikipedia:Vital articles/Level/5/Everyday life/Sports, games and recreation
Wikipedia:Vital articles/Level/5/Society and social sciences/Social studies
Wikipedia:Vital articles/Level/5/Society and social sciences/Politic and economic
Wikipedia:Vital articles/Level/5/Society and social sciences/Culture
Wikipedia:Vital articles/Level/5/Biological and health sciences/Biology
Wikipedia:Vital articles/Level/5/Biological and health sciences/Animals
Wikipedia:Vital articles/Level/5/Biological and health sciences/Plants
Wikipedia:Vital articles/Level/5/Biological and health sciences/Health
...


32 Vital articles (level 5) found
Wiki IDs have been stored
55702953
13086 features calculated for abstracts
1958 features calculated for headers

In [6]:
outdir_wikitexts = './outputs/wikivitals_raw_wikitexts/'

raw_abstract_kept = [] # store the list of articles the complete wikitext has been kept as an abstract


# Manual treatment for some files
ids_to_handle_manually = ['209172', '21345189', '5276276', '55904', '6793009']
# files = [outdir_wikitexts + str(i) + '.wt' for i in ids_to_handle_manually]
str_to_remove = {
    '209172': ('{{efn|{{lang-ar|أحمد حسن البك}}', ' '), # problem with two '{{' but only one '}}' (parser failed)
    '21345189': ("\n'''Vladivostok'''","\n\n'''Vladivostok'''"),
    '5276276': ("}}\nThe '''Salish Sea'''", "}}\n\nThe '''Salish Sea'''"), # problem with a comment, no \n\n before the first paragraph 
    '55904': ('''({{IPAc-en|b|E|l|"|g|r|eI|d}} {{respell|bel|GRAYD}}, {{IPAc-en|'|b|ɛ|l|ɡ|ɹ|eɪ|d}} {{respell|BEL|grayd}};{{NoteTag|{{small|also}} {{IPAc-en|US|b|E|l|"|g|r|A:|d|,_|-|"|g|r|{|d}} {{respell|bel|GRAHD|,_|-|GRAD}}, {{IPAc-en|"|b|E|l|g|r|A:|d|,_|-|g|r|{|d}} {{respell|BEL|grahd|,_|-|grad}}<ref>{{Cite book|title=Collins English Dictionary|publisher=HarperCollins|year=2018|isbn=0-008-28437-7|edition=13th|chapter=Belgrade}}</ref><ref>{{Cite web|title=Definition of Belgrade {{!}} Dictionary.com|url=https://www.dictionary.com/browse/Belgrade|access-date=2022-02-14|website=www.dictionary.com|language=en}}</ref><!-- 'Collins English Dictionary,' being British, does not include the pronunciation variants given in this note. -->}} {{lang-sr|Београд / Beograd|lit=White City}}, {{IPA-sh|beǒɡrad|pron|Sr-beograd-native.ogg}}; [[Names of European cities in different languages: B|names in other languages]])''', ' '), # problem with two '{{' but only one '}}' (parser failed)
    '6793009': ('<!-- See Template:Infobox settlement for additional fields and descriptions --', '<!-- See Template:Infobox settlement for additional fields and descriptions -->') # Comment not terminated properly
}

store_content = dict()

for i in ids_to_handle_manually:
    file = outdir_wikitexts + i + '.wt'
    id = file.split('/')[-1].split('.')[0]

    f = open(file, 'r', encoding='utf8')
    wikitext = f.read()
    f.close()

    # Find highest level headers (h2 to h5)
    headers = get_highest_level_headers(wikitext, 2)

    # Abstract extraction
    try:
        # Split text at the first header
        first_header = str(headers[0])
        raw_abstract =  wikitext.split(first_header)[0]
    except:
        # Keep the raw text if no header
        raw_abstract = wikitext
        raw_abstract_kept.append(id)
        print(id)

    j,k = str_to_remove[i]
    raw_abstract_ = raw_abstract.replace(j, k)


    clean_abstract = process_abstract(raw_abstract_)
    print(clean_abstract)
    print('\n')
    store_content[i] = clean_abstract
    # outfile_abstracts.write(f'{id}\t{clean_abstract}\n')

outfile_abstracts = open('./outputs/__abstracts.txt', 'r', encoding='utf8')
lines = outfile_abstracts.readlines()
outfile_abstracts.close()

outfile_abstracts = open('./outputs/__abstracts.txt', 'w', encoding='utf8')
for l in lines:
    id = l.split('\t')[0]
    if id in ids_to_handle_manually:
        l_ = f'{id}\t{store_content[id]}\n'
        outfile_abstracts.write(l_)
    else:
        outfile_abstracts.write(l)
outfile_abstracts.close()

Ahmed Hassan al Bakr 1 July 1914 4 October 1982 was the fourth president of Iraq from 17 July 1968 to 16 July 1979. He was a leading member of the revolutionary Arab Socialist Ba ath Party and later the Baghdad based Ba ath Party and its regional organisation Ba ath Party Iraq Region the Ba ath Party s Iraqi branch which espoused Ba athism a mix of Arab nationalism and Arab socialism. Al Bakr first rose to prominence after the 14 July Revolution which overthrew the monarchy. In the newly established government he was involved in improving Iraqi Soviet relations. In 1959 al Bakr was forced to resign from the Iraqi military the then Iraqi government accused him of anti government activities. Following his forced retirement he became the chairman of the Ba ath Party s Iraqi branch s Military Bureau. Through this office he recruited members to the Ba athist cause through patronage and cronyism. Prime Minister Abd al Karim Qasim was overthrown in the Ramadan 8 February Revolution al Bakr wa

In [23]:
outfile_abstracts = open('./outputs/__abstracts.txt', 'r', encoding='utf8')
lines = outfile_abstracts.readlines()
outfile_abstracts.close()

outfile_abstracts = open('./outputs/__abstracts.txt', 'w', encoding='utf8')
for l in lines:
    id = l.split('\t')[0]
    try:
        abstract = l.split('\t')[1]
        abstract = re.sub(r"[^a-zA-Z0-9\u00C0-\u00FF.\s]"," ",abstract) # I keep numbers, letters (accentuated letters included
        abstract = ' '.join(abstract.split())
        l_ = f'{id}\t{abstract}\n'
    except:
        l_ = f'{id}\t\n'
    outfile_abstracts.write(l_)
outfile_abstracts.close()

# abstract = re.sub(r"[^a-zA-Z0-9\u00C0-\u00FF.\s]"," ",abstract) # I keep numbers, letters (accentuated letters included
# abstract = ' '.join(abstract.split())

In [9]:
import statistics
from collections import Counter
import shutil

# Let's use what we saved
# 
abstracts_file = './outputs_save/__abstracts.txt'
# abstracts_file = './outputs/__abstracts.txt'
links_file = './outputs_save/__links.txt'
headers_file = './outputs_save/__headers.txt'
wikiid_title_file = './outputs_save/wikiid_title.txt'
labels_file = './outputs_save/classification_unfiltered.txt'

# 0) Let's filter the links (we'll keep only the links in the set of abstracts found)
wikiid_title_dict = dict()
with open(wikiid_title_file, 'r', encoding='utf8') as wikiid_title:
    for l in wikiid_title:
        wikiid, title = l.strip().split('\t')[0], l.strip().split('\t')[1]
        wikiid_title_dict[wikiid] = title

wikiid_title_dict_filtered = dict()
with open(abstracts_file, 'r', encoding='utf8') as abstracts:
    for l in abstracts:
        try:
            id = l.strip().split('\t')[0]
        except:
            id = l.strip() 
        # if not id in ['31749258', '2890783', '20653168']: #Nodes that belongs to class with less than 3 elements (at level 2)
        wikiid_title_dict_filtered[id] = wikiid_title_dict[id]
print(f'{len(list(wikiid_title_dict_filtered.keys()))} articles')
set_of_titles = set([i for i in wikiid_title_dict_filtered.values()])
print(f'{len(list(wikiid_title_dict_filtered.keys()))-len(set_of_titles)} articles with the same name')

# seen = set()
# duplicates = [x.lower() for x in wikiid_title_dict_filtered.values() if x.lower() in seen or seen.add(x.lower())]
# print(duplicates)
    
title_wikiid = dict()
for k, v in wikiid_title_dict_filtered.items():
    title_wikiid[v] = k  


# 1) Let's check some infos
with open(abstracts_file, 'r', encoding='utf8') as abstracts:
    abstract_lengths = []
    empty_abstracts = []
    short_abstracts = [] # less than 50 chars
    for l in abstracts:
        try:
            length = len(l.strip().split('\t')[1])
            abstract_lengths.append(length)
            if length < 50:
                short_abstracts.append(l.strip().split('\t')[0])
        except:
            empty_abstracts.append(l.strip())
            short_abstracts.append(l.strip())
            abstract_lengths.append(0)
    print(f'Abstracts:')
    print(f'* Average abstract size: {statistics.mean(abstract_lengths)} ({statistics.stdev(abstract_lengths)})')
    print(f'* {len(empty_abstracts)} empty abstracts')
    print(f'* {len(short_abstracts)} ({100*len(short_abstracts)/len(abstract_lengths):.2f}%) short abstracts (less than 50 characters, can be considered empty):\n  {[(wikiid_title_dict_filtered[v],v) for v in short_abstracts]}\n')

    for v in short_abstracts:
        shutil.copy2(f'./outputs/wikivitals_raw_wikitexts/{v}.wt', './outputs/wikivitals_manual_changes/') # target filename is /dst/dir/file.ext

# 2) Number of articles with no headers
with open(headers_file, 'r', encoding='utf8') as headers:
    no_header = []
    for l in headers:
        try:
            id, headers_found = l.strip().split('\t')
            headers_found = [a.strip() for a in headers_found.split(' §§ ')]
        except:
            no_header.append(l.strip())
    print(f'Headers:')
    print(f'* {len(no_header)} articles with no header found:\n  {[wikiid_title_dict_filtered[v] for v in no_header]}\n')

# 3) Get the links filtered
outfile_links_filtered = open('./outputs/__links_filtered.txt', 'w', encoding = 'utf8')
page_without_link = 0
number_edges = 0

# First, let's check the number of canonical name that starts with a lower letter 
tmp = [i for i in set_of_titles if i[0].islower()]
print(f"There are {len(tmp)} canonical names that startswith a lowercase letter")
with open(links_file, 'r', encoding = 'utf8') as links:
    for l in links:
        try:
            id, links_found = l.strip().split('\t')
            if id == '18562':
                print(links_found)
                print('-------------')
            links_found = [a.strip() for a in links_found.split(' §§ ')]
            links_found = [a.replace(a[0], a[0].upper(), 1) for a in links_found]
            if id == '18562':
                print(links_found)
            links_found_filtered = set_of_titles.intersection(set(links_found))
            number_edges += len(links_found_filtered)
            for wikiid in [title_wikiid[t] for t in links_found_filtered]:
                outfile_links_filtered.write(f'{id}\t{wikiid}\t1.0\n')
        except:
            id = l.strip() # case where there is no link in the page
            links_found = []
            page_without_link += 1
            links_found_filtered = set()
            outfile_links_filtered.write(f'{id}\n')
print(f'{page_without_link} pages with no outgoing edge')
print(f'{number_edges} directed edges found')
outfile_links_filtered.close()

# 4) Calculate the classes
outfile_classes_0 = open('./outputs/__classes0.txt', 'w', encoding = 'utf8')
outfile_classes_1 = open('./outputs/__classes1.txt', 'w', encoding = 'utf8')
outfile_classes_2 = open('./outputs/__classes2.txt', 'w', encoding = 'utf8')
with open(labels_file, 'r', encoding='utf8') as labels:
    set_class0, set_class1, set_class2 = [], [], []
    ids = []
    for l in labels:
        try:
            name, class0, class1, class2 = l.strip().split('\t')
            name = name.strip()
            id = title_wikiid[name] # fails if id not in the keys
            ids.append(id)
            set_class0.append(class0)
            set_class1.append(class0 + ' ->- ' + class1)
            set_class2.append(class0 + ' ->- ' + class1 + ' ->- ' + class2)
            outfile_classes_0.write(id + '\t' + class0 + '\n')
            outfile_classes_1.write(id + '\t' + class0 + ' ->- ' + class1 + '\n')
            outfile_classes_2.write(id + '\t' + class0 + ' ->- ' + class1 + ' ->- ' + class2 + '\n')
        except:
            None   
outfile_classes_0.close()
outfile_classes_1.close()
outfile_classes_2.close()
# print(set(ids).symmetric_difference(set(title_wikiid.values())))      
print(f'{len(set(set_class0))} classes level 0: {sorted(Counter(set_class0).values())} ({sum(Counter(set_class0).values())})')
print(f'{len(set(set_class1))} classes level 1: {sorted(Counter(set_class1).values())} ({sum(Counter(set_class1).values())})')
print(f'{len(set(set_class2))} classes level 2: {sorted(Counter(set_class2).values())} ({sum(Counter(set_class2).values())})')   
        
tmp = Counter(set_class2)
less_3 = [i for i in tmp.keys() if tmp[i]<3]
print(f'Classes (level 2) with less than 3 nodes: {less_3}')
indices_nodes_in_less_3 = [i for i in range(len(ids)) if set_class2[i] in less_3]
nodes_in_less_3 = [ids[i] for i in indices_nodes_in_less_3]
print(f'Nodes in classes that have less than 3 nodes: {nodes_in_less_3}')
        
        
  

48512 articles
0 articles with the same name
Abstracts:
* Average abstract size: 1253.1711123021107 (956.4818379403986)
* 8 empty abstracts
* 26 (0.05%) short abstracts (less than 50 characters, can be considered empty):
  [('Crate', '1010583'), ('Sono Osato', '1253459'), ('Lois Mailou Jones', '1475461'), ('Timeline of zoology', '15301001'), ('Akaji Maro', '16815417'), ('History of Hesse', '182064'), ('List of fashion magazines', '20102549'), ('Kim Young-ha', '22075910'), ('Law of Austria', '25135167'), ('Arthropathy', '2571116'), ('History of Saint Vincent and the Grenadines', '27229'), ('History of the jet engine', '27888245'), ('Kim Dong-in', '28511848'), ('Lists of legislation', '32102836'), ('History of Vanuatu', '32444'), ('Isawa Shūji', '34531593'), ('Nógrád County', '349050'), ('Road junction', '3720055'), ('Law of Romania', '39338227'), ('Timeline of railway history', '398236'), ('History of Benin', '42386'), ('Timeline of materials technology', '58742'), ('Timeline of geology

Here we'll stem the abstracts

In [8]:
import nltk.stem
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords
import numpy as np



english_stemmer = nltk.stem.SnowballStemmer('english', ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([english_stemmer.stem(w) for w in analyzer(doc)])

vectorizer = StemmedCountVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2), min_df=0.001)

tfidf_vectorizer = TfidfTransformer()

In [9]:
abstracts_file = './outputs_save/__abstracts.txt'
outfile_abstracts_vocabulary = open('./outputs/__abstracts_vocabulary.txt', 'w', encoding = 'utf8')
outfile_abstracts_countvec = open('./outputs/__abstracts_countvec.txt', 'w', encoding = 'utf8')
outfile_abstracts_tfidf = open('./outputs/__abstracts_tfidf.txt', 'w', encoding = 'utf8')

with open(abstracts_file, 'r', encoding = 'utf8') as abstracts:
    corpus = []
    ids = []
    stop_cnt = 0
    for l in abstracts:
        try:
            id, abstract = l.strip().split('\t')
            abstract.replace('.', ' ')
            corpus.append(abstract)
        except:
            id = l.strip()
            corpus.append('')
        ids.append(id)
        stop_cnt += 1


X = vectorizer.fit_transform(corpus)
X_tf = tfidf_vectorizer.fit_transform(X)
feature_names = np.array(vectorizer.get_feature_names_out())
nzX_rows, nzX_cols = np.nonzero(X)
feats_dict = dict()
for i in range(len(ids)):
    feats_dict[i] = []
for a, b in zip(nzX_rows, nzX_cols):
    feats_dict[a].append((feature_names[b], X[a,b], X_tf[a,b]))

for i in range(len(ids)):
    str2write = ids[i] + '\t' + ' '.join(f'abs_{a.replace(" ", "_")}:{b}' for a,b,_ in feats_dict[i]) + '\n'
    outfile_abstracts_countvec.write(str2write)
    str2write = ids[i] + '\t' + ' '.join(f'abs_{a.replace(" ", "_")}:{b}' for a,_,b in feats_dict[i]) + '\n'
    outfile_abstracts_tfidf.write(str2write)
outfile_abstracts_vocabulary.write('\t'.join([f'abs_{a.replace(" ", "_")}' for a in feature_names]))
    


outfile_abstracts_vocabulary.close()
outfile_abstracts_countvec.close()
outfile_abstracts_tfidf.close()

print(f'{len(feature_names)} features calculated for abstracts')

11939 features calculated for abstracts


In [10]:
import re

headers_file = './outputs_save/__headers.txt'
outfile_headers_vocabulary = open('./outputs/__headers_vocabulary.txt', 'w', encoding = 'utf8')
outfile_headers_countvec = open('./outputs/__headers_countvec.txt', 'w', encoding = 'utf8')
outfile_headers_tfidf = open('./outputs/__headers_tfidf.txt', 'w', encoding = 'utf8')

with open(headers_file, 'r', encoding = 'utf8') as headers:
    corpus = []
    global_corpus = []
    ids = []
    stop_cnt = 0
    for l in headers:
        try:
            id, header = l.strip().split('\t')
            tmp = [re.sub(r"[^a-zA-Z0-9\u00C0-\u00FF.\s]"," ",i) for i in header.split('§§')]
            tmp = [' '.join(i.split()) for i in tmp]
            global_corpus = global_corpus + tmp
            # header = re.sub(r"[^a-zA-Z0-9\u00C0-\u00FF.\s]"," ",header) # I keep numbers, letters (accentuated letters included)
            header = ' '.join(tmp)
            corpus.append(header)
        except:
            id = l.strip()
            corpus.append('')
        ids.append(id)
        stop_cnt += 1

# print(corpus)
vectorizer.fit(global_corpus)
# vocab = vectorizer.vocabulary_
X = vectorizer.transform(corpus)
X_tf = tfidf_vectorizer.fit_transform(X)
feature_names = np.array(vectorizer.get_feature_names_out())
nzX_rows, nzX_cols = np.nonzero(X)
feats_dict = dict()
for i in range(len(ids)):
    feats_dict[i] = []
for a, b in zip(nzX_rows, nzX_cols):
    feats_dict[a].append((feature_names[b], X[a,b], X_tf[a,b]))
for i in range(len(ids)):
    str2write = ids[i] + '\t' + ' '.join(f'hea_{a.replace(" ", "_")}:{b}' for a,b,_ in feats_dict[i]) + '\n'
    outfile_headers_countvec.write(str2write)
    str2write = ids[i] + '\t' + ' '.join(f'hea_{a.replace(" ", "_")}:{b}' for a,_,b in feats_dict[i]) + '\n'
    outfile_headers_tfidf.write(str2write)
outfile_headers_vocabulary.write('\t'.join([f'hea_{a.replace(" ", "_")}' for a in feature_names]))
    



outfile_headers_vocabulary.close()
outfile_headers_countvec.close()
outfile_headers_tfidf.close()

print(f'{len(feature_names)} features calculated for headers')

202 features calculated for headers


In [11]:
# abstracts_file = './outputs_save/__abstracts.txt'
wikiid_title_file = './outputs_save/wikiid_title.txt'

outfile_titles_vocabulary = open('./outputs/__titles_vocabulary.txt', 'w', encoding = 'utf8')
outfile_titles_countvec = open('./outputs/__titles_countvec.txt', 'w', encoding = 'utf8')
outfile_titles_tfidf = open('./outputs/__titles_tfidf.txt', 'w', encoding = 'utf8')
outfile_titles = open('./outputs/__titles.txt', 'w', encoding = 'utf8')

# WARNING: min_df changed for titles
vectorizer = StemmedCountVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2), min_df=0.0001)

# 0) Let's filter the titles (we'll keep only the links in the set of abstracts found)
wikiid_title_dict = dict()
with open(wikiid_title_file, 'r', encoding='utf8') as wikiid_title:
    for l in wikiid_title:
        wikiid, title = l.strip().split('\t')
        wikiid_title_dict[wikiid] = title

wikiid_title_dict_filtered = dict()
with open(abstracts_file, 'r', encoding='utf8') as abstracts:
    ids = []
    for l in abstracts:
        try:
            id = l.strip().split('\t')[0]
            ids.append(id)
        except:
            id = l.strip() 
            ids.append(id)
        wikiid_title_dict_filtered[id] = wikiid_title_dict[id]

corpus = [wikiid_title_dict_filtered[id] for id in ids]
corpus = [re.sub(r"[^a-zA-Z0-9\u00C0-\u00FF.\s]"," ",i) for i in corpus]
corpus = [' '.join(i.split()) for i in corpus]
for id in ids:
    outfile_titles.write(f'{id}\t{wikiid_title_dict_filtered[id]}\n')

X = vectorizer.fit_transform(corpus)
X_tf = tfidf_vectorizer.fit_transform(X)
feature_names = np.array(vectorizer.get_feature_names_out())
nzX_rows, nzX_cols = np.nonzero(X)
feats_dict = dict()
for i in range(len(ids)):
    feats_dict[i] = []
for a, b in zip(nzX_rows, nzX_cols):
    feats_dict[a].append((feature_names[b], X[a,b], X_tf[a,b]))
for i in range(len(ids)):
    str2write = ids[i] + '\t' + ' '.join(f'tit_{a.replace(" ", "_")}:{b}' for a,b,_ in feats_dict[i]) + '\n'
    outfile_titles_countvec.write(str2write)
    str2write = ids[i] + '\t' + ' '.join(f'tit_{a.replace(" ", "_")}:{b}' for a,_,b in feats_dict[i]) + '\n'
    outfile_titles_tfidf.write(str2write)
outfile_titles_vocabulary.write('\t'.join([f'tit_{a.replace(" ", "_")}' for a in feature_names]))
    



outfile_titles_vocabulary.close()
outfile_titles_countvec.close()
outfile_titles_tfidf.close()
outfile_titles.close()

print(f'{len(feature_names)} features calculated for titles')

3664 features calculated for titles


In [2]:
# CHI-2 selection of features
# Let's select 4000 most 'predictive' features for class level 3 

from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import coo_matrix
import numpy as np

def get_stems(list_of_vocabulary_files):
    stems_indices = dict()
    i = 0
    for file in list_of_vocabulary_files:
        with open(f'./outputs_save/{file}', 'r', encoding = 'utf8') as v_f:
            stems_full = v_f.readline().split('\t')
            for s in stems_full:
                stems_indices[s] = i
                i += 1
    return(stems_indices)

def get_article_indices(id_whatever_file):
    wikiids_indices = dict()
    i = 0
    with open(f'./outputs_save/{id_whatever_file}', 'r', encoding = 'utf8') as id_whatever:
        for line in id_whatever:
            id = line.strip().split('\t')[0]
            wikiids_indices[id] = i
            i+=1
    return wikiids_indices

def load_features(list_of_feature_files, list_of_vocabulary_files):
    stems_indices = get_stems(list_of_vocabulary_files)
    wikiids_indices = get_article_indices(list_of_feature_files[0])
    rows_, cols_, data_ = [], [], []
    for file in list_of_feature_files:
        with open(f'./outputs_save/{file}', 'r', encoding = 'utf8') as id_feat:
            for line in id_feat:
                try:
                    id = wikiids_indices[line.strip().split('\t')[0]] 
                except:
                    break
                try: 
                    features_str = line.strip().split('\t')[1]
                except:
                    features_str = ''
                cols = [stems_indices[stem_cnt.split(':')[0]] for stem_cnt in features_str.strip().split()]
                cols_ += cols
                rows_ += [id for _ in range(len(cols))]
                data_ += [1 for _ in range(len(cols))]
    num_nodes = len(wikiids_indices.keys())
    vocab_size = len(stems_indices.keys())
    tmp = sorted([(v,k) for k,v in stems_indices.items()])
    ordered_stems = [k for (_, k) in tmp]
    return coo_matrix((data_, (rows_, cols_)), shape=(num_nodes, vocab_size)), ordered_stems

def load_target(target_file, ref_file):
    wikiids_indices = get_article_indices(ref_file)
    targets = dict()
    y_out = [[None] for _ in range(len(wikiids_indices.keys()))]
    with open(f'./outputs_save/{target_file}', 'r', encoding = 'utf8') as id_target:
        for line in id_target:
            if not line.strip() == '':
                wikiid, target_str = line.strip().split('\t')
                id = wikiids_indices[wikiid] 
                targets[id] = target_str
    for id in targets.keys():
        y_out[id][0] = targets[id]
    return np.array(y_out)
    


X, feats = load_features([
        '__abstracts_countvec.txt',
        '__headers_countvec.txt' ,
        '__titles_countvec.txt'
    ], 
    [
        '__abstracts_vocabulary.txt',
        '__headers_vocabulary.txt',
        '__titles_vocabulary.txt'
    ])  

# X, feats = load_features([
#         '__headers_countvec.txt' ,
#         '__titles_countvec.txt'
#     ], 
#     [
#         '__headers_vocabulary.txt',
#         '__titles_vocabulary.txt'
#     ])    

y_target = load_target(
    '__classes2.txt',
    '__abstracts_countvec.txt'
)

# for i in y_target:
#     if i == [None]:
#         print(i)


ch2 = SelectKBest(chi2, k=4000)
X_new = ch2.fit_transform(X, y_target)
feats1 = ch2.get_feature_names_out(input_features=feats)



y_target = load_target(
    '__classes1.txt',
    '__abstracts_countvec.txt'
)

ch2 = SelectKBest(chi2, k=4000)
ch2.fit(X, y_target)
feats2 = ch2.get_feature_names_out(input_features=feats)






In [3]:
# Save the filtered vocabulary and the definitive features
feats_f = open('./outputs/__wikivitals_features_one_hot.txt', 'w', encoding='utf8')
vocab_f = open('./outputs/__wikivitals_vocabulary.txt', 'w', encoding='utf8')

# 1- Save the definitive vocabulary
vocab_f.write('\t'.join(feats1))
vocab_f.close()

# 2- Save the features (one-hot representations)
X_new_ = X_new.tocoo()
wikiid_indices = get_article_indices('__abstracts_countvec.txt') # wiki id : id (from 0 to (num. articles - 1))
sorted_wikiids = [k for v,k in sorted([(v_,k_) for k_, v_ in wikiid_indices.items()])]
wikiid_feats_couples = zip(
    [sorted_wikiids[i] for i in X_new_.row], 
    [feats1[i] for i in X_new_.col])
id_feats = dict()
for wikiid, feat in wikiid_feats_couples:
    id_feats[wikiid] = id_feats.get(wikiid, [])
    id_feats[wikiid].append(feat)

for k,v in id_feats.items():
    l = str(k) + '\t' + ' '.join([str(j)+':1' for j in v]) + '\n'
    feats_f.write(l)

feats_f.close()




In [33]:
# Display some statistics
import statistics

print('Average number of features per articles (with standard deviation):')
lengths = [len(v) for v in id_feats.values()]
m, std = statistics.mean(lengths), statistics.stdev(lengths)
print(f'{m:.2f} ({std:.2f})')

print('Number of features per source:')
prefixes = Counter([i[:3] for i in feats1])
print(prefixes)

Average number of features per articles (with standard deviation):
58.47 (34.61)
Number of features per source:
Counter({'abs': 3452, 'tit': 370, 'hea': 178})


# Get the templates (links at the end of an article)

In [76]:
# import mwxml
# import mwtypes
# import wikitextparser
# # Note: also install wikipedia_sections (pip install wikipedia_sections)

# import re

# # Internal path to the Wikipedia dump split
# wikidump_splits_path = "C:/Users/Antoine/Downloads/enwiki-split"
# dump_files = os.listdir(wikidump_splits_path)
# files = [wikidump_splits_path + '/' + f for f in dump_files]
# files = files



# # Set of all article canonical titles
# article_titles = set(list(articles_classification.keys()))



# def page_info(dump, path):
#     """
#     Read the dump page per page. Yield infos (id, title, redirect, and text)
#     only for template pages in the set of wikivitals articles (global var)
#     (can be paralellized, can't make it work)

#     Input: 
#     * dump: list of Wikidumps (with format .xml.bz2)
#     * path: str - path to Wikidump split   
#     """
#     for page in dump:
#         tstamp = mwtypes.Timestamp(0)
#         if page.title in article_titles and page.redirect == None and page.namespace == 10:
#             for revision in page:
#                 if revision.timestamp > tstamp:
#                     last_revision = revision
#                     tstamp = revision.timestamp
#                 text = last_revision.text
#             yield page.id, page.title, page.redirect, text


# # If some articles are redirections, we'll exclude them from our set of articles
# redirections = []
# raw_abstract_kept = []
# templates_content = dict()
# counter = 0

# # adjustment1 = re.compile(r'thumb\|.*?\n')
# adjustment2 = re.compile(r'\{\|.*\|\}', re.DOTALL)
# adjustment3 = re.compile(r'\[\[.*\]\]', re.DOTALL)
# # adjustment4 = re.compile(r'^.*\|.*?\n', re.DOTALL)

# for file in files:
#     print(file)
#     for id, title, redirect, wikitext in mwxml.map(page_info, [file]):
        
#         # Exclude redirections 
#         if not redirect == None:
#             redirections.append(title)
#         else:

#             # Find the internal links
#             links, _ = wikilinks_namespace0(wikitext)
            
  

#             # if (not title in articles_content.keys()) or (articles_content[title]['clean abstract'] == None):
#             templates_content[title] = {
#                 'id': id,
#                 'title': title,
#                 'links': set(links),
#                 'wikitext': wikitext.replace('\n', ' ')
#             }
#             counter +=1
#             if counter%1000 == 0:
#                 print(f'{counter} templates found')

# print(len(list(templates_content.keys())))

C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles1.xml-p1p41242.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles10.xml-p4045403p5399366.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles11.xml-p5399367p6899366.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles11.xml-p6899367p7054859.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles12.xml-p7054860p8554859.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles12.xml-p8554860p9172788.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles13.xml-p10672789p11659682.bz2
1000 templates found
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles13.xml-p9172789p10672788.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles14.xml-p11659683p13159682.bz2
C:/Users/Antoine/Downloads/enwiki-split/enwiki-20220401-pages-articles14.xml-p13159683p14324602.bz2
C

Stats

In [2]:
from statistics import mean, stdev

links_file = './outputs_save/__links_filtered.txt'
class0_file = './outputs_save/__classes0.txt'
class1_file = './outputs_save/__classes1.txt'
class2_file = './outputs_save/__classes2.txt'

def file2dict(file):
    f = open(file, 'r', encoding='utf8')
    d = dict()
    for l in f:
        i, j = l.strip().split('\t')
        d[i] = j
    return d

with open(links_file, 'r', encoding='utf8') as links:
    set_of_links = set()
    cnt = 0
    for l in links:
        try:
            start, end, w = l.strip().split('\t')
            set_of_links.add((start, end))
            set_of_links.add((end, start))
            cnt += 1
        except:
            None
print(f'Directed graph: {cnt} nodes')
num_edges_undirected = len(set_of_links)
print(f'Undirected graph: {num_edges_undirected} nodes\n')


# Homophily metrics (in undirected graph)
class0_ = file2dict(class0_file)
class1_ = file2dict(class1_file)
class2_ = file2dict(class2_file)
H0 = sum([1 for a,b in set_of_links if class0_[a] == class0_[b]]) / num_edges_undirected
H1 = sum([1 for a,b in set_of_links if class1_[a] == class1_[b]]) / num_edges_undirected
H2 = sum([1 for a,b in set_of_links if class2_[a] == class2_[b]]) / num_edges_undirected
print(f'Edge homophily for class level 0: {H0:.2f}')
print(f'Edge homophily for class level 1: {H1:.2f}')
print(f'Edge homophily for class level 2: {H2:.2f}\n')

# Edge distribution
nodes_id = [k for k in class0_.keys()]
degrees = dict()
for id in nodes_id:
    degrees[id] = 0
for a,b in set_of_links:
    degrees[a] += 1
degrees_list = [v for v in degrees.values()]
print(f'Average outgoing degree: {mean(degrees_list):.2f} ({stdev(degrees_list):.2f})')
print(f'Max & min degree: {max(degrees_list)} & {min(degrees_list)}')
print(f'Number of isolated nodes: {len([i for i in degrees_list if i == 0])}')

Directed graph: 2297782 nodes
Undirected graph: 4132534 nodes

Edge homophily for class level 0: 0.34
Edge homophily for class level 1: 0.24
Edge homophily for class level 2: 0.15

Average outgoing degree: 85.19 (141.23)
Max & min degree: 7720 & 0
Number of isolated nodes: 16


In [3]:
from collections import Counter
import plotly.graph_objects as go


def plot_degree_distribution(
    degrees
):

    c= Counter(degrees)
    x_val = list(c.keys())
    y_val = [c[i] for i in x_val]
    print('Maximum degree: {}'.format(int(max(x_val))))
    print('Max node count with the same degree: {}'.format(int(max(y_val))))

    fig = go.Figure()

    fig.add_trace(go.Scatter(mode='markers', x=x_val, y=y_val ))

    fig.update_xaxes(type="log", range=[-0.1,2.5]) # log range: 10^0=1, 10^5=100000
    fig.update_yaxes(type="log", range=[-0.1,4]) # linear range
    fig.update_layout(
        height=500,
        width=1000,
        title_text="Log log distribution of degrees",
        xaxis_title='Node degree',
        yaxis_title='Node count')
    return fig

plot_degree_distribution(degrees_list)

Maximum degree: 7720
Max node count with the same degree: 644


In [7]:
class2_file = './outputs_save/__classes2.txt'

d_class = dict()
with open(class2_file, 'r', encoding='utf8') as f:
    for l in f:
        try:
            wikiid, cl2 = l.strip().split('\t')
            c0, c1, c2 =  cl2.split('->-') 
            if not c0 in d_class.keys():
                d_class[c0] = {c1:{c2:{'count': 0}, 'count':0}, 'count':0}
            elif not c1 in d_class[c0].keys():
                d_class[c0][c1] = {c2:{'count': 0}, 'count':0}
            elif not c2 in d_class[c0][c1].keys():
                d_class[c0][c1][c2] = {'count': 0}
        except:
            break

with open(class2_file, 'r', encoding='utf8') as f:
    for l in f:
        try:
            wikiid, cl2 = l.strip().split('\t')
            c0, c1, c2 =  cl2.split('->-') 
            d_class[c0]['count'] += 1
            d_class[c0][c1]['count'] += 1
            d_class[c0][c1][c2]['count'] += 1
        except:
            break

class_hierarchy = open('./outputs/__class_hierarchy.txt', 'w', encoding='utf8')

for k0 in sorted(d_class.keys()):
    if not k0 == 'count':
        class_hierarchy.write(f'{k0} ({d_class[k0]["count"]} articles)\n')
        for k1 in sorted(d_class[k0].keys()):
            if not k1 == 'count':
                class_hierarchy.write(f'\t\t{k1} ({d_class[k0][k1]["count"]} articles)\n')
                for k2 in sorted(d_class[k0][k1].keys()):
                    if not k2 == 'count':
                        class_hierarchy.write(f'\t\t\t\t{k2} ({d_class[k0][k1][k2]["count"]} articles)\n')




class_hierarchy.close()


print(d_class)

{'Arts ': {' Arts ': {' General': {'count': 5}, 'count': 3310, ' Architecture': {'count': 249}, ' Visual arts': {'count': 500}, ' Cultural venues': {'count': 131}, ' Literature': {'count': 989}, ' Music': {'count': 803}, ' Performing arts': {'count': 198}, ' Modern visual arts': {'count': 301}, ' Fictional characters': {'count': 134}}, 'count': 3310}, 'Philosophy and religion ': {' Philosophy and religion ': {' Abrahamic religions': {'count': 411}, 'count': 1408, ' Philosophy': {'count': 197}, ' Religion and spirituality': {'count': 170}, ' Eastern religions': {'count': 194}, ' Other religions': {'count': 71}, ' Mythology': {'count': 365}}, 'count': 1408}, 'Everyday life ': {' Sports, games and recreation ': {' Sports': {'count': 546}, 'count': 1231, ' Entertainment': {'count': 518}, ' Sports organizations': {'count': 167}}, 'count': 2422, ' Everyday life ': {' Cooking, food and drink': {'count': 654}, 'count': 1191, ' Clothing and fashion': {'count': 192}, ' General': {'count': 3}, ' 