In [1]:
%run ../utils/dataset_building.py

In [None]:
# Set this to True to extract the sections from the 2017 dump and reproduce the results of the
# 2017 paper by computing the category section counts instead of using the provided ones.
# In this notebook the flag gates the filter_sections call on the 2017 article-sections file.
# NOTE: the name is misspelled ("reproducion"); it is kept as-is because a later cell reads it.
reproducion_2017=False

In [2]:
from collections import Counter

In [3]:
# Path to the article-sections JSON file (the 2017 counterpart is defined in a later cell).
# NOTE(review): no visible cell passes this path to filter_sections -- confirm whether that
# call was removed or whether the filtered file is meant to be provided.
article_sections_filename="../data/article_sections.json"

In [10]:
def filter_sections(article_sections_filename):
    """Remove uninformative section titles from an article -> sections mapping and save it.

    The mapping is loaded with ``load_all_article_sections``. Two kinds of section titles
    are then removed from every article:

    * the 13 titles in ``sections_to_ignore`` (too broad for recommendation, as defined
      in the original paper);
    * titles that are counted exactly once over the whole mapping ("unique" sections).

    Articles left with no section are dropped. Summary statistics are printed along the
    way and the filtered mapping is written with ``dump_json`` to the input path with its
    trailing ``.json`` replaced by ``_filtered.json``.

    ``load_all_article_sections`` and ``dump_json`` are not defined in this notebook;
    presumably they come from the ``%run ../utils/dataset_building.py`` cell.

    Args:
        article_sections_filename: path of the article-sections file to filter.

    Returns:
        None. The result is only written to disk.
    """
    # a set of 13 sections which are too broad for recommendation, defined in the original paper
    sections_to_ignore = {
        "References", "External links", "See also", "Notes", "Further reading",
        "Bibliography", "Sources", "Footnotes", "Notes and references",
        "References and notes", "External sources", "Links", "References and sources",
    }
    article_sections = load_all_article_sections(article_sections_filename)

    # Count every occurrence of every section title.
    # NOTE(review): a title repeated inside a single article is counted more than once, so
    # it is not treated as unique even though only one article uses it -- confirm this is
    # intended before changing it, since it affects the reproduced statistics.
    section_counter = Counter()
    for sections in article_sections.values():
        section_counter.update(sections)

    unique_sections = {section for section, count in section_counter.items() if count == 1}
    # Guard the ratio so an empty input reports 0.0 instead of raising ZeroDivisionError.
    unique_proportion = len(unique_sections) / len(section_counter) if section_counter else 0.0
    print(f"Number of unique sections: {len(unique_sections)}")
    print(f"Proportion of sections that were used in only one article: {unique_proportion}")
    print(f"Number of remaining different sections: {len(section_counter)-len(unique_sections)}")

    articles_with_0_sections = []
    # Only values of existing keys are replaced here, so the dict size does not change
    # while it is being iterated.
    for article_id, sections in article_sections.items():
        kept_sections = [
            section for section in sections
            if section not in unique_sections and section not in sections_to_ignore
        ]
        article_sections[article_id] = kept_sections
        if not kept_sections:
            articles_with_0_sections.append(article_id)

    # The proportion is taken over the article count *before* the empty articles are deleted.
    empty_proportion = len(articles_with_0_sections) / len(article_sections) if article_sections else 0.0
    print(f"Number of articles which had 0 sections after filtering sections: {len(articles_with_0_sections)}")
    print(f"Proportion of articles that had no sections left after filtering: {empty_proportion}")
    for article_id in articles_with_0_sections:
        del article_sections[article_id]

    print(f"Number of articles after filtering: {len(article_sections)}")

    # Strip only a trailing ".json"; splitting on the first ".json" would truncate paths
    # that contain that text earlier (e.g. in a directory name).
    json_suffix = ".json"
    if article_sections_filename.endswith(json_suffix):
        output_base = article_sections_filename[:-len(json_suffix)]
    else:
        output_base = article_sections_filename
    dump_json(article_sections, output_base + "_filtered.json")

In [8]:
# number of unique sections
# NOTE(review): unique_sections is only assigned inside filter_sections in the visible cells,
# so this cell presumably predates that refactor and would raise NameError on a fresh kernel.
len(unique_sections)

1425477

In [9]:
# proportion of distinct section titles that were unique, i.e. used in only one article
# NOTE(review): unique_sections and section_counter are only assigned inside filter_sections
# in the visible cells; this cell presumably relies on state from an earlier notebook version.
len(unique_sections)/len(section_counter)

0.8651345456134452

In [10]:
# number of distinct section titles that remain once the unique ones are removed
# NOTE(review): section_counter and unique_sections are only assigned inside filter_sections
# in the visible cells; this cell presumably relies on state from an earlier notebook version.
len(section_counter)-len(unique_sections)

222217

In [12]:
# number of articles which had 0 sections after removing unique and too broad sections
# NOTE(review): articles_with_0_sections is only assigned inside filter_sections in the
# visible cells; this cell presumably relies on state from an earlier notebook version.
len(articles_with_0_sections)

588489

In [14]:
# proportion of articles that had no sections left after filtering
# NOTE(review): both names are only assigned inside filter_sections in the visible cells.
# The recorded value (0.1943...) equals 588489/3027660, i.e. it appears to have been divided
# by the article count *after* the empty articles were removed, whereas filter_sections
# divides by the count before removal -- treat this output with care.
len(articles_with_0_sections)/len(article_sections)

0.194370900299241

In [15]:
# number of articles after filtering
# NOTE(review): article_sections is only assigned inside filter_sections in the visible
# cells; this cell presumably relies on state from an earlier notebook version.
len(article_sections)

3027660

In [13]:
# Filter the 2017 article-sections file; this only runs when reproducion_2017 is True.
# filter_sections prints its statistics and writes the result next to the input as
# ../data/article_sections_2017_filtered.json.
# NOTE(review): the output recorded below was presumably produced with the flag set to True.
article_sections_2017_filename="../data/article_sections_2017.json"
if reproducion_2017:
    filter_sections(article_sections_2017_filename)

Number of unique sections: 1378941
Proportion of sections that were used in only one article: 0.8652336262118652
Number of remaining different sections: 214780
Number of articles which had 0 sections after filtering sections: 1886067
Proportion of articles that had no sections left after filtering: 0.37015455268038505
Number of articles after filtering: 3209283
