In [10]:
%run ../utils/dataset_building.py

In [11]:
category_graph_file="../data/category_graph.tsv"
article_categories_file="../data/article_categories.tsv"

In [12]:
# key: parent category title, value: set of child categories
parent_children={}

with open(category_graph_file) as f:
    for line in f:
        split_line=line.split()
        node=split_line[0]
        parent_node=split_line[1]

        if parent_node not in parent_children.keys():
            parent_children[parent_node]=set()
            
        parent_children[parent_node].add(node)

In [13]:
def get_all_children(category):
    # returns the set of categories which are in the subtree of a given category
    
    # nodes that might have children that we'll need to get
    nodes_to_check=set(parent_children[category])
    # nodes that we already got
    checked_nodes=set()
    
    # the idea is to traverse the graph from a given category down to the leaves and get
    # all encountered categories while traversing
    while len(nodes_to_check)>0:
        new_nodes_to_check=set()
        for node in nodes_to_check:
            checked_nodes.add(node)
            if node in parent_children.keys():
                for child in parent_children[node]:
                    new_nodes_to_check.add(child)
        
        for node in new_nodes_to_check:
            nodes_to_check.add(node)
            
        for node in checked_nodes:
            if node in nodes_to_check:
                nodes_to_check.remove(node)
                
    return checked_nodes

In [14]:
# the goal is to remove categories such as "Articles with French-language external links"
# which are used only internally for wikipedia maintenance
categories_to_keep=get_all_children("Main_topic_classifications")

In [15]:
categories_to_keep.add("Main_topic_classifications")

In [21]:
nb_categories_before_filtering=len(set(parent_children.keys()).union(*parent_children.values()))

In [22]:
# number of categories before any filtering
nb_categories_before_filtering

2053941

In [23]:
# number of wikipedia maintenance categories
nb_categories_before_filtering-len(categories_to_keep)

306238

In [16]:
# number of categories after removing wikipedia maintenance categories
len(categories_to_keep)

1765286

In [24]:
# proportion of maintenance categories
(nb_categories_before_filtering-len(categories_to_keep))/nb_categories_before_filtering

0.14909775889375596

In [17]:
# remove categories marked as stubs
stub_categories=set([category for category in categories_to_keep if "stubs" in category.lower()])
categories_to_keep-=stub_categories

In [20]:
# number of stub categories
len(stub_categories)

17583

In [19]:
# remaining number of categories
len(categories_to_keep)

1747703

In [25]:
# proportion of stub categories
len(stub_categories)/nb_categories_before_filtering

0.008560615908636129

In [8]:
category_graph_filtered=[]
with open(category_graph_file) as f:
    for line in f:
        split=line.split()
        category=split[0]
        parent_category=split[1]
        if category in categories_to_keep and parent_category in categories_to_keep:
            category_graph_filtered.append((category,parent_category))

In [9]:
write_tuple_list_to_file(category_graph_filtered,"../data/category_graph_filtered.tsv")

In [10]:
del category_graph_filtered

In [11]:
# detect articles flagged as stubs
stub_articles=set()
with open(article_categories_file) as f:
    for line in f:
        split_line=line.strip().split()
        
        category=split_line[2]
        if "stubs" in category.lower():
            article_id=int(split_line[0])
            stub_articles.add(article_id)

In [12]:
# we also only keep articles in the article_categories file which belong to categories that
# are in the subtree of "Main topic classifications" and are not marked as stubs
article_categories_filtered=[]

with open(article_categories_file) as f:
    for line in f:
        split_line=line.strip().split()
        
        category=split_line[2]
        if category not in categories_to_keep:
            continue
            
        article_id=int(split_line[0])
        if article_id in stub_articles:
            continue
        
        article_title=split_line[1]

        article_categories_filtered.append((article_id,article_title,category))

In [13]:
write_tuple_list_to_file(article_categories_filtered,"../data/article_categories_filtered.tsv")