In [None]:
# In this notebook, the word "sentences" is used for what the report calls "section contents".

In [1]:
%run ../utils/common.py

In [2]:
import networkx as nx
import community as community_louvain
from collections import Counter

In [3]:
def split_categories_into_connected_components(articles_categories_file,wcnoutput_file):
    """Group categories into connected components and write them to disk.

    Two categories are linked when at least one retained article lists both of them;
    the connected components of that category graph are written, one JSON object per
    line (``{"categories": [titles...]}``), to
    ``../data/categories_splitted_into_connected_components.json``.
    The output file is overwritten on every call so that re-running the notebook
    does not accumulate duplicate components.

    Parameters
    ----------
    articles_categories_file : str
        Whitespace-separated file; column 0 is read as the integer article id and
        column 2 as the category title.
    wcnoutput_file : str
        JSON-lines file; every line must provide a ``'category'`` title and an
        ``'articles'`` list.

    Returns
    -------
    tuple
        ``(article_categories, category_title_to_id_map, category_id_to_title_map,
        category_articles_wcn)`` where ``article_categories`` maps an article id to the
        list of retained category ids and ``category_articles_wcn`` maps a category id
        to the set of articles listed for it in ``wcnoutput_file``.
    """

    # set of article ids which have at least one section after filtering out unique sections
    articles_with_sections=set(load_json("../data/article_sections_filtered.json", object_hook=article_sections_object_hook).keys())

    # while in memory, we use category ids instead of their titles to save memory
    # (because the same category appears multiple times)
    category_title_to_id_map=get_category_title_to_id_map()
    category_id_to_title_map={v:k for k,v in category_title_to_id_map.items()}

    # we ignore "container categories", i.e. categories which are supposed to contain only subcategories
    # see https://en.wikipedia.org/wiki/Category:Container_categories
    # in the dataset from March 2021, there is one article in the category "Births" which is supposed
    # to be a container category
    # ignoring them prevents a minority of misfiled articles from gluing the majority of
    # categories into one single connected component
    container_categories=set()
    with open("../data/category_graph.tsv", encoding="utf8") as f:
        for line in f:
            split_line = line.split()
            category=split_line[0]
            parent_category=split_line[1]
            if parent_category=="Container_categories":
                container_categories.add(category_title_to_id_map[category])

    # we ignore categories which are not ontologically pure enough,
    # because these will not provide sections for recommendation
    pure_categories=set()
    # key: category id, value: articles which contribute to this category's section counts
    category_articles_wcn={}
    with open(wcnoutput_file, encoding="utf8") as f:
        for line in f:
            line_dict = json.loads(line)
            category_id=category_title_to_id_map[line_dict['category']]
            pure_categories.add(category_id)
            category_articles_wcn[category_id]=set(line_dict['articles'])

    # key: article id, value: list of categories to which article belongs
    article_categories={}
    with open(articles_categories_file, encoding="utf8") as f:
        for line in f:
            split_line = line.split()
            article_id = int(split_line[0])
            if article_id not in articles_with_sections:
                continue
            category_id=category_title_to_id_map[split_line[2]]
            if category_id not in pure_categories:
                continue
            if category_id in container_categories:
                continue
            article_categories.setdefault(article_id,[]).append(category_id)

    g=nx.Graph()
    for article_id,categories in tqdm(article_categories.items()):
        if len(categories)==1:
            g.add_node(categories[0])
        else:
            categoryA=categories[0]
            # because we only want to detect connected components, we do not need every pairwise edge:
            # linking all categories (B) of a given article to a single common category (A)
            # is sufficient
            for categoryB in categories[1:]:
                g.add_edge(categoryA,categoryB)

    # "w" (not "a+"): appending would duplicate every component each time this function is re-run,
    # and the community-splitting step would then process the same categories more than once
    with open("../data/categories_splitted_into_connected_components.json", "w", encoding="utf8") as f_out:
        for categories in tqdm(nx.connected_components(g)):
            category_names=[category_id_to_title_map[category_id] for category_id in categories]
            f_out.write(json.dumps({'categories':category_names})+"\n")

    return article_categories,category_title_to_id_map,category_id_to_title_map,category_articles_wcn

In [4]:
# Inputs of split_categories_into_connected_components (paths are relative to this notebook):
# - article/category assignments, one whitespace-separated record per line
articles_categories_file = "../data/article_categories_no_unknown_types.tsv"
# - JSON-lines file giving, for each retained category, the articles listed for it
wcnoutput_file = "../data/gini_articles_scores_0985_no_unknown_type.json"

In [5]:
# Build the connected components (written to disk by the function) and keep the lookup
# tables it returns for the following cells.
(
    article_categories,
    category_title_to_id_map,
    category_id_to_title_map,
    category_articles_wcn,
) = split_categories_into_connected_components(
    articles_categories_file,
    wcnoutput_file,
)

In [6]:
# Invert article_categories.
# key: category id, value: set of articles which belong to this category
category_articles = {}
for article, categories in tqdm(article_categories.items()):
    for category_id in categories:
        category_articles.setdefault(category_id, set()).add(article)

  0%|          | 0/1737697 [00:00<?, ?it/s]

In [7]:
# Invert category_articles_wcn.
# key: article_id, value: list of categories to which this article contributes with its sections
article_categories_wcn = {}
for category, articles in category_articles_wcn.items():
    for article in articles:
        article_categories_wcn.setdefault(article, []).append(category)

In [8]:
# key: article_id, value: sections in article
article_sections=load_json("../data/article_sections_filtered.json", object_hook=article_sections_object_hook)

# key: category_id, value: set of sections that will be included in recommendations for the given category
# The file is read inside a `with` block so the handle is closed as soon as parsing is done
# (a bare open(...).read() leaves closing to the garbage collector).
with open("../data/category_top_sections.json", encoding="utf8") as f_top_sections:
    category_top_sections={category_title_to_id_map[category]:set(sections) for category,sections in json.load(f_top_sections).items()}

In [9]:
# the criterion used to stop splitting categories into communities is the total number of section contents
# inside a given category community
# we will use some of the biggest numbers of section contents in the category that has the largest number
# of articles as the threshold
def get_nb_sentences_by_category():
    """Count, per category, the section contents that match the category's top sections.

    Reads the notebook-level dictionaries ``category_articles_wcn``, ``category_articles``,
    ``article_sections`` and ``category_top_sections``.

    Returns
    -------
    dict
        key: category id (only categories present in both ``category_articles_wcn`` and
        ``category_articles``), value: number of sections, summed over the category's
        contributing articles that have an entry in ``article_sections``, which belong to
        ``category_top_sections[category]``.
    """
    counts={}

    for category,articles in category_articles_wcn.items():
        if category in category_articles:
            # one unit per (article, section) occurrence whose section is a top section of the category
            counts[category]=sum(
                1
                for article in articles
                if article in article_sections
                for section in article_sections[article]
                if section in category_top_sections[category]
            )

    return counts

In [10]:
# Per-category totals of section contents, used below to pick a size threshold.
nb_sentences_by_category = get_nb_sentences_by_category()

In [11]:
# Show the 50 categories with the most section contents, largest first.
for category, nb_sentences in Counter(nb_sentences_by_category).most_common(50):
    print(f"{category_id_to_title_map[category]}: {nb_sentences} sentences")

American_compositions_and_recordings: 219266 sentences
Rock_albums: 143489 sentences
Albums_by_American_artists: 137795 sentences
Intellectualism: 134236 sentences
2nd-millennium_deaths: 127792 sentences
Sportsmen: 125770 sentences
21st-century_albums: 100620 sentences
American_sportsmen: 92968 sentences
21st-century_songs: 88096 sentences
Populated_places_established_in_the_2nd_millennium: 82493 sentences
Works_set_in_outer_space: 81198 sentences
Films_set_in_outer_space: 78349 sentences
Solar_System_in_film: 77653 sentences
19th-century_births: 77158 sentences
Earth_in_film: 77154 sentences
20th-century_deaths: 75480 sentences
Works_about_cities: 74196 sentences
Pop_songs: 69435 sentences
Rock_albums_by_American_artists: 68780 sentences
21st-century_actors: 66235 sentences
American_mass_media_people: 64842 sentences
American_songs: 64149 sentences
Association_football_midfielders: 63057 sentences
20th-century_writers: 61032 sentences
21st-century_women: 60211 sentences
Local_politici

In [12]:
# The per-category counts were only needed to print the ranking; unbind the name so the
# dictionary can be reclaimed (presumably to save memory - no later visible cell uses it).
del nb_sentences_by_category

In [13]:
def get_sum_nb_sentences(category_ids):
    """Count the section contents held by a group of categories taken together.

    Parameters
    ----------
    category_ids : iterable of category ids
        Iterated twice, so it must be a re-iterable collection (list, set, ...).

    Returns
    -------
    int
        Number of (article, section) occurrences where the article contributes to at least
        one of the categories (``category_articles_wcn``), has an entry in ``article_sections``,
        and the section is a top section of at least one of the categories
        (``category_top_sections``). An article shared by several categories is counted once.
    """
    # articles which contribute to the section counts of at least one of the categories
    contributing_articles=set().union(*(category_articles_wcn[category_id] for category_id in category_ids))
    # sections recommended by at least one of the categories
    candidate_sections=set().union(*(category_top_sections[category_id] for category_id in category_ids))

    return sum(
        1
        for article in contributing_articles
        if article in article_sections
        for section in article_sections[article]
        if section in candidate_sections
    )

In [14]:
# These categories seem to share the same context among the categories having the biggest numbers
# of section contents. We therefore group them together and use the number of section contents
# inside this "manually grouped" community as the maximum allowed size of a community.
# If at notebook 4 your GPU does not have enough memory, rerun notebooks 2 and 3 after decreasing max_nb_sentences.
categories = []
for category in (
    'American_compositions_and_recordings',
    'Albums_by_American_artists',
    'Rock_albums_by_American_artists',
    'Rock_albums',
    '21st-century_albums',
    '21st-century_songs',
    'Pop_songs',
    'American_songs',
    'Rock_songs',
    '20th-century_songs',
    'Jazz_albums',
):
    categories.append(category_title_to_id_map[category])
max_nb_sentences = get_sum_nb_sentences(categories)
print(max_nb_sentences)

433150


In [15]:
def build_graph(categories,category_articles_dict,article_categories_dict):
    """Build a weighted category graph restricted to ``categories``.

    Parameters
    ----------
    categories : collection of category ids
        Categories of the current connected component/community (used for membership tests).
    category_articles_dict : dict
        key: category id, value: set of article ids.
    article_categories_dict : dict
        key: article id, value: list of category ids.

    Returns
    -------
    networkx.Graph
        Nodes are category ids. An edge joins two categories listed together by at least one
        article; its ``'weight'`` is the number of such articles divided by the size of the
        union of the two categories' article sets, i.e. the Jaccard similarity.
    """
    graph=nx.Graph()

    # every article which belongs to at least one category of the current component/community
    member_articles=set().union(*(category_articles_dict[category_id] for category_id in categories))
    for article_id in member_articles:
        shared=[category_id for category_id in article_categories_dict[article_id] if category_id in categories]
        if len(shared)==1:
            graph.add_node(shared[0])
            continue
        # visit each unordered pair exactly once (upper triangle, diagonal excluded):
        #   A B C
        # A   X X
        # B     X
        # C
        for position,cat_a in enumerate(shared):
            for cat_b in shared[position+1:]:
                edge_attrs=graph.get_edge_data(cat_a,cat_b)
                if not edge_attrs:
                    graph.add_edge(cat_a,cat_b,weight=1)
                else:
                    # get_edge_data hands back the edge's own attribute dict, so this updates the graph
                    edge_attrs['weight']+=1

    # normalize the co-occurrence counts:
    # with article_categories/category_articles the weight is the probability that a randomly picked
    # article in either category belongs to both; with the *_wcn dictionaries it is the probability
    # that such an article contributes to the section counts of both categories
    for cat_a,cat_b in graph.edges:
        edge_attrs=graph.get_edge_data(cat_a,cat_b)
        union_size=len(category_articles_dict[cat_a].union(category_articles_dict[cat_b]))
        edge_attrs['weight']=edge_attrs['weight']/union_size

    return graph

In [16]:
def split(g,resolution,f_out,first=False):
    """Recursively split a category graph into communities small enough to be written out.

    The graph is partitioned with the Louvain algorithm. A community is written to ``f_out``
    (one JSON line with ``'community_id'`` and ``'categories'`` titles) when it holds at most
    ``max_nb_sentences`` section contents or consists of a single category. An oversized
    community of exactly two categories is written as two singleton communities. Any other
    oversized community is split again.

    Parameters
    ----------
    g : networkx.Graph
        Category graph whose edges carry a ``'weight'`` attribute.
    resolution : float
        Resolution passed to ``community_louvain.best_partition``.
    f_out : text file object
        Destination of the JSON lines.
    first : bool
        True for the first split of a connected component: oversized communities are then
        re-weighted with ``build_graph`` on the ``*_wcn`` dictionaries instead of reusing
        the edges of ``g``.

    Notes
    -----
    Reads the notebook-level ``max_nb_sentences``, ``category_id_to_title_map``,
    ``category_articles_wcn`` and ``article_categories_wcn``; increments the global
    ``community_id`` once per community written.
    """
    global community_id
    partition=community_louvain.best_partition(g,resolution=resolution)

    # key: community label given by Louvain, value: set of category ids in that community
    com_category_ids={}
    for category_id,com in partition.items():
        com_category_ids.setdefault(com,set()).add(category_id)

    for category_ids in com_category_ids.values():
        sum_nb_sentences_in_community=get_sum_nb_sentences(category_ids)

        if sum_nb_sentences_in_community>max_nb_sentences and len(category_ids)>1:
            if len(category_ids)==2:
                # two categories cannot be usefully partitioned further: emit each one on its own
                for category_id in category_ids:
                    f_out.write(json.dumps({'community_id':community_id,'categories':[category_id_to_title_map[category_id]]})+"\n")
                    community_id+=1
            else:
                new_resolution=resolution
                if first:
                    # after first split, detect communities by using as weight the probability
                    # that a randomly picked article in either catA or catB contributes to category
                    # section counts of both categories
                    g2=build_graph(category_ids,category_articles_wcn,article_categories_wcn)
                    # The entries of category_articles_wcn must NOT be deleted here: the recursive
                    # call below passes these same category ids to get_sum_nb_sentences, which
                    # looks them up in category_articles_wcn and would raise KeyError.
                else:
                    # keep only the edges of g which lie entirely inside the community
                    g2=nx.Graph()
                    for comA,comB in g.edges:
                        if comA in category_ids and comB in category_ids:
                            g2.add_edge(comA,comB,weight=g.get_edge_data(comA,comB)['weight'])

                    # if after 2 consecutive splits the communities remained the same,
                    # it means that we have to lower the resolution parameter of the louvain algorithm
                    # if we want to split further
                    # NOTE(review): nothing bounds how far the resolution can decrease - confirm termination
                    if len(g.nodes)==len(g2.nodes):
                        new_resolution=resolution-0.1
                split(g2,new_resolution,f_out)
        else:
            categories_in_community_names=[category_id_to_title_map[category_id] for category_id in category_ids]
            f_out.write(json.dumps({'community_id':community_id,'categories':categories_in_community_names})+"\n")
            community_id+=1

In [None]:
# Ids are handed out sequentially to every community written below
# (incremented in this cell and inside split()).
community_id=0
# The output is opened in "w" mode: community ids restart at 0 on every run, so appending
# to the result of a previous (possibly interrupted) run would mix stale lines and
# duplicate ids into the file.
# NOTE: this cell consumes category_articles (entries are deleted as components are processed),
# so the cell which builds category_articles has to be re-run before running this one again.
with open("../data/categories_splitted_into_communities.json", "w", encoding="utf8") as f_out:
    with open("../data/categories_splitted_into_connected_components.json", "r", encoding="utf8") as f_in:
        # total only sizes the progress bar; presumably the number of connected components
        # in the author's dataset - TODO confirm for other datasets
        for line in tqdm(f_in,total = 24061):
            json_line=json.loads(line)
            categories_in_connected_component=json_line['categories']

            # a component of at most two categories which is small enough is written as is
            if len(categories_in_connected_component)<=2 and get_sum_nb_sentences([category_title_to_id_map[category_name] for category_name in categories_in_connected_component])<=max_nb_sentences:
                f_out.write(json.dumps({'community_id':community_id,'categories':categories_in_connected_component})+"\n")
                community_id+=1
                continue
            # convert to ids to use less memory
            categories_in_connected_component=set([category_title_to_id_map[category_name] for category_name in categories_in_connected_component])

            # first split: detect communities by using as weight probability that a
            # randomly picked article in either catA or catB belongs to both categories
            g=build_graph(categories_in_connected_component,category_articles,article_categories)

            # each category belongs to exactly one connected component, so its article set
            # is no longer needed once the graph is built
            for category_id in categories_in_connected_component:
                del category_articles[category_id]

            split(g,1,f_out,first=True)

  0%|          | 0/24061 [00:00<?, ?it/s]