In [13]:
%run ../utils/common.py

In [2]:
from pymongo import MongoClient
# MongoClient() is called without arguments, so PyMongo's default connection
# settings are used; every collection below lives in the `wikipedia` database.
client = MongoClient()
db = client["wikipedia"]
from collections import Counter


In [3]:
# Lookup from category title to category id; later cells index it by title to
# attach a `category_id`/`child_id` to the documents they insert.
# NOTE(review): get_category_title_to_id_map is not defined in this notebook;
# presumably it is provided by ../utils/common.py through the %run above -- confirm.
category_title_to_id_map=get_category_title_to_id_map()

In [4]:
# Read the per-category recommendations (one JSON object per line) into
# {category: {section: probability}}. Categories whose `recs` list is empty
# are left out of the mapping.
category_section_counts={}
with open("../data/results/gini_threshold-0985_no_unknown_types-category_section_counts_with_test_set/recs_by_category_top30.json") as recs_file:
    for raw_line in recs_file:
        record = json.loads(raw_line)
        category = record['category']
        recs = record['recs']
        if len(recs) > 0:
            category_section_counts[category] = {
                rec['section']: rec['probability'] for rec in recs
            }

In [17]:

# Store one document per category: its title, a space-separated variant of the
# title for text search, its id, and the list of (section, probability) recs.
for category, section_probabilities in tqdm(category_section_counts.items()):
    category_id = category_title_to_id_map[category]
    rec_entries = [
        {'section': section, 'probability': probability}
        for section, probability in section_probabilities.items()
    ]
    db.category_section_counts.insert_one({
        'category': category,
        'category_search': category.replace("_", " "),
        'category_id': category_id,
        'recs': rec_entries,
    })
# Plain indexes on the id and title, plus a text index on the searchable title.
db.category_section_counts.create_index('category_id')
db.category_section_counts.create_index('category')
db.category_section_counts.create_index([('category_search', 'text')])

  0%|          | 0/779163 [00:00<?, ?it/s]

'category_search_text'

In [18]:
# parent_children: parent category -> set of its child categories.
# Each line of the graph file is split on whitespace; field 0 is the child
# and field 1 is the parent.
parent_children={}

with open("../data/category_graph_filtered_DAG.tsv") as edges_file:
    for edge_line in edges_file:
        fields = edge_line.split()
        child, parent = fields[0], fields[1]
        parent_children.setdefault(parent, set()).add(child)

In [19]:
# Set of category titles that have at least one recommendation loaded above.
categories_with_section_counts = set(category_section_counts)

In [20]:
# Store every (child, parent) edge of the category graph, annotated with:
#   child_has_section_counts - the child itself has recommendations
#   child_is_parent          - at least one of the child's own children has recommendations
with open("../data/category_graph_filtered_DAG.tsv") as edges_file:
    for edge_line in edges_file:
        fields = edge_line.split()
        child, parent = fields[0], fields[1]

        child_has_section_counts = child in categories_with_section_counts

        grandchildren = parent_children.get(child)
        child_is_parent = (
            grandchildren is not None
            and not categories_with_section_counts.isdisjoint(grandchildren)
        )

        child_id = category_title_to_id_map[child]
        db.category_graph.insert_one({
            'child_is_parent': child_is_parent,
            'child_has_section_counts': child_has_section_counts,
            'child': child,
            'parent': parent,
            'child_id': child_id,
        })

In [21]:
# Single-field indexes on the `parent` and `child` fields of category_graph.
# The last call's return value (the index name) is echoed as the cell output.
db.category_graph.create_index('parent')
db.category_graph.create_index('child')

'child_1'

In [14]:
# Collection of community ids; later cells iterate over it and use it for
# membership tests to skip communities that are not in it.
# NOTE(review): get_processed_communities is not defined in this notebook;
# presumably it is provided by ../utils/common.py through %run -- confirm.
processed_communities=get_processed_communities()

In [7]:
# For every community listed in processed_communities, remember its categories
# in community_categories and store one {community_id, category_id} document
# per category. Communities not in processed_communities are skipped.
community_categories={}
with open("../data/categories_splitted_into_communities.json", "r") as communities_file:
    for raw_line in communities_file:
        community = json.loads(raw_line)
        community_id = community['community_id']
        if community_id in processed_communities:
            categories = community['categories']
            community_categories[community_id] = categories
            for category in categories:
                db.category_community.insert_one({
                    'community_id': community_id,
                    'category_id': category_title_to_id_map[category],
                })

In [12]:
# Single-field index on `category_id` of category_community.
db.category_community.create_index("category_id")

'category_id_1'

In [5]:
# Later cells index article_sections by article and iterate over / take len()
# of the value, i.e. it maps an article to its sequence of sections.
# NOTE(review): load_json and article_sections_object_hook are not defined in this
# notebook; presumably they come from ../utils/common.py -- confirm what the hook does.
article_sections=load_json("../data/article_sections_filtered.json", object_hook=article_sections_object_hook)

In [6]:
# category_articles: category -> list of its articles, restricted to the
# articles that have an entry in article_sections.
category_articles={}
known_articles = article_sections.keys()
with open("../data/gini_articles_scores_0985_no_unknown_type.json") as scores_file:
    for raw_line in scores_file:
        record = json.loads(raw_line)
        category = record['category']
        category_articles[category] = [
            article for article in record['articles'] if article in known_articles
        ]

In [8]:
# For each community, collect the articles that contain each section, keeping
# only sections listed in that community's sentence_counter_by_section.json,
# and store one {community_id, section, articles} document per section.
for community_id in tqdm(processed_communities):
    id_section_map = {}
    with open(f"../data/semantic_similarity/community_{community_id}/sentence_counter_by_section.json", "r") as counter_file:
        for raw_line in counter_file:
            entry = json.loads(raw_line)
            id_section_map[entry['id']] = entry['section']

    # Inverted view; only its keys (the known sections) are used below.
    section_id_map = {section: section_id for section_id, section in id_section_map.items()}

    section_articles = {}
    for category in community_categories[community_id]:
        for article in category_articles[category]:
            for section in article_sections[article]:
                if section in section_id_map:
                    section_articles.setdefault(section, set()).add(article)

    for section, articles in section_articles.items():
        db.section_articles_by_community.insert_one({
            'community_id': community_id,
            'section': section,
            'articles': list(articles),
        })
    

  0%|          | 0/7917 [00:00<?, ?it/s]

In [9]:
# Single-field indexes on `community_id` and `section` of section_articles_by_community.
db.section_articles_by_community.create_index('community_id')
db.section_articles_by_community.create_index('section')

'section_1'

In [13]:
# For each processed community, import two line-delimited JSON files into MongoDB:
#   sentence_counter_by_section.json -> sentence_counter_by_section_by_community
#   similar_section_pairs.json       -> semantic_similar_section_pairs
# Every record is tagged with its community_id before insertion.
#
# The id->section map and the section->articles map that used to be rebuilt here
# were never read in this cell (the section_articles_by_community cell computes
# and stores them), so that per-community triple loop has been removed.
for community_id in tqdm(processed_communities):
    community_dir = f"../data/semantic_similarity/community_{community_id}"

    with open(f"{community_dir}/sentence_counter_by_section.json", "r") as f_in:
        for line in f_in:
            json_line = json.loads(line)
            json_line['community_id'] = community_id
            db.sentence_counter_by_section_by_community.insert_one(json_line)

    with open(f"{community_dir}/similar_section_pairs.json", "r") as f_in:
        for line in f_in:
            json_line = json.loads(line)
            json_line['community_id'] = community_id
            db.semantic_similar_section_pairs.insert_one(json_line)


  0%|          | 0/7917 [00:00<?, ?it/s]

In [14]:
# Single-field indexes on semantic_similar_section_pairs: the community id,
# the `mean_score` field and both section fields of a pair.
db.semantic_similar_section_pairs.create_index('community_id')
db.semantic_similar_section_pairs.create_index('mean_score')
db.semantic_similar_section_pairs.create_index('section_A')
db.semantic_similar_section_pairs.create_index('section_B')

'section_B_1'

In [15]:
# Single-field indexes on `community_id` and `section` of
# sentence_counter_by_section_by_community.
db.sentence_counter_by_section_by_community.create_index('community_id')
db.sentence_counter_by_section_by_community.create_index('section')

'section_1'

In [7]:

# For every category, record where each recommended section appears inside the
# category's articles. A section at 0-based position i in an article with n
# sections contributes i/n (beginning) and (i+1)/n (end). Per (category, section)
# the number of occurrences and the two sums are stored.
for category,recs in tqdm(category_section_counts.items()):
    category_id = category_title_to_id_map[category]
    top_sections = set(recs)

    begin_positions = {}
    end_positions = {}
    for article in category_articles[category]:
        sections = article_sections[article]
        section_count = len(sections)
        for position, section in enumerate(sections):
            if section not in top_sections:
                continue
            begin_positions.setdefault(section, []).append(position / section_count)
            end_positions.setdefault(section, []).append((position + 1) / section_count)

    for section, begins in begin_positions.items():
        db.section_ordering.insert_one({
            'category_id': category_id,
            'section': section,
            'number_values': len(begins),
            'sum_values_beginning': sum(begins),
            'sum_values_end': sum(end_positions[section]),
        })

  0%|          | 0/779163 [00:00<?, ?it/s]

In [8]:
# Single-field indexes on `category_id` and `section` of section_ordering.
db.section_ordering.create_index('category_id')
db.section_ordering.create_index('section')

'section_1'

In [10]:
# For every category that has recommendations, store a histogram of article
# lengths, where an article's length is its number of sections.
for category,articles in tqdm(category_articles.items()):
    if category in category_section_counts:
        nb_sections_counter = Counter(
            len(article_sections[article])
            for article in articles
            if article in article_sections.keys()
        )
        category_id = category_title_to_id_map[category]
        length_histogram = [
            {'length': length, 'count': count}
            for length, count in nb_sections_counter.items()
        ]
        db.category_article_lengths.insert_one({
            'category_id': category_id,
            'article_lengths': length_histogram,
        })

  0%|          | 0/1447449 [00:00<?, ?it/s]

In [11]:
# Single-field index on `category_id` of category_article_lengths.
db.category_article_lengths.create_index('category_id')

'category_id_1'

In [7]:
# article_categories: article -> list of its categories that have recommendations.
# Each line is split on whitespace; field 1 is the article and field 2 the category.
# An article gets an (initially empty) entry even when none of its categories qualify.
article_categories={}
with open("../data/article_categories_no_unknown_types.tsv") as pairs_file:
    for pair_line in pairs_file:
        fields = pair_line.split()
        article, category = fields[1], fields[2]
        categories_of_article = article_categories.setdefault(article, [])
        if category in category_section_counts:
            categories_of_article.append(category)

In [8]:
# Store every article that has at least one category with recommendations,
# keyed by a space-separated variant of its title for text search.
for article_title, article_cats in tqdm(article_categories.items()):
    if not article_cats:
        continue
    db.article_categories.insert_one({
        'categories': article_cats,
        'article_search': article_title.replace("_", " "),
    })
# Text index on the searchable title.
db.article_categories.create_index([('article_search', 'text')])

  0%|          | 0/2048191 [00:00<?, ?it/s]

'article_search_text'

In [None]:
# Import each community's cosine-similarity thresholds (one JSON object per line),
# tagging every record with its community_id.
for community_id in processed_communities:
    thresholds_path = f"../data/semantic_similarity/community_{community_id}/cosine_sim_thresholds.json"
    try:
        with open(thresholds_path, "r") as thresholds_file:
            for raw_line in thresholds_file:
                db.cosine_sim_thresholds_by_community.insert_one(
                    {**json.loads(raw_line), 'community_id': community_id}
                )
    # A missing file is expected for some communities: it can occur if there is
    # only a single pair of similar sentences. Such communities are skipped.
    except FileNotFoundError:
        continue

In [None]:
# Single-field indexes on `community_id` and `semantic_filtering_level` of
# cosine_sim_thresholds_by_community.
db.cosine_sim_thresholds_by_community.create_index('community_id')
db.cosine_sim_thresholds_by_community.create_index('semantic_filtering_level')