In [None]:
# Terminology: throughout this notebook, "sentences" refers to what the report calls "section contents".

In [2]:
%run ../utils/common.py

In [None]:
import heapq
import pathlib
import queue
from collections import Counter
from typing import List, Callable

import networkx as nx
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from torch import Tensor

In [3]:
def create_model(device):
    """Instantiate the sentence-embedding model on the requested device.

    :param device: Device specifier forwarded unchanged to ``SentenceTransformer``
        (this notebook passes ``"cuda"`` or ``"cpu"``).
    :return: A ``SentenceTransformer`` built from the 'paraphrase-distilroberta-base-v2' checkpoint name.
    """
    checkpoint_name = 'paraphrase-distilroberta-base-v2'
    encoder = SentenceTransformer(checkpoint_name, device=device)
    return encoder

In [4]:
def transform_to_embeddings(sentences,model):
    """Encode the given texts with ``model`` and return whatever ``model.encode`` produces.

    :param sentences: The texts to embed; passed to ``model.encode`` as its first positional argument.
    :param model: Any object exposing an ``encode`` method (a ``SentenceTransformer`` in this notebook).
    :return: The result of ``model.encode`` called with the progress bar disabled, a batch size of 32
        and ``convert_to_tensor=True``.
    """
    encode_options = {
        "show_progress_bar": False,
        "batch_size": 32,
        "convert_to_tensor": True,
    }
    return model.encode(sentences, **encode_options)

In [5]:
#source: https://github.com/UKPLab/sentence-transformers/blob/10e1599339de3cefaedce91967275310c4c5dd82/sentence_transformers/util.py#L128
# Modified version: a pair is kept only when its two sentences come from different sections AND from
# different articles, i.e. similar sentences that share a section (in different articles) or share an
# article (in different sections) are ignored.
def paraphrase_mining_embeddings(embeddings: Tensor,
                      sentence_idx_section_map,
                      sentence_idx_article_map,
                      query_chunk_size: int = 5000,
                      corpus_chunk_size: int = 100000,
                      max_pairs: int = 500000,
                      top_k: int = 100,
                      score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim,
                      min_score: float = 0.5):
    """
    Mine pairs of similar embeddings, keeping only pairs that cross both a section and an article boundary.

    Every embedding is scored against every other one (in chunks). For each row, the ``top_k`` best-scoring
    candidates of each corpus chunk are inspected and a candidate pair ``(i, j)`` is kept when ``i != j``, the
    two indices map to different sections, they map to different articles, and the score is strictly above
    the current threshold.

    NOTE(review): the section/article filter is applied *after* the per-row top-k selection, so a row whose
    best candidates all share its section or article contributes no pair for that chunk.

    :param embeddings: A tensor with the embeddings (one row per sentence).
    :param sentence_idx_section_map: Mapping from row index in ``embeddings`` to the section of that sentence.
    :param sentence_idx_article_map: Mapping from row index in ``embeddings`` to the article of that sentence.
    :param query_chunk_size: Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
    :param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
    :param max_pairs: Once this many candidates have been accepted, each new candidate evicts the currently
        lowest-scoring one. ``(i, j)`` and ``(j, i)`` are counted separately and de-duplicated only at the end,
        so the returned list can be shorter than ``max_pairs``.
    :param top_k: For each sentence, we retrieve up to top_k other sentences (per corpus chunk).
    :param score_function: Function for computing scores. By default, cosine similarity.
    :param min_score: Initial score threshold; only pairs scoring strictly above it are considered.
        Defaults to 0.5, the value that used to be hard-coded.
    :return: Returns a list of triplets with the format [score, id1, id2], highest scores first.
    """

    top_k += 1  # A sentence has the highest similarity to itself. Increase +1 as we are interested in distinct pairs

    # Min-heap of (score, i, j): the weakest retained candidate is always at the front.
    candidate_heap = []
    score_floor = min_score
    num_added = 0
    nb_embeddings = len(embeddings)

    for corpus_start_idx in range(0, nb_embeddings, corpus_chunk_size):
        corpus_chunk = embeddings[corpus_start_idx:corpus_start_idx+corpus_chunk_size]
        for query_start_idx in range(0, nb_embeddings, query_chunk_size):
            scores = score_function(embeddings[query_start_idx:query_start_idx+query_chunk_size], corpus_chunk)

            # The chunk may hold fewer than top_k columns, hence the min().
            scores_top_k_values, scores_top_k_idx = torch.topk(scores, min(top_k, len(scores[0])), dim=1, largest=True, sorted=False)
            scores_top_k_values = scores_top_k_values.cpu().tolist()
            scores_top_k_idx = scores_top_k_idx.cpu().tolist()

            for query_itr, (row_scores, row_indices) in enumerate(zip(scores_top_k_values, scores_top_k_idx)):
                i = query_start_idx + query_itr
                for score, corpus_itr in zip(row_scores, row_indices):
                    j = corpus_start_idx + corpus_itr

                    if (i != j
                            and sentence_idx_section_map[i] != sentence_idx_section_map[j]
                            and sentence_idx_article_map[i] != sentence_idx_article_map[j]
                            and score > score_floor):
                        num_added += 1
                        if num_added >= max_pairs:
                            # Budget reached: insert, evict the weakest entry and raise the threshold to its score.
                            score_floor = heapq.heappushpop(candidate_heap, (score, i, j))[0]
                        else:
                            heapq.heappush(candidate_heap, (score, i, j))

    # De-duplicate (i, j) / (j, i); walking in ascending tuple order keeps the same orientation
    # that popping the heap one entry at a time would keep.
    added_pairs = set()
    pairs_list = []
    for score, i, j in sorted(candidate_heap):
        pair_key = (i, j) if i < j else (j, i)
        if pair_key not in added_pairs:
            added_pairs.add(pair_key)
            pairs_list.append([score, i, j])

    # Highest scores first
    pairs_list.sort(key=lambda pair: pair[0], reverse=True)
    return pairs_list

In [6]:
# The section contents of all communities do not fit in memory at the same time, so the file is scanned
# once per call and the contents needed by several communities are collected in that single pass.
def get_article_section_sentences_for_communities(community_sections,community_articles):
    """Collect, in one pass over the section-contents file, the entries needed by a group of communities.

    :param community_sections: Mapping community id -> iterable of section names of that community.
    :param community_articles: Mapping community id -> iterable of article ids of that community.
    :return: Dict article_id -> list of section entries (dicts with their 'token_count' key removed).
        Only articles belonging to at least one of the communities are present, only sections belonging to
        at least one of the communities are kept, and for every section only its first entry is kept.
    """
    wanted_articles=set()
    for article_ids in community_articles.values():
        wanted_articles.update(article_ids)
    wanted_sections=set()
    for section_names in community_sections.values():
        wanted_sections.update(section_names)

    sentences_by_article={}
    with open("../data/article_section_grouped_sentences.json", "r") as f_in:
        for raw_line in f_in:
            record=json.loads(raw_line)
            article_id=record['article_id']
            if article_id in wanted_articles:
                relevant_entries=list(filter(lambda entry: entry['section'] in wanted_sections,
                                             record['section_grouped_sentences']))

                # dicts keep insertion order, so the first entry seen for a section is the one retained
                first_entry_by_section={}
                for entry in relevant_entries:
                    del entry['token_count']
                    first_entry_by_section.setdefault(entry['section'], entry)

                sentences_by_article[article_id]=list(first_entry_by_section.values())

    return sentences_by_article

In [7]:
# Prefer the GPU, but fall back to the CPU when CUDA is not available so the notebook
# still runs (more slowly) on machines without a usable GPU.
model=create_model("cuda" if torch.cuda.is_available() else "cpu")

I1128 23:34:39.012623 20776 SentenceTransformer.py:41] Load pretrained SentenceTransformer: paraphrase-distilroberta-base-v2
I1128 23:34:39.016591 20776 SentenceTransformer.py:45] Did not find folder paraphrase-distilroberta-base-v2
I1128 23:34:39.017087 20776 SentenceTransformer.py:51] Search model on server: http://sbert.net/models/paraphrase-distilroberta-base-v2.zip
I1128 23:34:39.018079 20776 SentenceTransformer.py:107] Load SentenceTransformer from folder: C:\Users\Sergiy/.cache\torch\sentence_transformers\sbert.net_models_paraphrase-distilroberta-base-v2


In [4]:
# Communities already handled by a previous run. The main loop below skips every community found here,
# which lets the notebook resume where it left off after being stopped and restarted.
# NOTE(review): get_processed_communities is not defined in this notebook — presumably it comes from
# ../utils/common.py and reads ../data/semantic_similarity/processed_communities; TODO confirm.
processed_communities=get_processed_communities()

In [9]:
# The progress bar advances once per community; 24694 is hard-coded and is presumably the total number
# of communities over all batches — TODO confirm.
pbar=tqdm(total=24694)
batch_id=0
with open("../data/community_article_and_sections_grouped_in_batches.json", "r") as f_in:
    for line in f_in:
        batch_id+=1

        print(f"batch {batch_id}")

        json_line=json.loads(line)
        community_sections={}
        community_articles={}

        for x in json_line['community_articles_grouped_by_batch']:
            community_articles[x['community_id']]=set(x['articles'])
        for x in json_line['community_sections_grouped_by_batch']:
            community_sections[x['community_id']]=set(x['sections'])

        # When resuming, a batch whose communities were all handled already needs no work: skip it
        # before paying for a full scan of the section-contents file.
        if all(community_id in processed_communities for community_id in community_sections):
            pbar.update(len(community_sections))
            continue

        article_section_sentences=get_article_section_sentences_for_communities(community_sections,community_articles)

        for community_id in community_sections:
            if community_id in processed_communities:
                pbar.update(1)
                continue

            sentences=[]
            # key: idx of sentence in "sentences" list, value: section from which this sentence is
            sentence_idx_section_map={}
            # key: idx of sentence in "sentences" list, value: article_id from which this sentence is
            sentence_idx_article_map={}
            for article_id in community_articles[community_id]:
                # articles absent from the section-contents file simply contribute nothing
                for section_sentence in article_section_sentences.get(article_id, []):
                    section=section_sentence['section']
                    if section in community_sections[community_id]:
                        idx=len(sentences)
                        sentences.append(section_sentence['sentence'])
                        sentence_idx_section_map[idx]=section
                        sentence_idx_article_map[idx]=article_id

            if len(sentences)==0:
                pbar.update(1)
                continue

            sentence_counter_by_section=Counter(sentence_idx_section_map.values())

            try:
                embeddings=transform_to_embeddings(sentences,model)
                paraphrases =paraphrase_mining_embeddings(embeddings,sentence_idx_section_map,sentence_idx_article_map)
            # When CUDA runs out of memory a RuntimeError is raised. This only happened before the number of
            # unique section contents inside each community was limited; the fallback is kept in case
            # somebody's GPU does not have enough memory. In that case rerun notebooks 2 and 3 after
            # decreasing max_nb_sentences in notebook 2.
            except RuntimeError as e:
                print(e)
                # redo this community on the CPU, then switch back to the GPU for the following ones
                model = create_model("cpu")
                embeddings=transform_to_embeddings(sentences,model)
                paraphrases =paraphrase_mining_embeddings(embeddings,sentence_idx_section_map,sentence_idx_article_map)
                model = create_model("cuda")

            if len(paraphrases)==0:
                pbar.update(1)
                continue

            community_dir=pathlib.Path(f"../data/semantic_similarity/community_{community_id}")
            community_dir.mkdir(parents=True, exist_ok=True)

            # The per-community files are opened with "w" (not "a+"): a community is only recorded as
            # processed after both files are written, so a run interrupted in between would otherwise
            # append a second copy of every row when the community is recomputed on restart.
            section_id_map={}
            with open(community_dir / "sentence_counter_by_section.json", "w") as f_out:
                # sections get ids 0, 1, 2, ... in decreasing order of their number of sentences
                for i,(section, nb_sentences) in enumerate(sentence_counter_by_section.most_common()):
                    section_id_map[section]=i
                    f_out.write(json.dumps({'section':section,'id':i,'nb_sentences':nb_sentences})+"\n")

            # key: sorted pair of section ids, value: scores of all similar sentence pairs between them
            section_pair_scores_dict={}
            for score, i, j in paraphrases:
                section_A=section_id_map[sentence_idx_section_map[i]]
                section_B=section_id_map[sentence_idx_section_map[j]]

                sorted_pair=tuple(sorted((section_A,section_B)))
                section_pair_scores_dict.setdefault(sorted_pair,[]).append(score)

            with open(community_dir / "similar_section_pairs.json", "w") as f_out:
                for (section_A_idx,section_B_idx),scores in section_pair_scores_dict.items():
                    mean_score=sum(scores)/len(scores)
                    nb_similar_sentences=len(scores)
                    f_out.write(json.dumps({'section_A':section_A_idx,'section_B':section_B_idx,'mean_score':mean_score,'nb_similar_sentences':nb_similar_sentences})+"\n")

            # Recorded last, so a community only counts as processed once its result files are complete.
            with open("../data/semantic_similarity/processed_communities", "a") as f_out:
                f_out.write(str(community_id)+"\n")

            pbar.update(1)

pbar.close()
        
        
                        

  0%|          | 0/24694 [00:00<?, ?it/s]

batch 1
batch 2
batch 3
batch 4
batch 5
