In [7]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from typing import List, Callable
from collections import Counter
import queue
import networkx as nx
from torch import Tensor
import torch
import pathlib

In [8]:
%run ../utils/common.py

In [9]:
def create_model(device):
    return SentenceTransformer('paraphrase-distilroberta-base-v2',device=device)
def transform_to_embeddings(sentences,model):
    return model.encode(sentences, show_progress_bar=True, batch_size=32, convert_to_tensor=True)
#source: https://github.com/UKPLab/sentence-transformers/blob/10e1599339de3cefaedce91967275310c4c5dd82/sentence_transformers/util.py#L128
# modified version, the goal is to ignore pairs of similar sentences which are from the same section but in different articles or from the same article but in different sections
def paraphrase_mining_embeddings(embeddings: Tensor,
                      sentence_idx_section_map,
                      sentence_idx_article_map,
                      query_chunk_size: int = 5000,
                      corpus_chunk_size: int = 100000,
                      max_pairs: int = 500000,
                      top_k: int = 100,
                      score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim):
    """
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.
    :param embeddings: A tensor with the embeddings
    :param query_chunk_size: Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
    :param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
    :param max_pairs: Maximal number of text pairs returned.
    :param top_k: For each sentence, we retrieve up to top_k other sentences
    :param score_function: Function for computing scores. By default, cosine similarity.
    :return: Returns a list of triplets with the format [score, id1, id2]
    """

    top_k += 1  # A sentence has the highest similarity to itself. Increase +1 as we are interest in distinct pairs

    # Mine for duplicates
    pairs = queue.PriorityQueue()
    min_score = 0.5
    num_added = 0

    for corpus_start_idx in range(0, len(embeddings), corpus_chunk_size):
        for query_start_idx in range(0, len(embeddings), query_chunk_size):
            scores = score_function(embeddings[query_start_idx:query_start_idx+query_chunk_size], embeddings[corpus_start_idx:corpus_start_idx+corpus_chunk_size])

            scores_top_k_values, scores_top_k_idx = torch.topk(scores, min(top_k, len(scores[0])), dim=1, largest=True, sorted=False)
            scores_top_k_values = scores_top_k_values.cpu().tolist()
            scores_top_k_idx = scores_top_k_idx.cpu().tolist()

            for query_itr in range(len(scores)):
                for top_k_idx, corpus_itr in enumerate(scores_top_k_idx[query_itr]):
                    i = query_start_idx + query_itr
                    j = corpus_start_idx + corpus_itr

                    if i != j and sentence_idx_section_map[i]!=sentence_idx_section_map[j] and sentence_idx_article_map[i]!=sentence_idx_article_map[j] and scores_top_k_values[query_itr][top_k_idx] > min_score:
                        pairs.put((scores_top_k_values[query_itr][top_k_idx], i, j))
                        num_added += 1

                        if num_added >= max_pairs:
                            entry = pairs.get()
                            min_score = entry[0]

    # Get the pairs
    added_pairs = set()  # Used for duplicate detection
    pairs_list = []
    while not pairs.empty():
        score, i, j = pairs.get()
        sorted_i, sorted_j = sorted([i, j])

        if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
            added_pairs.add((sorted_i, sorted_j))
            pairs_list.append([score, i, j])

    # Highest scores first
    pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True)
    return pairs_list
# the goal is to scan the file once for multiple communities
def get_article_section_sentences_for_communities(community_sections,community_articles):
    articles_to_get=set().union(*community_articles.values())
    sections_to_get=set().union(*community_sections.values())
    
    article_section_sentences={}
    with open("../data/article_section_grouped_sentences.json", "r") as f_in:
        for line in f_in:
            json_line=json.loads(line)
            article_id=json_line['article_id']
            if article_id not in articles_to_get:
                continue
            section_sentences=[section_sentence for section_sentence in json_line['section_grouped_sentences'] if section_sentence['section'] in sections_to_get]
            
            already_added_sections=set()
            section_first_sentences=[]
            for section_sentence in section_sentences:
                del section_sentence['token_count']
                if section_sentence['section'] not in already_added_sections:
                    section_first_sentences.append(section_sentence)
                already_added_sections.add(section_sentence['section'])

            article_section_sentences[article_id]=section_first_sentences
            
    return article_section_sentences

In [10]:
model=create_model("cuda")

I1219 20:09:27.595901 38468 SentenceTransformer.py:41] Load pretrained SentenceTransformer: paraphrase-distilroberta-base-v2
I1219 20:09:27.596893 38468 SentenceTransformer.py:45] Did not find folder paraphrase-distilroberta-base-v2
I1219 20:09:27.597885 38468 SentenceTransformer.py:51] Search model on server: http://sbert.net/models/paraphrase-distilroberta-base-v2.zip
I1219 20:09:27.598877 38468 SentenceTransformer.py:107] Load SentenceTransformer from folder: C:\Users\Sergiy/.cache\torch\sentence_transformers\sbert.net_models_paraphrase-distilroberta-base-v2


In [13]:
pbar=tqdm(total=24694)
batch_id=0
with open("../data/community_article_and_sections_grouped_in_batches.json", "r") as f_in:
    for line in f_in:
        batch_id+=1
        print(f"batch {batch_id}")
        if batch_id<2:
            continue
        
        
        json_line=json.loads(line)
        community_sections={}
        community_articles={}
        
        for x in json_line['community_articles_grouped_by_batch']:
            community_articles[x['community_id']]=set(x['articles'])
        for x in json_line['community_sections_grouped_by_batch']:
            community_sections[x['community_id']]=set(x['sections'])
        
        article_section_sentences=get_article_section_sentences_for_communities(community_sections,community_articles)
        
        for community_id in community_sections.keys():
            #if community_id !=67:
            #if community_id !=1019:
            if community_id !=1014:
                pbar.update(1)
                continue
            
            sentences=[]
            idx=0
            # key: idx of sentence in "sentences" list, value: section from which this sentence is
            sentence_idx_section_map={}
            # key: idx of sentence in "sentences" list, value: article_id from which this sentence is
            sentence_idx_article_map={}
            for article_id in community_articles[community_id]:
                if article_id in article_section_sentences.keys():
                    for section_sentence in article_section_sentences[article_id]:
                        section=section_sentence['section']
                        if section in community_sections[community_id]:
                            sentences.append(section_sentence['sentence'])
                            sentence_idx_section_map[idx]=section
                            sentence_idx_article_map[idx]=article_id
                            idx+=1
                        
            sentence_counter_by_section=Counter(sentence_idx_section_map.values())
            
            if len(sentences)==0:
                pbar.update(1)
                continue
            
            try:
                embeddings=transform_to_embeddings(sentences,model)
                paraphrases =paraphrase_mining_embeddings(embeddings,sentence_idx_section_map,sentence_idx_article_map)
            except RuntimeError as e:
                print(e)
                model = create_model("cpu")
                embeddings=transform_to_embeddings(sentences,model)
                paraphrases =paraphrase_mining_embeddings(embeddings,sentence_idx_section_map,sentence_idx_article_map)
                model = create_model("cuda")
            
            if len(paraphrases)==0:
                pbar.update(1)
                continue
            
            
            

                
            
            pbar.update(1)
            break
        
        
                        

  0%|          | 0/24694 [00:00<?, ?it/s]

batch 1
batch 2


Batches:   0%|          | 0/5845 [00:00<?, ?it/s]

batch 3


KeyboardInterrupt: 

In [15]:
for paraphrase in paraphrases:

    score, i, j = paraphrase

    section_A=sentence_idx_section_map[i]
    section_B=sentence_idx_section_map[j]
    if len(set([section_A,section_B]).intersection(set(['Plot','Story','Synopsis'])))>1:
        print(score)
        print(sentence_idx_article_map[i])
        print(section_A)
        print(sentences[i])
        print()
        print(sentence_idx_article_map[j])
        print(section_B)
        print(sentences[j])
        print("---")

0.9644277691841125
1678684
Plot
When the evil Black Knight terrorizes the townspeople, Prince Duncan decides to topple his throne, but in order to do that, he must travel to the four sections of the castle: Fireball, Shield, Trouble and Black Knight. After collecting the Fireball and Shield, Duncan makes his way to the Black Knight's throne room, where he topples the Black Knight's throne, and the Black Knight stands up shaking his fist, as a gargoyle takes Duncan to Trouble 3.

6274929
Story
The evil Black Knight terrorizes the townspeople, our hero Prince Duncan decides to topple his throne, but in order to do that, he must travel to the four sections of the castle: Fireball, Shield, Trouble, and Black Knight. After collecting the Fireball, and Shield, Duncan makes his way to The Black Knight's Throne room, where he topples the Black Knight's Throne. On Novice, Beginner, and Intermediate the Black Knight stands up shaking his fist, as a gargoyle takes Duncan to Trouble 3. On advanced

0.7457394599914551
548259
Plot
In 1887, Transylvania, Dr. Victor Frankenstein, with help from his assistant Igor and Count Dracula, successfully creates a monster. Dracula, a vampire, kills Frankenstein to use the creature for his own purposes. As an angry mob storms Castle Frankenstein, the monster flees to a windmill with his dead creator. The mob burn down the windmill, apparently destroying the monster. One year later, monster hunter Van Helsing kills Mr. Hyde after a brawl in Notre-Dame de Paris. Van Helsing suffers from amnesia, slaying evil on behalf of the Vatican City, hoping that he will earn redemption for forgotten sins. He is tasked by Cardinal Jinette to go to Transylvania and destroy Dracula. He must also protect the last members of an ancient Romanian family, the Valerious, whose ancestor vowed that his descendants would kill Dracula, or fall into Purgatory. He receives a torn parchment, reading "In the name of God, open this door" in Latin. Van Helsing travels to Trans

0.71934574842453
2172792
Plot
The story is set in the year 2704, when the Alliance of Space-Faring Alien Races (ASFAR), of which Earth is a member, suddenly turns against Earth and their fleet ravages the planet, starting a war. The player flies a powerful starfighter, the TV-202, in a series of missions to defeat the enemy. In Episode 3, the player learns that a huge supercomputer known as Xenocidic Initiative (X.I. ), located on Proxima Seven, is responsible for the war. Their final mission is to eliminate it. A hidden mission can take place after the main plot only in the CD ROM version where the player must investigate a sudden metamorphosis of an unknown nearby planet and destroy the force that changed the face of the planet. It is revealed here that this force drove a man named Sy Wickens into insanity, and how the X.I. Supercomputer had "accidentally" digitized Sy Wickens' persona.

22007284
Synopsis
In the year 2049, the Earth moon base Yaz 67 is destroyed by an alien battlecru

0.7015678882598877
26745297
Plot
Avalon is set in Britain in the year 408, during the collapse of the Western Roman Empire. The player controls Maroc, a "lore-seeker" who has been given a staff and map by a strange old woman and pointed in the direction of a place called Glass Hill on the isle of Avalon, where a quest to defeat the Lord of Chaos begins. The name "Avalon" is taken from King Arthur's legendary resting place, the isle of Avalon, while a figure named Avallach features in Welsh mythology. Other than this and the time period the game is set in, there is little connection to the Arthurian legend.

43948986
Synopsis
Setting and characters. The Legend of Legacy takes place entirely on the island of Avalon, which was discovered ten years before the game's opening and identified with a legendary lost continent whose inhabitants walked alongside gods; the island, while now mostly wild and dominated by monsters, housed ruins crafted with the assistance of natural spirits dubbed "El

0.9505953192710876
https://en.wikipedia.org/wiki/Love!_Valour!_Compassion!
1580900
Plot
The setting is at a lakeside summer vacation house in Dutchess County, two hours north of New York City where eight gay friends spend the three major holiday weekends of one summer together for Memorial Day, Independence Day, and Labor Day. The house belongs to Gregory, a successful Broadway choreographer now approaching middle age, who fears he is losing his creativity; and his twenty-something lover, Bobby, a legal assistant who is blind. Each of the guests at their house is connected to Gregory's work in one way or another - Arthur and longtime partner Perry are business consultants; John Jeckyll, a sour Englishman, is a dance accompanist; die-hard musical theater fanatic Buzz Hauser is a costume designer and the most stereotypically gay man in the group. Only John's summer lover, Ramon, and John's twin brother James are outside the circle of friends. But Ramon is outgoing and eventually makes a place for himself in the group, and James is such a gentle soul that he is quickly welcomed.

https://en.wikipedia.org/wiki/Love!_Valour!_Compassion!_(film)
18168131
Synopsis
The story of eight gay male friends who spend the three major holiday weekends of one summer (Memorial Day, the Fourth of July, and Labor Day) together at a lakeside house in Dutchess County, New York in the mid 1990s. The house belongs to Gregory, a successful Broadway choreographer now approaching middle age, who fears he is losing his creativity, and his twenty-something lover Bobby, a legal assistant who is blind. Each of the guests at their house is connected to Gregory's work in one way or another. Arthur and his longtime partner Perry are business consultants; John Jeckyll, a sour and promiscuous Englishman, is a dance accompanist; and die-hard musical theater fanatic Buzz Hauser is a costume designer and the most stereotypically gay man in the group. Only John's summer lover Ramon and twin brother James are outside the circle of friends. Ramon is outgoing and eventually makes a place for himself in the group, while James is such a gentle soul that he is quickly welcomed. Infidelity, flirting, AIDS, skinny-dipping, truth-telling, and soul-searching mix questions about life and death with a dress rehearsal for Swan Lake performed in drag.

0.6953703761100769
https://en.wikipedia.org/wiki/Nurse_Edith_Cavell
30816626
Plot
The story follows the broadly true story of Edith Cavell who went to German-occupied Brussels after the onset of the First World War. Edith hides the young Frenchman Jean Rappard, but is suspected of this and her hospital is inspected by German troops at regular intervals. Jean is put on a canal barge and despite being searched at the border escapes successfully. Back in Brussels a firing squad executes a dozen escaped prisoners who were caught in the woods. Edith and albert go to try to find wounded on a battlefield near the woods and bring back four British men including Pt. Bungey of the Buffs. They are hidden in the hospital in a secret room accessed through a wardrobe in the basement boiler room. The Countess goes to the cobbler to organise their safe transportation. Meanwhile Edith also tends the young dying Germans in the main hospital. A further three Frenchmen are sent to the border by barge with Mme Moulin. An alleged escaped French PoW arrives at the Countess's mansion. The Countess is suspicious due to his accent and locks him in the kitchen whilst informing the German authorities. The hospital is also being watched. Nevertheless the numbers increase... but they include Wilhelm Schultz of the German military intelligence.

https://en.wikipedia.org/wiki/The_Martyrdom_of_Nurse_Cavell
32880476
Synopsis
The story is told in four parts. The film starts at the English home of Edith Cavell before the war, then jumps forward six years to a Belgium hospital, where Cavell is working. The war is about to start and Dr Schultz suggests Nurse Cavell return home but she refuses, saying her place is with the sick. She gets an invitation to the wedding of two friends, Lt Renard and Yvonne Loudet. Herr Cries is also invited; he pretends to be a medical student but is in fact a foreign spy and is a rejected suitor of Yvonne. He forces himself on her but Lt Renard knocks him out and Cries departs, swearing vengeance. The wedding ends when everyone gets news that war has been declared and Renard goes to military headquarters. Four months later Brussels has been occupied by the Germans and Cavell is tending wounded British, German and Belgium soldiers. Lt Renard has been captured and imprisoned by the Germans. He makes an escape with the help of friends and visits his wife and parents. Yvonne asks Nurse Cavell to help them escape the country.