In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import spacy
import pytextrank

In [2]:
# Vocabulary size passed as `max_tokens` to the TextVectorization layers built below.
VOCAB_SIZE = 5_000
# Fixed output length passed as `output_sequence_length` to the same layers.
SEQUENCE_LENGTH = 1_200

Brass set filtering described in the paper. Episodes matching any of the following criteria are removed:
1) descriptions that are very long (> 750 characters) or short (< 20 characters)
2) descriptions with high lexical overlap (over 40%) with their show description
3) descriptions with high lexical overlap (over 50%) with other episode descriptions

In [3]:
def brass_length(dataset, tokenizer, upp_bound=750, low_bound=20):
    """
    Drops the podcasts whose description is too long or too short and returns the dataset.

    The length of a description is the number of non-zero ids produced by `tokenizer`
    (zero entries are ignored), i.e. a token count, not a character count.
    NOTE(review): the Brass-set criterion quoted in this notebook is expressed in
    characters (> 750 or < 20) - confirm that counting tokens is what is wanted.

    Rows are visited through the index labels of the dataframe, so the function also
    works when the index is not the default 0..n-1 range (for instance after another
    filter already dropped some rows).

    Parameters:
        - dataset: pandas dataframe with an 'episode_description' column; modified in place
        - tokenizer: callable mapping a description to a tensor of token ids (exposing `.numpy()`)
        - upp_bound: maximum number of non-zero tokens allowed in the description
        - low_bound: minimum number of non-zero tokens required in the description
    Returns:
        - the same dataframe object, without the rows outside [low_bound, upp_bound]
    """
    labels_to_drop = []
    for label, description in dataset['episode_description'].items():
        token_ids = tokenizer(description).numpy()
        n_tokens = len(np.nonzero(token_ids)[0])
        if n_tokens > upp_bound or n_tokens < low_bound:
            labels_to_drop.append(label)

    # drop once, after the scan, so the frame is never mutated while being iterated
    dataset.drop(index=labels_to_drop, inplace=True)
    return dataset

# Smoke test for brass_length: with the default bounds, the second description has
# fewer than 20 tokens, so only the first row is expected to survive.
data = [
    [
        "transcript one",
        "this is the first summary of a podcast. I need to reach at least twenty "
        "characters. It is not enough, I need to add other characters.",
    ],
    ["transcript 2", "too short, it must be removed"],
]

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)
# the vocabulary is learnt on every string of the toy dataset
vectorizer.adapt(np.asarray(data).ravel())

df = pd.DataFrame(data=data, columns=["episode_transcript", "episode_description"])

brass_length(df, vectorizer)


Unnamed: 0,episode_transcript,episode_description
0,transcript one,this is the first summary of a podcast. I need...


In [38]:
def overlapping_with_threshold(sentence_one, sentence_two, tokenizer, threshold):
    """
    Returns whether two sentences have an overlap over threshold.

    Both sentences are tokenized and their zero entries discarded. The shorter token
    sequence is then slid over the longer one; for each alignment the overlap is the
    number of positions holding the same token id divided by the length of the LONGER
    sequence. The sentences overlap as soon as one alignment is strictly above threshold.

    Parameters:
        - sentence_one: first sentence
        - sentence_two: second sentence
        - tokenizer: callable mapping a sentence to a tensor of token ids (exposing `.numpy()`)
        - threshold: fraction used to decide whether there is an overlapping sequence or not
    Returns:
        - True whether the sentences overlap, False otherwise (including when neither
          sentence contains any token)
    """
    ids_one = tokenizer(sentence_one).numpy()
    ids_two = tokenizer(sentence_two).numpy()
    # keep only the meaningful (non-zero) tokens
    tokens_one = ids_one[np.nonzero(ids_one)[0]]
    tokens_two = ids_two[np.nonzero(ids_two)[0]]

    if len(tokens_one) > len(tokens_two):
        short_sen, long_sen = tokens_two, tokens_one
    else:
        short_sen, long_sen = tokens_one, tokens_two

    # neither sentence has tokens: nothing can overlap, and the ratios below
    # would otherwise divide by zero
    if len(long_sen) == 0:
        return False

    # at most len(short_sen) positions can match, so the overlap can never exceed
    # this ratio: it is possible to be sure it will not overlap
    if len(short_sen) / len(long_sen) <= threshold:
        return False

    # comparison of all possible alignments of the short sentence inside the long one
    window = len(short_sen)
    for start in range(len(long_sen) - window + 1):
        matches = np.count_nonzero(long_sen[start:start + window] == short_sen)
        if matches / len(long_sen) > threshold:
            return True
    return False

In [39]:
def brass_overlap_show(dataset, tokenizer, percentage=0.4):
    """
    Delete from the dataset podcast having a description which overlaps over percentage the show description.

    Each row's 'episode_description' is compared with its own 'show_description' through
    `overlapping_with_threshold`. Rows are visited through the index labels of the
    dataframe, so the function also works when the index is not the default 0..n-1 range
    (for instance after another filter already dropped some rows).

    Parameters:
        - dataset: pandas dataframe with 'episode_description' and 'show_description'
          columns; modified in place
        - tokenizer: tokenizer to use for tokenize the description and the show description
        - percentage: threshold to discard the podcast
    Returns:
        - the same dataframe object, without the overlapping rows
    """
    rows = zip(dataset.index, dataset['episode_description'], dataset['show_description'])
    labels_to_drop = [
        label
        for label, episode_description, show_description in rows
        if overlapping_with_threshold(episode_description, show_description, tokenizer, percentage)
    ]
    # drop once, after the scan, so the frame is never mutated while being iterated
    dataset.drop(index=labels_to_drop, inplace=True)
    return dataset

# Smoke test for brass_overlap_show: the first episode description repeats its show
# description and is expected to be removed, the second one is expected to be kept.
data = [
    ["this is gonna be an overlapping", "this is gonna be an overlapping, trust me"],
    [
        "this will not be an overlapping",
        "this will not be an overlapping sentence because the content is much different between the two descriptions",
    ],
]

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)
# the vocabulary is learnt on every string of the toy dataset
vectorizer.adapt(np.asarray(data).ravel())

df = pd.DataFrame(data=data, columns=["show_description", "episode_description"])

brass_overlap_show(df, vectorizer)

Unnamed: 0,show_description,episode_description
1,this will not be an overlapping,this will not be an overlapping sentence becau...


In [41]:
def brass_overlap_descriptions(dataset, tokenizer, percentage=0.5):
    """
    Delete from the dataset podcast having a description which overlaps over percentage another description.

    Descriptions are compared pairwise through `overlapping_with_threshold`. When two
    descriptions overlap, the earlier row is kept and the later one is removed; a removed
    row is never used as a reference for further comparisons. Rows are addressed by
    position during the scan and dropped by index label at the end, so the function also
    works when the index is not the default 0..n-1 range.

    Parameters:
        - dataset: pandas dataframe with an 'episode_description' column; modified in place
        - tokenizer: tokenizer to use for tokenize the descriptions
        - percentage: threshold to discard the podcast
    Returns:
        - the same dataframe object, without the overlapping rows
    """
    labels = list(dataset.index)
    descriptions = dataset['episode_description'].tolist()

    removed_positions = set()
    for i, reference in enumerate(descriptions):
        # if the podcast has been already excluded, do not consider it
        if i in removed_positions:
            continue
        for j in range(i + 1, len(descriptions)):
            if j in removed_positions:
                continue
            # the helper already applies the threshold and answers with a boolean
            if overlapping_with_threshold(reference, descriptions[j], tokenizer, percentage):
                removed_positions.add(j)

    dataset.drop(index=[labels[pos] for pos in sorted(removed_positions)], inplace=True)
    return dataset

# Smoke test for brass_overlap_descriptions: the second description overlaps the first
# one and is expected to be removed, the other two are expected to be kept.
data = [
    "this will be an overlap",
    "this will be a overlap, pretty sure",
    "this will not be removed",
]

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)
# the vocabulary is learnt on every string of the toy dataset
vectorizer.adapt(np.asarray(data).ravel())

df = pd.DataFrame(data=data, columns=["episode_description"])

brass_overlap_descriptions(df, vectorizer)


Unnamed: 0,episode_description
0,this will be an overlap
2,this will not be removed


TextRank algorithm used to shrink a document by keeping only its top-ranked sentences. The number of sentences kept is variable: about 1/3 of the total, as described in the original paper.

In [10]:
# spaCy pipeline with the PyTextRank component, built on first use and then reused:
# loading the model is expensive and does not depend on the document being processed.
_TEXTRANK_PIPELINE = None


def _get_textrank_pipeline():
    """Returns the shared spaCy pipeline with the "textrank" component, creating it once."""
    global _TEXTRANK_PIPELINE
    if _TEXTRANK_PIPELINE is None:
        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")
        # add PyTextRank to the spaCy pipeline
        nlp.add_pipe("textrank")
        _TEXTRANK_PIPELINE = nlp
    return _TEXTRANK_PIPELINE


def text_rank_sentences(document, coefficient=0.3):
    """
    TextRank algorithm used to extract the most important phrases from a document.

    The document is processed with spaCy + PyTextRank and the first
    `round(number of phrases * coefficient)` entries of `doc._.phrases` are kept, in the
    order PyTextRank returns them (expected to be from most to least relevant).
    NOTE(review): despite the function name, the result is built from key phrases,
    not from whole sentences - confirm this is the intended reduction.

    Parameters:
        - document: document to reduce
        - coefficient: the number of kept phrases is coefficient * number of phrases found in document
    Returns:
        - Reduced version of the document: the kept phrases, each followed by ". "
          (an empty string when no phrase is kept)
    """
    doc = _get_textrank_pipeline()(document)

    n = round(len(doc._.phrases) * coefficient)
    kept_phrases = [phrase.text for phrase in doc._.phrases][:n]
    return "".join(phrase + ". " for phrase in kept_phrases)
    

# Testing
# Smoke test for text_rank_sentences on an excerpt of a podcast transcript, using the
# default coefficient. The call returns a single string made of the kept phrases, each
# one followed by ". ".
doc = """I'm being passive-aggressive. I really want to tell them to shut the fuck up but I don't want to do that.
  So I'm hoping that they understand me put my earpiece back in my ear lets them know motherfucker.
  I don't want to talk to you and then you'll see them they'll still be talking and then you take the other
  piece out and be like, oh, okay. All right, and then you put this  You see this in the gym, sometimes two
  motherfuckers come up talking to you when you if you go up help to somebody guys and they try to put the
  earpiece back in. Why are you talking to them? That means shut the fuck up. All right guys, passive-aggressively.
  All right guys, so women are do all those type of things guys to avoid confrontation to not hurt your feelings or
  whatever. That's past my girls now in a relationship a woman does different type of passive aggressive.
  All right.  A woman pulls back. All right, if you're dating a woman she pulls back that's passive aggressive guys.
  That's not done indirectly. All right, some of us men do it too when we started like when you hit the as a few
  times to minute well before you was, you know, texting her and ask her out and shit now, you ain't doing it you
  pulling back a little bit you hoping she get the hint. All right, a pullback is a passive-aggressive way of
  letting the person know I'm losing fucking interest now. Sometimes it's done hoping that the person
  You know taste a shit up because they ain't on point and sometimes it's done cause you really tired of
  motherfucking and you want them to get the hand get the fuck on. Alright guys. So if you date a woman she
  passive-aggressive if you laying in bed with a woman and you live with her and she sleeps with her back turned
  to you guys that's passive aggressive and letting you know that I'm losing interest. Oh, I ain't got much issues
  in you right now all that's passive aggressive guys. Now, there's levels to passive-aggressive.  First a woman is
  passive aggressive. That means she pulls back. She does things to show you that she's losing interest that's
  passive aggressive guys. All right silent treatment all this shit all that's done passive-aggressive if you don't
  pick up on it, or if you pick up on it, and you start acting insecure about it."""

# last expression of the cell: the reduced text is displayed as the cell output
text_rank_sentences(doc)

'passive aggressive guys. things guys. Alright guys. fucking interest. interest. women. a passive-aggressive way. different type. motherfucker. All right guys. much issues. bed. somebody guys. you guys. confrontation. things. point. a few times. '