In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
VOCAB_SIZE = 5000
SEQUENCE_LENGTH = 1200

Brass set described in the paper
1) descriptions that are very long (> 750 characters) or short (< 20 characters)
2) descriptions with high lexical overlap (over 40%) with their show description
3) descriptions with high lexical overlap (over 50%) with other episode descriptions

In [3]:
def brass_length(dataset, tokenizer, upp_bound=750, low_bound=20):
    """
    Returns a dataset with less samples removing ones having descriptions that are very long (> 750 characters)
    or short (< 20 characters)
    Parameters:
        - dataset: pandas dataframe containing transcript and summary for each podcast
        - tokenizer: tokenizer to use for tokenize the description
        - upp_bound: maximum number of characters in the description
        - low_bound: minimum number of characters in the description
    Returns:
        - subset of the dataset applying the aforementioned filter
    """
    rows = dataset.shape[0]
    for i in range(rows):
        tokenized_description = tokenizer(dataset['episode_description'][i])
        tokenized_description = np.nonzero(tokenized_description.numpy())[0]
        if len(tokenized_description) > upp_bound or len(tokenized_description) < low_bound:
            dataset.drop([i], inplace=True)

    return dataset

# testing
data = [["transcript one", "this is the first summary of a podcast. I need to reach at least twenty " \
        "characters. It is not enough, I need to add other characters."], ['transcript 2', 'too short, it must be removed']]

vectorizer = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int', output_sequence_length=SEQUENCE_LENGTH)
vectorizer.adapt(np.array(data).flatten())

df = pd.DataFrame(data, columns=['episode_transcript', 'episode_description'])

brass_length(df, vectorizer)


Unnamed: 0,episode_transcript,episode_description
0,transcript one,this is the first summary of a podcast. I need...


In [35]:
def overlapping_with_threshold(sentence_one, sentence_two, tokenizer, threshold):
    """
    Returns whether two sentences have an overlap over threshold

    """
    tokenized_one = tokenizer(sentence_one).numpy()
    tokenized_two = tokenizer(sentence_two).numpy()
    # index of meaningful tokens
    index_one = np.nonzero(tokenized_one)[0]
    index_two = np.nonzero(tokenized_two)[0]
    # meaningful tokens
    tok_sen_one = tokenized_one[index_one]
    tok_sen_two = tokenized_two[index_two]

    if len(tok_sen_one) > len(tok_sen_two):
        short_sen = tok_sen_two
        long_sen = tok_sen_one
    else:
        short_sen = tok_sen_one
        long_sen = tok_sen_two
    # it is possible to be sure it will not overlap
    if len(short_sen) / len(long_sen) <= threshold:
            return False
    else:
        # comparison of all possible matches
        j = 0
        while j <= len(long_sen) - len(short_sen):
            # comparison between episode description and show description
            common_index = (long_sen[j:j+len(short_sen)] == short_sen)
            common_index = common_index[common_index == True]
            overlap = len(common_index) / len(long_sen)
            if overlap > threshold:
                return True
            else:
                j += 1
        return False

In [18]:
def brass_overlap_show(dataset, tokenizer, percentage=0.4):
    """
    Delete from the dataset podcast having a description which overlaps over percentage the show description
    Parameters:
        - dataset: dataset: pandas dataframe containing transcript and summary for each podcast
        - tokenizer: tokenizer to use for tokenize the description and the show description
        - percentage: threshold to discard the podcast
    Returns:
        - cleaned dataset
    """
    rows = dataset.shape[0]
    for i in range(rows):
        overlap = overlapping_with_threshold(
            dataset['episode_description'][i],
            dataset['show_description'][i],
            tokenizer,
            percentage
        )
        if overlap:
            dataset.drop([i], inplace=True)
    return dataset

# Testing
data = [["this is gonna be an overlapping", "this is gonna be an overlapping, trust me"],
        ["this will not be an overlapping", "this will not be an overlapping sentence because the content is much different between the two descriptions"]]

vectorizer = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int', output_sequence_length=SEQUENCE_LENGTH)
vectorizer.adapt(np.array(data).flatten())

df = pd.DataFrame(data, columns=['show_description', 'episode_description'])

brass_overlap_show(df, vectorizer)

Unnamed: 0,show_description,episode_description
1,this will not be an overlapping,this will not be an overlapping sentence becau...


In [36]:
def brass_overlap_descriptions(dataset, tokenizer, percentage=0.5):
    """
    Delete from the dataset podcast having a description which overlaps over percentage another description
    Parameters:
        - dataset: dataset: pandas dataframe containing transcript and summary for each podcast
        - tokenizer: tokenizer to use for tokenize the descriptions
        - percentage: threshold to discard the podcast
    Returns:
        - cleaned dataset
    """
    rows = dataset.shape[0]
    deleted_podcast_idx = set()
    for i in range(rows):
        # if the podcast has been already excluded, do not consider it
        if i in deleted_podcast_idx:
            continue
        else:
            for j in range(i+1, rows):
                if j in deleted_podcast_idx:
                    continue
                else:
                    overlap = overlapping_with_threshold(
                        dataset['episode_description'][i],
                        dataset['episode_description'][j],
                        tokenizer,
                        percentage
                    )
                    print(f"{dataset['episode_description'][i]} {dataset['episode_description'][j]} overlap: {overlap}")
                    if overlap > percentage:
                        deleted_podcast_idx.add(j)

    dataset.drop(list(deleted_podcast_idx), inplace=True)
    return dataset

# Testing
data = ["this will be an overlap", "this will be a overlap, pretty sure", "this will not be removed"]

vectorizer = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int', output_sequence_length=SEQUENCE_LENGTH)
vectorizer.adapt(np.array(data).flatten())

df = pd.DataFrame(data, columns=['episode_description'])

brass_overlap_descriptions(df, vectorizer)       


[ 3  2  4 10  5]
[ 3  2  4 11  5  8  6]
[ True  True  True  True]
0.5714285714285714
this will be an overlap this will be a overlap, pretty sure overlap: True
[ 3  2  4 10  5]
[3 2 9 4 7]
[ True  True]
0.4
this will be an overlap this will not be removed overlap: False


Unnamed: 0,episode_description
0,this will be an overlap
2,this will not be removed
