## Chunk segmentation

In [139]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os
import pandas as pd
import regex as re
import json
import pysbd
from sentence_transformers import SentenceTransformer, util

In [4]:
# Root folder of the dataset; os.path.abspath("") resolves to the current working directory,
# so the 'podcasts-no-audio-13GB' folder is expected to sit next to where the kernel was started.
dataset_path = os.path.join(os.path.abspath(""), 'podcasts-no-audio-13GB')

# Load the tab-separated metadata table stored at the dataset root.
metadata_path_train = os.path.join(dataset_path, 'metadata.tsv')
metadata_train = pd.read_csv(metadata_path_train, sep='\t')

def get_path(episode):
    """
    Build the path of the JSON transcript file of an episode.

    Parameters
    -------------
    episode: mapping (e.g. a row of the metadata table)
        Must provide the keys 'show_filename_prefix' and 'episode_filename_prefix'.

    Return
    -------
    Path of the transcript file below the module-level `dataset_path`.
    The path is only assembled here; nothing checks that the file exists.
    """
    show_prefix = episode['show_filename_prefix']
    json_name = episode['episode_filename_prefix'] + ".json"

    # the digit and the character that follow "show_" name the two nested sub-folders
    first_level, second_level = re.match(r'show_(\d)(\w).*', show_prefix).group(1, 2)

    transcripts_root = os.path.join(dataset_path, "spotify-podcasts-2020", "podcasts-transcripts")
    return os.path.join(transcripts_root, first_level, second_level, show_prefix, json_name)

def get_transcription(episode):
    """
    Read the transcript of an episode and return it as a single string.

    Parameters
    -------------
    episode: mapping (e.g. a row of the metadata table)
        Passed to `get_path` to locate the JSON transcript file.

    Return
    -------
    str: the 'transcript' of the first alternative of every result (except the last
    one), joined with a single space. Results without a 'transcript' contribute an
    empty string, which is why double spaces can appear in the output.
    """
    # JSON files are UTF-8; be explicit instead of relying on the platform default encoding
    with open(get_path(episode), 'r', encoding='utf-8') as f:
        episode_json = json.load(f)

    # seems that the last result in each transcript is a repetition of the first one, so we ignore it
    transcripts = [
        result["alternatives"][0].get('transcript', "")
        for result in episode_json["results"][:-1]
    ]
    return " ".join(transcripts)

In [59]:
# Row positions in metadata_train of two sample episodes
# (presumably a medium-length and a long transcript, going by the names — not verified here)
medium_idx = 909
long_idx = 5211
# Display the transcript of the first sample episode
get_transcription(metadata_train.iloc[medium_idx])

"What's up, everybody? Welcome to the cycle podcast our very first episode. I'm your host Melissa Boudreaux. I'm so grateful that you're here with us today. Thank you so much for listening. This podcast is going to be centralized around talking about endometriosis, which is an autoimmune disorder that I suffer from for about 20 years now, and I've been wanting to make a podcast because I really do feel there needs to be more awareness and education around this disease. There are other podcasts out there about endometriosis and my idea really for this podcast is to have some informative and educational episodes but also to interview someone different every single month the goal is to have an interview or a podcast every 28 days to go along with the cycle. Hence the name that women have every 28 days. I thought it would be fun. I thought it would be creative and it would give me a way to help spread awareness endometriosis if you don't know about it is a reproduction.  Active disease tha

In [60]:
# Display the transcript of the second sample episode
get_transcription(metadata_train.iloc[long_idx])

" Hi and welcome to date me or hate me a podcast all about dating The Good the Bad and the Ugly it will all come out here. So let's get into why I'm here today. The reason I actually started this podcast is because I had a few friends coming to me for dating and relationship advice and after hearing the positive feedback that they had for the advice I would Giving them I figured why not give the advice to everyone who might need it. And here we are. So the reason I came up with the name date me or hate me is because me as a person that's kind of how I am. I'm all in like you're going to get me and you either like it or you don't and that's normally how things go. Normally. It's  other guys like how honest I am or they absolutely hate it and that's totally fine as the saying goes. You're not going to be everyone's cup of tea. I'm sure I'm probably someone's cup of bleach somewhere out there. Probably more than one that's for sure. But I will say that that is how I'm going to be on this 

In [175]:
def look_ahead_chuck(sentences, lower_chunk_size):
    """
    Look-ahead function to determine the next chunk.

    Parameters
    -------------
    sentences: list of str
        The sentences that follow the one currently being evaluated
    lower_chunk_size: int
        The minimum total length (in characters) the look-ahead chunk should reach

    Return
    -------
    The shortest prefix of `sentences` whose total length reaches `lower_chunk_size`,
    or all of `sentences` when even together they stay below it.
    """
    # a single pass with a running total, instead of re-summing every prefix
    total_length = 0
    for end, sentence in enumerate(sentences, start=1):
        total_length += len(sentence)
        if total_length >= lower_chunk_size:
            return sentences[:end]
    # the bound is never reached (this also covers an empty input): the chunk is everything left
    return sentences
# Quick check of the helper on three sentences whose lengths add up to exactly the requested bound
next_l = [
    "I'm really sorry that the quality is kind of a because I'm doing it on my phone because I'm a poor girl, but eventually I will get a official like microphone. ",
    "So hopefully it'll be better. ",
    'Love you.  ',
]
look_ahead_chuck(next_l, 200)
print(sum(len(s) for s in next_l))

200


In [187]:
# Position of the shortest transcript (in characters) among the first 20 episodes of the metadata table
np.argmin([len(get_transcription(metadata_train.iloc[i])) for i in range (20)])

19

In [181]:
# Instantiate the sentence encoder only to display its architecture (the repr is the cell output)
SentenceTransformer("all-MiniLM-L6-v2")

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [196]:
def look_ahead_chuck(sentences, lower_chunk_size):
    """
    Look-ahead function to determine the next chunk.

    Parameters
    -------------
    sentences: list of str
        The sentences that follow the one currently being evaluated
    lower_chunk_size: int
        The minimum total length (in characters) the look-ahead chunk should reach

    Return
    -------
    The shortest prefix of `sentences` whose total length reaches `lower_chunk_size`,
    or all of `sentences` when even together they stay below it.
    """
    # a single pass with a running total, instead of re-summing every prefix
    total_length = 0
    for end, sentence in enumerate(sentences, start=1):
        total_length += len(sentence)
        if total_length >= lower_chunk_size:
            return sentences[:end]
    # the bound is never reached (this also covers an empty input): the chunk is everything left
    return sentences


def semantic_segmentation(text, model, lower_chunk_size=300, upper_chunk_size=3000, verbose=True):
    """
    Algorithm proposed by Moro et. al. (2022) to semantically segment long inputs into GPU memory-adaptable chunks.
    https://www.aaai.org/AAAI22Papers/AAAI-3882.MoroG.pdf

    The text is split into sentences that are grouped greedily into chunks:
    a sentence joins the open chunk while the chunk stays below `lower_chunk_size`,
    opens a new chunk when joining would exceed `upper_chunk_size`, and in between is
    assigned to whichever of the open chunk / look-ahead chunk it is more similar to
    (mean cosine similarity of the sentence embeddings).

    Parameters
    -------------
    text: str
        The text to be segmented
    model: SentenceTransformer
        The model to be used for the sentence embeddings
    lower_chunk_size: int
        The lower bound of the chunk size (in characters)
    upper_chunk_size: int
        The upper bound of the chunk size (in characters)
    verbose: bool
        If True (default), print the two similarity scores of every semantic decision
    Return
    -------
    List of chunks, each chunk being the list of its sentences.
    The last sentence is always attached to the open chunk, so the final chunk
    may exceed `upper_chunk_size`.
    """

    # segment the text into sentences
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    if not sentences:
        return []

    chunks = []
    # the first sentence always opens the first chunk
    current_chunk = [sentences[0]]
    current_length = len(sentences[0])

    # Only the sentences strictly between the first and the last one need a decision
    for i in range(1, len(sentences) - 1):
        sentence = sentences[i]
        candidate_length = current_length + len(sentence)

        if candidate_length < lower_chunk_size:
            keep_in_current = True
        elif candidate_length > upper_chunk_size:
            keep_in_current = False
        else:
            next_chunk = look_ahead_chuck(sentences[i+1:], lower_chunk_size)

            # get the embedding of the current chunk, the look-ahead chunk and the sentence
            current_embedding = model.encode(current_chunk)
            next_embedding = model.encode(next_chunk)
            sentence_embedding = model.encode([sentence])

            # mean cosine similarity between the sentence and each of the two chunks
            score_current_chunk = util.cos_sim(sentence_embedding, current_embedding).numpy().mean()
            score_next_chunk = util.cos_sim(sentence_embedding, next_embedding).numpy().mean()

            if verbose:
                print(f"{score_current_chunk} - vs - {score_next_chunk}")
            keep_in_current = score_current_chunk > score_next_chunk

        if keep_in_current:
            current_chunk.append(sentence)
            current_length = candidate_length
        else:
            chunks.append(current_chunk)
            current_chunk = [sentence]
            current_length = len(sentence)

    # The last sentence is added to the open chunk; a single-sentence text has no separate last one
    if len(sentences) > 1:
        current_chunk.append(sentences[-1])
    # Flush after the loop so the open chunk is never lost (previously a one-sentence text returned [])
    chunks.append(current_chunk)
    return chunks

# Initialize the sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")
semantic_segmentation(get_transcription(metadata_train.iloc[19]), model)

Downloading:  29%|██▉       | 128M/438M [00:18<00:18, 17.1MB/s]

0.27102553844451904 - vs - 0.29411816596984863
0.08199809491634369 - vs - 0.20796453952789307
0.23852111399173737 - vs - 0.18379461765289307
0.24335750937461853 - vs - 0.39189639687538147
0.35674217343330383 - vs - 0.3150988221168518
0.3809787631034851 - vs - 0.2646925449371338
0.17474138736724854 - vs - 0.3235437870025635
0.37678059935569763 - vs - 0.46466735005378723
0.39269906282424927 - vs - 0.3137981593608856
0.5209943056106567 - vs - 0.28821972012519836
0.21534693241119385 - vs - 0.16390781104564667
0.10270268470048904 - vs - 0.1520935297012329


[["I'm Adela Langdon, welcome to impact radio the podcast to reconnect remind Inspire episode 1 ground. ",
  'What are the consequences of not being grounded? ',
  "You're not thinking clearly you're not present. ",
  "So, where are you you're on stage giving a presentation you're swaying from side to side you.  "],
 ["Walking around in circles never quite arriving, but you just can't seem to stop it's as if your body has a mind of its own then you forget the most important item in your presentation by now, you're desperate scrambling to find anything to fill in the deafening silence in short you are physically and mentally all over the place. "],
 ["You don't know where you are. ",
  "You're lost.  ",
  "When you feel you are under pressure, your body will always resort to what feels familiar AKA communication habits whether it's working or not. ",
  "You don't have the mental space to get out of this horrible spiral of presentation disasters your presence your connection with your au