In [6]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize


def tokenize_split(text):
    """Whitespace tokeniser: break ``text`` into a list of tokens.

    Uses ``split()`` without a separator, so any run of whitespace counts as
    one delimiter and no empty tokens are produced.
    """
    tokens = text.split()
    return tokens


# Functions for chunking transcripts into fixed-size groups of either words or sentences

def word_chunk_transcript(transcripts, chunk_size=500):
    """Split every transcript into consecutive chunks of ``chunk_size`` words.

    Args:
        transcripts: DataFrame with at least the columns ``episode_id`` and
            ``transcript`` (the raw text). The frame is left unmodified.
        chunk_size: number of whitespace-separated tokens per chunk; must be
            positive.

    Returns:
        DataFrame with one row per chunk and the columns ``episode_id``,
        ``transcript_subset`` (the chunk's tokens joined by single spaces) and
        ``words_enumerated`` (a ``"<start> - <start + chunk_size>"`` label).
        The label's upper bound is nominal: the last chunk of a transcript may
        contain fewer tokens than the label suggests.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A negative step would make range() yield nothing and silently
        # produce an empty dataset; fail loudly instead.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    episode_ls = []
    transcript_ls = []
    words_enum_ls = []
    # Iterate over the two needed columns directly: this avoids adding a
    # helper column to the caller's frame and the per-row Series (with its
    # possible dtype upcasting) that iterrows() would build.
    for episode_id, text in zip(transcripts["episode_id"], transcripts["transcript"]):
        tokens = tokenize_split(text)
        for start in range(0, len(tokens), chunk_size):
            stop = start + chunk_size
            episode_ls.append(episode_id)
            transcript_ls.append(" ".join(tokens[start:stop]))
            words_enum_ls.append(f"{start} - {stop}")

    word_chunked_df = pd.DataFrame(
        data={
            'episode_id': episode_ls,
            'transcript_subset': transcript_ls,
            'words_enumerated': words_enum_ls,
        }
    )
    return word_chunked_df


def sentence_chunk_transcript(transcripts, chunk_size=20):
    """Split every transcript into consecutive chunks of ``chunk_size`` sentences.

    Sentences are detected with NLTK's ``sent_tokenize`` using the English
    model.

    Args:
        transcripts: DataFrame with at least the columns ``episode_id`` and
            ``transcript`` (the raw text). The frame is left unmodified.
        chunk_size: number of sentences per chunk; must be positive.

    Returns:
        DataFrame with one row per chunk and the columns ``episode_id``,
        ``transcript_subset`` (the chunk's sentences joined by single spaces)
        and ``sentence_enumerated`` (a ``"<start> - <start + chunk_size>"``
        label). The label's upper bound is nominal: the last chunk of a
        transcript may contain fewer sentences than the label suggests.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A negative step would make range() yield nothing and silently
        # produce an empty dataset; fail loudly instead.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    episode_ls = []
    transcript_ls = []
    sent_enum_ls = []
    # Iterate over the two needed columns directly: this avoids adding a
    # helper column to the caller's frame and the per-row Series (with its
    # possible dtype upcasting) that iterrows() would build.
    for episode_id, text in zip(transcripts["episode_id"], transcripts["transcript"]):
        sentences = sent_tokenize(text, language='english')
        for start in range(0, len(sentences), chunk_size):
            stop = start + chunk_size
            episode_ls.append(episode_id)
            transcript_ls.append(" ".join(sentences[start:stop]))
            sent_enum_ls.append(f"{start} - {stop}")

    sentence_chunked_df = pd.DataFrame(
        data={
            'episode_id': episode_ls,
            'transcript_subset': transcript_ls,
            'sentence_enumerated': sent_enum_ls,
        }
    )
    return sentence_chunked_df



### Make chunked dataset

In [7]:
# Load the gzip-compressed transcripts sample and preview its size and first rows.
transcripts_sample = pd.read_csv(
    filepath_or_buffer="transcripts_sample.csv.gz",
    compression="gzip",
)
print(transcripts_sample.shape)
transcripts_sample.head(n=5)

(15000, 14)


Unnamed: 0,show_id,episode_id,transcript,avg_confidence,char_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans,word_count
0,show_1jUyEaMpjfOYjcwPCdEaec,35xHkzb4wNPpqipwjAkmDI,Hello and welcome along to the property Academ...,0.820831,13516,The Property Academy Podcast,The Property Academy Podcast is a daily show t...,Opes Partners,['en'],The Additional Costs Associated with Airbnb | ...,"In this episode, we discuss the additional cos...",13.906433,show_1jUyEaMpjfOYjcwPCdEaec,2507
1,show_0u6NNu3ZyZHyn888FD3WdE,5jYwyaLp8PDnQondFv77kC,"Good morning, everyone. This is Trinity here a...",0.818924,25849,TheProdcast,The Prodcast - all about the stars behind stel...,TheProdcast,['en'],Episode 6: How tech meets travel—redefining va...,Travel—the very word is enough to instil energ...,25.466783,show_0u6NNu3ZyZHyn888FD3WdE,4834
2,show_6KLpvCAxrVzbsnBnRs8O4I,12UFlPPdjCBpFibZQnnwLe,"Hey guys, it's Peter fry and welcome to the li...",0.805737,4894,Living with Hope Podcast with Peter Frey,Welcome to the Living with Hope podcast with P...,Peter Frey,['en'],IMMEASURABLY MORE | Ephesians 3:14-21 | Living...,Paul's prayer for the Ephesians guides us toda...,6.436567,show_6KLpvCAxrVzbsnBnRs8O4I,998
3,show_1HvChDzJwUYPX4YU7JJ5Aj,3NfJNHjBIW6IMsg8gGN9Th,"Hey afterbuzzers, before we move on to your ne...",0.819311,20993,The Good Place After Show Podcast,If philosophical discussions on life and the a...,AfterBuzz TV,['en-US'],"""The Funeral to End All Funerals"" Season 4 Epi...",Good Janet and Bad Janet unite?! And the Judge...,23.0818,show_1HvChDzJwUYPX4YU7JJ5Aj,4080
4,show_6rUa8ruUHI2kl7DjyzxBdw,36uhfvspHI1lsjsdfJ0xlz,Have you ever wondered what it's like to be pr...,0.86039,640,30 and Pregnant,"One woman’s journey through pregnancy, week by...",Abby,['en'],30 and Pregnant (Trailer),,0.770133,show_6rUa8ruUHI2kl7DjyzxBdw,120


In [9]:
# Extract columns for conversion
cols_subset = transcripts_sample.loc[: ,["episode_id", "transcript"]]


# Make dataset with chunked documents on 25 sentences per chunk
sentences_25 = sentence_chunk_transcript(cols_subset, chunk_size=25)
print(sentences_25.shape)
sentences_25.head(10)


(218118, 3)


Unnamed: 0,episode_id,transcript_subset,sentence_enumerated
0,35xHkzb4wNPpqipwjAkmDI,Hello and welcome along to the property Academ...,0 - 25
1,35xHkzb4wNPpqipwjAkmDI,That's absolutely right. So other costs would ...,25 - 50
2,35xHkzb4wNPpqipwjAkmDI,And now we insist that they take it it's very ...,50 - 75
3,35xHkzb4wNPpqipwjAkmDI,So you could be around sort of four to four an...,75 - 100
4,35xHkzb4wNPpqipwjAkmDI,That's Urban Butler Dot. Column. So by the tim...,100 - 125
5,5jYwyaLp8PDnQondFv77kC,"Good morning, everyone. This is Trinity here a...",0 - 25
6,5jYwyaLp8PDnQondFv77kC,"We were 88 people, right and April we turn hun...",25 - 50
7,5jYwyaLp8PDnQondFv77kC,So the all the five-year Clauses for Airlines ...,50 - 75
8,5jYwyaLp8PDnQondFv77kC,Right for the next five to seven year growth s...,75 - 100
9,5jYwyaLp8PDnQondFv77kC,How big is the market right travel by default?...,100 - 125


In [10]:
# Write the 25-sentence chunks to a gzip-compressed CSV
# (with to_csv's defaults the DataFrame index is written as the first column).
sentences_25.to_csv(
    path_or_buf="sentences_chunkssize_25.csv.gz",
    compression="gzip",
)