In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize


def tokenize_split(text):
    """Whitespace tokeniser.

    Splits ``text`` with ``text.split()`` (no separator argument), so any run
    of whitespace acts as one delimiter and no empty tokens are returned.
    """
    tokens = text.split()
    return tokens


# Functions for chunking transcripts on either words or sentences

def word_chunk_transcript(transcripts, chunk_size=500):
    """Split each transcript into consecutive chunks of ``chunk_size`` tokens.

    Parameters
    ----------
    transcripts : pandas.DataFrame
        Must contain the columns ``episode_id`` and ``transcript`` (raw text).
        The frame is only read; it is not modified.
    chunk_size : int, default 500
        Maximum number of tokens (as produced by ``tokenize_split``) per
        chunk. The last chunk of an episode may hold fewer tokens.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns ``episode_id``,
        ``transcript_subset`` (the chunk's tokens joined with single spaces)
        and ``words_enumerated`` (the nominal window
        ``"<start> - <start + chunk_size>"``; for the last chunk of an episode
        the upper bound can exceed the real token count).

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Tokenise into a standalone Series instead of writing a new column onto
    # `transcripts`: the caller's frame stays untouched and passing a slice
    # of another frame cannot trigger SettingWithCopyWarning.
    tokenised = transcripts["transcript"].apply(tokenize_split)

    records = []
    for episode_id, tokens in zip(transcripts["episode_id"], tokenised):
        for start in range(0, len(tokens), chunk_size):
            stop = start + chunk_size
            records.append((episode_id, " ".join(tokens[start:stop]), f"{start} - {stop}"))

    return pd.DataFrame(
        records,
        columns=["episode_id", "transcript_subset", "words_enumerated"],
    )


def sentence_chunk_transcript(transcripts, chunk_size=20):
    """Split each transcript into consecutive chunks of ``chunk_size`` sentences.

    Sentences are obtained with NLTK's ``sent_tokenize(text, language='english')``.

    Parameters
    ----------
    transcripts : pandas.DataFrame
        Must contain the columns ``episode_id`` and ``transcript`` (raw text).
        The frame is only read; it is not modified.
    chunk_size : int, default 20
        Maximum number of sentences per chunk. The last chunk of an episode
        may hold fewer sentences.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns ``episode_id``,
        ``transcript_subset`` (the chunk's sentences joined with single
        spaces) and ``sentence_enumerated`` (the nominal window
        ``"<start> - <start + chunk_size>"``; for the last chunk of an episode
        the upper bound can exceed the real sentence count).

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Keep the sentence lists in a standalone Series instead of writing a new
    # column onto `transcripts`: the caller's frame stays untouched and
    # passing a slice of another frame cannot trigger SettingWithCopyWarning.
    sentences_per_episode = transcripts["transcript"].apply(
        lambda text: sent_tokenize(text, language='english')
    )

    records = []
    for episode_id, sentences in zip(transcripts["episode_id"], sentences_per_episode):
        for start in range(0, len(sentences), chunk_size):
            stop = start + chunk_size
            records.append((episode_id, " ".join(sentences[start:stop]), f"{start} - {stop}"))

    return pd.DataFrame(
        records,
        columns=["episode_id", "transcript_subset", "sentence_enumerated"],
    )


### Make chunked dataset

In [2]:
# Load the gzip-compressed transcript sample; the path is relative to the
# notebook's working directory.
transcripts_sample = pd.read_csv("transcripts_sample.csv.gz", compression="gzip")
# Print (rows, columns) as a quick sanity check on the load.
print(transcripts_sample.shape)
# Uncomment to preview the first rows:
# transcripts_sample.head(5)

(15000, 16)


In [3]:
# Extract the two columns sentence_chunk_transcript reads: the episode
# identifier and the raw transcript text.
cols_subset = transcripts_sample.loc[: ,["episode_id", "transcript"]]

# Build the chunked dataset: one row per block of up to 25 consecutive
# sentences of an episode.
sentences_25 = sentence_chunk_transcript(cols_subset, chunk_size=25)
# Print (rows, columns) of the chunked frame, then preview the first 10 chunks.
print(sentences_25.shape)
sentences_25.head(10)

(215064, 3)


Unnamed: 0,episode_id,transcript_subset,sentence_enumerated
0,7tYqM5F5SKtt7lFgcimgAh,I'm Daniel Williams director of active chicks ...,0 - 25
1,7tYqM5F5SKtt7lFgcimgAh,"And when I say, you know, I'm making decisions...",25 - 50
2,7tYqM5F5SKtt7lFgcimgAh,What time do you get home who was around you? ...,50 - 75
3,7tYqM5F5SKtt7lFgcimgAh,And just try to use the process of elimination...,75 - 100
4,3gaoEuBYb51UoX7zeqv9yr,We recording KP now. We are recording guys pro...,0 - 25
5,3gaoEuBYb51UoX7zeqv9yr,So world's greatest agent is all about showcas...,25 - 50
6,3gaoEuBYb51UoX7zeqv9yr,Jerk. Well in Los Angeles. Thank you man. Of c...,50 - 75
7,3gaoEuBYb51UoX7zeqv9yr,I even ended up in Special Needs school and I ...,75 - 100
8,3gaoEuBYb51UoX7zeqv9yr,We ended up having an hour meeting and we just...,100 - 125
9,3gaoEuBYb51UoX7zeqv9yr,What was what do you think he saw in you that ...,125 - 150


In [4]:
# Check mean word count of chunks.
# Adds a `word_count` column to sentences_25 in place.
# NOTE(review): the count uses split(" "), which splits on every single space,
# so consecutive spaces produce empty strings that are counted as words and an
# empty string counts as 1. It is therefore not the same tokenisation as
# tokenize_split (str.split() with no argument).
sentences_25["word_count"] = sentences_25.transcript_subset.apply(lambda x: len(x.split(" ")))
sentences_25.word_count.mean()

400.48517650559836

In [5]:
# Persist the chunked dataset (including the word_count column computed in the
# previous cell) as a gzip-compressed CSV in the working directory.
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column — confirm downstream readers expect that.
sentences_25.to_csv("sentences_chunkssize_25.csv.gz", compression="gzip")

In [6]:
# Count the chunks whose word_count equals 1 (word_count was computed by
# splitting transcript_subset on single spaces).
len(sentences_25[sentences_25.word_count == 1])

74