In [62]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
import spacy 

sp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
all_stopwords = sp.Defaults.stop_words

def tokenize_split(text):
    """simple tokeniser"""
    return text.split()


# Functions for chunking transcripts on either words or sentences

def word_chunk_transcript(transcripts, name_variable='transcript', chunk_size=500):
    """ transcripts: must have two columns - episode_id and tokenised_transcript
        chunk_size: number of tokens in chunk 
    """
    transcripts["tokenised_transcript"] = transcripts[name_variable].apply(lambda x: tokenize_split(x))

    episode_ls = []
    transcript_ls = []
    words_enum_ls = [] 
    for index, row in transcripts.iterrows():
        for i in range(0, len(row["tokenised_transcript"]), chunk_size):
            episode_ls.append(row["episode_id"])
            transcript_ls.append(" ".join(row["tokenised_transcript"][i:i+chunk_size]))
            words_enum_ls.append(f"{i} - {i+chunk_size}")
    word_chunked_df = pd.DataFrame(data = {'episode_id': episode_ls, 'transcript_subset':transcript_ls, 'words_enumerated':words_enum_ls})
    return word_chunked_df


def sentence_chunk_transcript(transcripts, name_variable='transcript', chunk_size=20):
    """ transcripts: must have two columns - episode_id and transcript
        chunk_size: number of sentences in chunk 
    """
    transcripts["sentence_token"] = transcripts[name_variable].apply(lambda x: sent_tokenize(x, language='english'))
    episode_ls = []
    transcript_ls = []
    sent_enum_ls = []
    for index, row in transcripts.iterrows():
        for i in range(0, len(row["sentence_token"]), chunk_size):
            episode_ls.append(row["episode_id"])
            transcript_ls.append(" ".join(row["sentence_token"][i:i+chunk_size]))
            sent_enum_ls.append(f"{i} - {i+chunk_size}")
    sentence_chunked_df = pd.DataFrame(data = {'episode_id': episode_ls, 'transcript_subset':transcript_ls, 'sentence_enumerated':sent_enum_ls})
    return sentence_chunked_df

# Define a function to remove stopwords from a string
def remove_stopwords(text):    
    doc = text.split(' ')
    return ' '.join([word for word in doc if word not in all_stopwords])

### Make chunked dataset

In [None]:
transcripts_sample = pd.read_csv("transcripts_sample.csv.gz", compression="gzip")
print(transcripts_sample.shape)
# transcripts_sample.head(5)

In [None]:
# Extract columns for conversion
cols_subset = transcripts_sample.loc[: ,["episode_id", "transcript"]]

### Remove stopwords

In [None]:
# apply the remove_stopwords function to the text column
cols_subset['transcript_wo_sw'] = cols_subset['transcript'].apply(remove_stopwords)
cols_subset.head()

### Create new datasets

In [60]:
word_transcript_256 = word_chunk_transcript(cols_subset, name_variable='transcript_wo_sw', chunk_size=256)

print(word_transcript_256.shape)
word_transcript_256.head(2)

(178234, 3)


Unnamed: 0,episode_id,transcript_subset,words_enumerated
0,7tYqM5F5SKtt7lFgcimgAh,I'm Daniel Williams director active chicks Fit...,0 - 256
1,7tYqM5F5SKtt7lFgcimgAh,So. Basically I absolutely rough. But taking h...,256 - 512


In [61]:
word_transcript_256.to_csv('word_transcript_256.csv.gz', compression='gzip')

In [63]:
sent_transcript_1 = sentence_chunk_transcript(cols_subset, name_variable='transcript_wo_sw', chunk_size=1)
print(sent_transcript_1.shape)
sent_transcript_1.head(2)

(5197100, 3)


Unnamed: 0,episode_id,transcript_subset,sentence_enumerated
0,7tYqM5F5SKtt7lFgcimgAh,I'm Daniel Williams director active chicks Fit...,0 - 1
1,7tYqM5F5SKtt7lFgcimgAh,I'm mom wonderful crazy kids wife handsome hus...,1 - 2


In [64]:
sent_transcript_1.to_csv('sent_transcript_1.csv.gz', compression='gzip')