In [33]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# simple tokeniser
def tokenize_split(text):
    """Break ``text`` into tokens by splitting on whitespace.

    Uses ``text.split()`` with no separator, so consecutive whitespace
    characters act as a single delimiter and leading/trailing whitespace
    produces no empty tokens.

    Returns the list of tokens (an empty list for an empty string).
    """
    tokens = text.split()
    return tokens


def word_chunk_transcript(transcripts, chunk_size=500):
    """Split every transcript into consecutive chunks of ``chunk_size`` tokens.

    transcripts: DataFrame that must have the columns ``episode_id`` and
        ``transcript`` (the raw text). It is only read; no column is added
        to the caller's frame.
    chunk_size: number of whitespace-separated tokens per chunk. Must be >= 1.

    Returns a new DataFrame with one row per chunk and the columns
    ``episode_id``, ``transcript_subset`` (the chunk's tokens joined with a
    single space) and ``words_enumerated`` (``"<start> - <start+chunk_size>"``).
    The upper bound in ``words_enumerated`` is always ``start + chunk_size``,
    even when the final chunk of a transcript holds fewer tokens.

    Raises ValueError if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would make range() empty and silently drop all text.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    episode_ls = []
    transcript_ls = []
    words_enum_ls = []
    # Iterate over the two needed columns directly instead of writing a
    # temporary token column into the input frame.
    for episode_id, transcript in zip(transcripts["episode_id"], transcripts["transcript"]):
        tokens = tokenize_split(transcript)
        for start in range(0, len(tokens), chunk_size):
            episode_ls.append(episode_id)
            transcript_ls.append(" ".join(tokens[start:start + chunk_size]))
            words_enum_ls.append(f"{start} - {start + chunk_size}")

    chunked_df = pd.DataFrame(
        data={
            'episode_id': episode_ls,
            'transcript_subset': transcript_ls,
            'words_enumerated': words_enum_ls,
        }
    )
    return chunked_df


def sentence_chunk_transcript(transcripts, chunk_size=20):
    """Split every transcript into consecutive chunks of ``chunk_size`` sentences.

    transcripts: DataFrame that must have the columns ``episode_id`` and
        ``transcript`` (the raw text). It is only read; no column is added
        to the caller's frame.
    chunk_size: number of sentences per chunk. Must be >= 1.

    Sentences are obtained with ``sent_tokenize(text, language='english')``.

    Returns a new DataFrame with one row per chunk and the columns
    ``episode_id``, ``transcript_subset`` (the chunk's sentences joined with a
    single space) and ``sentence_enumerated``
    (``"<start> - <start+chunk_size>"``). The upper bound is always
    ``start + chunk_size``, even when the final chunk of a transcript holds
    fewer sentences.

    Raises ValueError if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would make range() empty and silently drop all text.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    episode_ls = []
    transcript_ls = []
    sent_enum_ls = []
    # Iterate over the two needed columns directly instead of writing a
    # temporary sentence column into the input frame.
    for episode_id, transcript in zip(transcripts["episode_id"], transcripts["transcript"]):
        sentences = sent_tokenize(transcript, language='english')
        for start in range(0, len(sentences), chunk_size):
            episode_ls.append(episode_id)
            transcript_ls.append(" ".join(sentences[start:start + chunk_size]))
            sent_enum_ls.append(f"{start} - {start + chunk_size}")

    sentence_chunked_df = pd.DataFrame(
        data={
            'episode_id': episode_ls,
            'transcript_subset': transcript_ls,
            'sentence_enumerated': sent_enum_ls,
        }
    )
    return sentence_chunked_df

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oskarmunckafrosenschold/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
import nltk
import ssl

# WARNING: this cell disables HTTPS certificate verification so that the NLTK
# downloader works on machines where certificate validation fails. The
# override is limited to the download itself and the original context factory
# is restored afterwards, so later HTTPS requests in this kernel are verified
# again.
_default_https_context = ssl._create_default_https_context

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no unverified-context factory; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

try:
    # NOTE(review): calling nltk.download() without arguments opens the
    # interactive downloader and blocks "Restart & Run All" until it is closed.
    # Consider requesting only the needed packages by name instead.
    nltk_download_ok = nltk.download()
finally:
    # Always re-enable certificate verification, even if the download fails.
    ssl._create_default_https_context = _default_https_context

nltk_download_ok

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
# Load only the first 10 episodes: `nrows` stops parsing after that many rows
# instead of reading the whole compressed file and discarding the rest.
df = pd.read_csv("transcripts_sample.csv.gz", compression="gzip", nrows=10)
print(df.shape)
df.head(10)

(10, 14)


Unnamed: 0,show_id,episode_id,transcript,avg_confidence,char_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans,word_count
0,show_1jUyEaMpjfOYjcwPCdEaec,35xHkzb4wNPpqipwjAkmDI,Hello and welcome along to the property Academ...,0.820831,13516,The Property Academy Podcast,The Property Academy Podcast is a daily show t...,Opes Partners,['en'],The Additional Costs Associated with Airbnb | ...,"In this episode, we discuss the additional cos...",13.906433,show_1jUyEaMpjfOYjcwPCdEaec,2507
1,show_0u6NNu3ZyZHyn888FD3WdE,5jYwyaLp8PDnQondFv77kC,"Good morning, everyone. This is Trinity here a...",0.818924,25849,TheProdcast,The Prodcast - all about the stars behind stel...,TheProdcast,['en'],Episode 6: How tech meets travel—redefining va...,Travel—the very word is enough to instil energ...,25.466783,show_0u6NNu3ZyZHyn888FD3WdE,4834
2,show_6KLpvCAxrVzbsnBnRs8O4I,12UFlPPdjCBpFibZQnnwLe,"Hey guys, it's Peter fry and welcome to the li...",0.805737,4894,Living with Hope Podcast with Peter Frey,Welcome to the Living with Hope podcast with P...,Peter Frey,['en'],IMMEASURABLY MORE | Ephesians 3:14-21 | Living...,Paul's prayer for the Ephesians guides us toda...,6.436567,show_6KLpvCAxrVzbsnBnRs8O4I,998
3,show_1HvChDzJwUYPX4YU7JJ5Aj,3NfJNHjBIW6IMsg8gGN9Th,"Hey afterbuzzers, before we move on to your ne...",0.819311,20993,The Good Place After Show Podcast,If philosophical discussions on life and the a...,AfterBuzz TV,['en-US'],"""The Funeral to End All Funerals"" Season 4 Epi...",Good Janet and Bad Janet unite?! And the Judge...,23.0818,show_1HvChDzJwUYPX4YU7JJ5Aj,4080
4,show_6rUa8ruUHI2kl7DjyzxBdw,36uhfvspHI1lsjsdfJ0xlz,Have you ever wondered what it's like to be pr...,0.86039,640,30 and Pregnant,"One woman’s journey through pregnancy, week by...",Abby,['en'],30 and Pregnant (Trailer),,0.770133,show_6rUa8ruUHI2kl7DjyzxBdw,120
5,show_6SZVsPIxPfVs6aavqM1peY,1OyBBTESBsHAbHqqtbd0wD,What's up Podcast listeners? I'm Jason tifford...,0.851476,18171,The GaryVee Audio Experience,"Welcome to The Garyvee Audio Experience, hoste...",Gary Vaynerchuk,['en'],"Preparation, Competitiveness and Deep Passion ...","What's up podcast, shout out to the intern Jas...",17.411917,show_6SZVsPIxPfVs6aavqM1peY,3418
6,show_3brKj6QXXlrcxq1rOLW5MV,2xSEYp5KonJDAegxnirH5H,You have turned into functional fun. I'm Mike ...,0.806241,11150,Functional Fun with OT Students,A journey through the life of Occupational The...,Functional Fun with OTS,['en'],Applying to Cox College MSOT program!!,Where do I go to apply? What is the applicatio...,12.517483,show_3brKj6QXXlrcxq1rOLW5MV,2095
7,show_2yd8H3nK8IYcJQWtc8eKd3,4BST1YWmXy6QY4H0qwi0Wu,Welcome back to the Jordan side many podcast. ...,0.822296,53965,The Jordan Syatt Mini-Podcast,You and I are going to drink a lot of coffee t...,Jordan Syatt,['en-US'],Jackie's Story: 100lbs Down (Lessons Learned &...,In this episode of The Jordan Syatt Mini Podca...,59.2821,show_2yd8H3nK8IYcJQWtc8eKd3,10734
8,show_6sY90mdRx78qRAQKV8xEJe,4P8N2rFkxXzVv2OuMl7mCh,"Hey everyone, before we continue with the show...",0.839937,15955,Career Talk: Learn - Grow - Thrive,Host Stephanie Dennis is on a mission to empow...,Stephanie Dennis,['en'],Answering Your Questions,Here are the questions we are answering today:...,16.844533,show_6sY90mdRx78qRAQKV8xEJe,2919
9,show_5yNfaow2o5UCc3YUhho1pL,5Q4pTJLqfksYgsqEfcRqq8,Hello everyone. Welcome to all about Nigeria a...,0.843821,16570,All About Nigeria,This podcast aims to help listeners gain a bet...,All About Nigeria,['en'],1. Welcome to All About Nigeria,Introducing All About Nigeria with Mrs Iyabo A...,22.470333,show_5yNfaow2o5UCc3YUhho1pL,3249


In [34]:
# Quick manual check of the chunkers.
# For the word-based variant, call word_chunk_transcript on the same two
# columns and inspect `.shape` / `.head(40)` of the result.

# One sentence per chunk; rows 110-149 cover the point where the output moves
# from the first episode to the second.
episode_text = df[["episode_id", "transcript"]]
b = sentence_chunk_transcript(episode_text, chunk_size=1)
b[110:150]


Unnamed: 0,episode_id,transcript_subset,sentence_enumerated
110,35xHkzb4wNPpqipwjAkmDI,Our free property investment one is available ...,110 - 111
111,35xHkzb4wNPpqipwjAkmDI,You can access that at Opus first home dot cod...,111 - 112
112,35xHkzb4wNPpqipwjAkmDI,It really does help us get the message out to ...,112 - 113
113,35xHkzb4wNPpqipwjAkmDI,There's a lot here for our investors to take a...,113 - 114
114,35xHkzb4wNPpqipwjAkmDI,Absolutely.,114 - 115
115,35xHkzb4wNPpqipwjAkmDI,Thanks for listening to the property Academy p...,115 - 116
116,35xHkzb4wNPpqipwjAkmDI,I'm your host Steve McKnight.,116 - 117
117,35xHkzb4wNPpqipwjAkmDI,I'm John Laurie and we're going to be back aga...,117 - 118
118,5jYwyaLp8PDnQondFv77kC,"Good morning, everyone.",0 - 1
119,5jYwyaLp8PDnQondFv77kC,This is Trinity here and welcome to team forec...,1 - 2
