In [33]:
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize


def tokenize_split(text):
    """Break ``text`` into tokens on runs of whitespace and return them as a list."""
    tokens = text.split()
    return tokens


def word_chunk_transcript(transcripts, name_variable='transcript', chunk_size=500):
    """Split every transcript into consecutive chunks of at most ``chunk_size`` words.

    Parameters
    ----------
    transcripts : pandas.DataFrame
        Must contain an ``episode_id`` column and the text column named by
        ``name_variable``. The frame is only read; no column is added to it.
    name_variable : str
        Name of the column holding the raw transcript text.
    chunk_size : int
        Maximum number of whitespace-separated tokens per chunk.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns ``episode_id``,
        ``transcript_subset`` (the chunk's tokens re-joined with single
        spaces) and ``words_enumerated`` (a ``"first - last"`` string of
        1-based, inclusive word positions covered by the chunk).
    """
    episode_ls = []
    transcript_ls = []
    words_enum_ls = []
    # Iterate over the two needed columns directly instead of storing the
    # token lists in the caller's DataFrame.
    for episode_id, text in zip(transcripts["episode_id"], transcripts[name_variable]):
        tokens = text.split()  # simple whitespace tokenisation
        n_tokens = len(tokens)
        for start in range(0, n_tokens, chunk_size):
            # The final chunk may be shorter than chunk_size, so clamp the
            # reported end position to the real number of tokens.
            stop = min(start + chunk_size, n_tokens)
            episode_ls.append(episode_id)
            transcript_ls.append(" ".join(tokens[start:stop]))
            words_enum_ls.append(f"{start+1} - {stop}")
    word_chunked_df = pd.DataFrame(
        data={
            'episode_id': episode_ls,
            'transcript_subset': transcript_ls,
            'words_enumerated': words_enum_ls,
        }
    )
    return word_chunked_df


def sentence_chunk_transcript(transcripts, name_variable='transcript', chunk_size=1):
    """Split every transcript into consecutive chunks of ``chunk_size`` sentences.

    Parameters
    ----------
    transcripts : pandas.DataFrame
        Must contain an ``episode_id`` column and the text column named by
        ``name_variable``. The frame is only read; no column is added to it.
    name_variable : str
        Name of the column holding the raw transcript text.
    chunk_size : int
        Number of sentences per chunk (the last chunk of a transcript may
        hold fewer).

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns ``episode_id``,
        ``transcript_subset`` (the chunk's sentences joined with a single
        space) and ``sentence_enumerated`` (1-based position of the chunk's
        first sentence within its transcript).
    """
    episode_ls = []
    transcript_ls = []
    sent_enum_ls = []
    # Iterate over the two needed columns directly instead of storing the
    # sentence lists in the caller's DataFrame.
    for episode_id, text in zip(transcripts["episode_id"], transcripts[name_variable]):
        sentences = sent_tokenize(text, language='english')
        for start in range(0, len(sentences), chunk_size):
            episode_ls.append(episode_id)
            transcript_ls.append(" ".join(sentences[start:start+chunk_size]))
            sent_enum_ls.append(start+1)
    sentence_chunked_df = pd.DataFrame(
        data={
            'episode_id': episode_ls,
            'transcript_subset': transcript_ls,
            'sentence_enumerated': sent_enum_ls,
        }
    )
    return sentence_chunked_df


### Make chunked dataset

In [3]:
# Read the gzip-compressed sports transcript table and report its (rows, columns).
transcripts_sample = pd.read_csv(
    'sports_transcripts.csv.gz',
    compression='gzip',
)
print(transcripts_sample.shape)

(11821, 17)


In [4]:
# Extract the two columns the chunking functions need. Take an explicit copy so
# that anything done to the subset later can never write through to, or warn
# about, the full `transcripts_sample` table.
cols_subset = transcripts_sample.loc[:, ["episode_id", "transcript"]].copy()

### Create new datasets

In [5]:
# Cut every transcript into 256-word chunks, then show the result's size and first rows.
sports_256 = word_chunk_transcript(
    cols_subset,
    name_variable='transcript',
    chunk_size=256,
)

print(sports_256.shape)
sports_256.head(n=2)

(368835, 3)


Unnamed: 0,episode_id,transcript_subset,words_enumerated
0,41JbXYp7c2uuJoFB4TcQtD,Hello and welcome to the law review podcast. M...,0 - 256
1,41JbXYp7c2uuJoFB4TcQtD,"as well as several other changes, so we'll get...",256 - 512


In [6]:
# Persist the 256-word chunks as a gzip-compressed CSV (row index is written too).
sports_256.to_csv(path_or_buf='sports_word_256.csv.gz', compression='gzip')

In [7]:
# Cut every transcript into single-sentence rows, then show the result's size and first rows.
sport_sent_1 = sentence_chunk_transcript(
    cols_subset,
    name_variable='transcript',
    chunk_size=1,
)
print(sport_sent_1.shape)
sport_sent_1.head(n=2)

(5460190, 3)


Unnamed: 0,episode_id,transcript_subset,sentence_enumerated
0,41JbXYp7c2uuJoFB4TcQtD,Hello and welcome to the law review podcast.,0 - 1
1,41JbXYp7c2uuJoFB4TcQtD,My name is Nathan Church Droid by my partner c...,1 - 2


In [8]:
# Keep a random 500,000-row subset of the sentences; the fixed seed makes the draw repeatable.
sport_sent_1 = sport_sent_1.sample(n=500000, random_state=42)
sport_sent_1.shape

(500000, 3)

In [9]:
# Persist the sampled sentences as a gzip-compressed CSV (row index is written too).
sport_sent_1.to_csv(path_or_buf='sport_sent_1.csv.gz', compression='gzip')

## Pipeline for segmentation

In [18]:
# Read only the first five rows of the transcript table, then keep the second
# row (position 1) as an independent one-row frame for the segmentation pipeline.
data = pd.read_csv(
    'sports_transcripts.csv.gz',
    compression='gzip',
    nrows=5,
)
data_to_use = data.iloc[[1]].copy()
data_to_use.head()

Unnamed: 0.1,Unnamed: 0,show_id,episode_id,transcript,avg_confidence,char_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans,category,pubdate,word_count
1,31,show_2UpbOw7HZDVpHaieaeswzj,1rv5FRQHZGm2VZbyj0QZtm,Hey guys are in take a quick break here to tal...,0.846479,59985,No Extra Points - An AAF Podumentary,When the Alliance of American Football debuted...,William Renken,['en'],No Extra Points - An AAF Podumentary,"In March 2018, Charlie Ebersol announced to th...",67.653667,show_2UpbOw7HZDVpHaieaeswzj,Sports,2019-06-17,11233


In [19]:
# Split the selected transcript into one row per sentence, save it as CSV,
# and show the result's size and first rows.
prediction_data = sentence_chunk_transcript(
    data_to_use,
    name_variable='transcript',
    chunk_size=1,
)
prediction_data.to_csv(path_or_buf='prediction_data.csv')
print(prediction_data.shape)
prediction_data.head()

(468, 3)


Unnamed: 0,episode_id,transcript_subset,sentence_enumerated
0,1rv5FRQHZGm2VZbyj0QZtm,Hey guys are in take a quick break here to tal...,1 - 1
1,1rv5FRQHZGm2VZbyj0QZtm,What's holding you up?,2 - 2
2,1rv5FRQHZGm2VZbyj0QZtm,Because it's free that's always the biggest th...,3 - 3
3,1rv5FRQHZGm2VZbyj0QZtm,You need to make your podcast go.,4 - 4
4,1rv5FRQHZGm2VZbyj0QZtm,You don't have to worry about necessarily owni...,5 - 5


## Extract transcripts for annotation

In [78]:
# Load only the columns needed for annotation. Selecting them by name keeps the
# selection correct even if the file's column order changes; pandas returns
# them in file order (episode_id, transcript, word_count).
sports_trans = pd.read_csv(
    'sports_transcripts.csv.gz',
    compression='gzip',
    usecols=['episode_id', 'transcript', 'word_count'],
)
# Draw the 200 transcripts to annotate with a fixed seed, so a re-run selects
# the same episodes instead of a fresh random set.
sports_trans = sports_trans.sample(200, random_state=42)
print(f'Average word count of annotated transcripts: {sports_trans.word_count.mean():.0f}')

Average word count of annotated transcripts: 8023


In [79]:
# Write each sampled transcript to its own text file, named
# transcript<running number>_<value of the first column>.txt
# NOTE(review): files are opened in "w" mode, so re-running this cell replaces
# any file of the same name in the folder, including edited ones.
import os.path

save_path = '../Thesis/annotated_transcripts/'
# Create the target folder up front so open() cannot fail on a missing directory.
os.makedirs(save_path, exist_ok=True)

# Column 0 supplies the identifier used in the file name, column 1 the text.
for i, (episode_id, transcript_text) in enumerate(zip(sports_trans.iloc[:, 0], sports_trans.iloc[:, 1])):
    name_of_file = f'transcript{i+1}_{episode_id}'
    complete_path = os.path.join(save_path, name_of_file+".txt")

    # The context manager closes the file even if write() raises.
    with open(complete_path, "w") as text_file:
        text_file.write(transcript_text)

### Read back and output list 

In [None]:
import os

directory = '../Thesis/annotated_transcripts'

# Walk the annotation folder (including sub-folders) and collect, in parallel
# lists, each file's name and its full text.
annotated_ls, episode_ls = [], []
for folder, _subfolders, filenames in os.walk(directory):
    for filename in filenames:
        episode_ls.append(filename)
        with open(os.path.join(folder, filename)) as handle:
            annotated_ls.append(handle.read())

# The file name (extension included) is stored in the `episode_id` column.
annotated_df = pd.DataFrame({'transcript': annotated_ls, 'episode_id': episode_ls})

In [133]:
# Split every annotated transcript into one row per sentence.
df_per_transcript = sentence_chunk_transcript(
    annotated_df,
    name_variable='transcript',
    chunk_size=1,
)

In [132]:
unique_id = df_per_transcript.episode_id.unique()

# For each episode, collect the sentence numbers whose text starts with the
# '@@' marker. Records are gathered in a plain list and turned into a
# DataFrame once, instead of concatenating frames inside the loop.
annotation_records = []
# sort=False keeps episodes in order of first appearance, matching `unique_id`.
for episode_id, episode_sentences in df_per_transcript.groupby('episode_id', sort=False):
    is_annotated = episode_sentences['transcript_subset'].str.startswith('@@', na=False)
    annotated_index_ls = episode_sentences.loc[is_annotated, 'sentence_enumerated'].tolist()
    # Episodes without any marked sentence are left out of the result.
    if annotated_index_ls:
        annotation_records.append({'episode_id': episode_id, 'annotation': annotated_index_ls})

# Passing `columns` keeps both columns present even when no record was found.
annotation_values = pd.DataFrame(annotation_records, columns=['episode_id', 'annotation'])

annotation_values.head()


Unnamed: 0,episode_id,annotation
0,transcript1_6preEOWrgR9eRr938upFgv.txt,"[4, 39, 71]"
0,transcript2_7mv5E2yb2yVQU34OiQ1vqv.txt,"[37, 67]"
