In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the two gzip-compressed transcript CSV dumps and the tab-separated metadata table.
transcriptsv1 = pd.read_csv(
    "transcripts_dataset_v1.csv.gz",
    compression="gzip",
)
transcriptsv2 = pd.read_csv(
    "transcripts_dataset_v2.csv.gz",
    compression="gzip",
)
metadata = pd.read_csv("metadata.tsv", sep="\t")

In [4]:
# Sanity checks on the three raw frames. Only the metadata null check is currently
# enabled; the shape, column and transcript null checks below are commented out.
# print('Transcript v1 shape:', transcriptsv1.shape)
# print('Transcript v2 shape:', transcriptsv2.shape)
# print('Metadata shape:', metadata.shape, '\n')

# print(list(transcriptsv1.columns))
# print(list(transcriptsv2.columns))
# print(list(metadata.columns))

# print('\n')

# print('Transcript v1 null check:\n', transcriptsv1.isna().sum(), '\n')
# print('Transcript v2 null check:\n', transcriptsv2.isna().sum(), '\n')
print('Metadata null check:\n', metadata.isna().sum()) # show_description 2 nulls, episode_description 205 nulls


Metadata null check:
 show_uri                     0
show_name                    0
show_description             2
publisher                    0
language                     0
rss_link                     0
episode_uri                  0
episode_name                 0
episode_description        205
duration                     0
show_filename_prefix         0
episode_filename_prefix      0
dtype: int64


In [5]:
# Stack the two transcript dumps into a single frame.
# The shape is printed before the leftover "Unnamed: 0" column is dropped below.
transcripts = pd.concat([transcriptsv1, transcriptsv2])
print(transcripts.shape)

# Rename the filename-prefix columns first, then keep only the columns needed downstream.
# DataFrame.rename ignores labels that are absent, so re-running this cell after the
# columns were already renamed no longer fails with a KeyError on the old names.
# The final step strips surrounding whitespace from the join key.
metadata = (
    metadata
    .rename(columns={"episode_filename_prefix": "episode_id", "show_filename_prefix": "show_id"})
    .loc[:, ["show_name", "show_description", "publisher", "language", "episode_name", "episode_description", "duration", "show_id", "episode_id"]]
    .assign(episode_id=lambda frame: frame["episode_id"].str.strip())
)

# Drop the index column that was written into the CSV dumps.
transcripts = transcripts.drop("Unnamed: 0", axis=1)

# Remove the ".json" file extension from the episode id. The pattern is anchored to the
# end of the string so an id that merely contains ".json" somewhere else is left intact.
transcripts["episode_id"] = transcripts["episode_id"].str.replace(r"\.json$", "", regex=True)

(105360, 6)


In [9]:
# Attach the metadata to every transcript row by looking up episode_id in a
# metadata frame indexed on that key. Both frames carry a show_id column; the
# copy coming from metadata is the one that receives the "_trans" suffix.
full_dataset = transcripts.join(
    other=metadata.set_index("episode_id"),
    on="episode_id",
    rsuffix="_trans",
)
# Per-column count of missing values after the join.
full_dataset.isna().sum()

show_id                  0
episode_id               0
transcript               0
avg_confidence           0
word_count               0
show_name                0
show_description         2
publisher                0
language                 0
episode_name             0
episode_description    205
duration                 0
show_id_trans            0
dtype: int64

In [10]:
# Shape of the joined frame; the row count should equal that of the concatenated
# transcripts if the join did not duplicate or drop rows.
print(full_dataset.shape)
# full_dataset.head(5)

(105360, 13)


In [11]:
# Inspect the rows that contain at least one missing value.
# The gaps come from metadata discrepancies; episode_id is present for every row.
full_dataset.loc[full_dataset.isna().any(axis="columns")]


Unnamed: 0,show_id,episode_id,transcript,avg_confidence,word_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans
30,show_2ULvmJxH0wTNX5CaQJmUyV,42haWkOrz8KxEx2CTalUH6,So I had welcome to the intro to co-hosted was...,0.865214,553,Co-hosted,Just a couple of silly beatches,Had and Todd,['en'],Co-hosted (Trailer),,0.719800,show_2ULvmJxH0wTNX5CaQJmUyV
2371,show_2fzPBSBA3du5UmaQJyC9X3,2fX8reAas9uyOBiRQXqrLk,Murder mysteries clicky Mean Girls in every ad...,0.836465,424,we peaked in middle school,"a gossipy gab sesh with me, sarah critchfield,...",Sarah Critchfield,['en'],we peaked in middle school (Trailer),,0.521667,show_2fzPBSBA3du5UmaQJyC9X3
2873,show_2lKMza7YzMlZwwtpKAxhEG,27btNgfFE6wQ0Vm0hgfMGy,"Good morning, guys, welcome to my podcast. My ...",0.838601,555,4 Chiacchiere In Italian,"Learn the italian language, and why not, fall ...",Chiara Beletti,['en'],4 Chiacchiere In Italian (Trailer),,0.568883,show_2lKMza7YzMlZwwtpKAxhEG
2920,show_2lkvIKyO02biZmvyD6aoXE,0PKQrWflenISdCCqCoq7G0,And welcome to IGCSE success podcast. This is ...,0.849888,489,IGCSE English Podcast,This podcast is for IGCSE students where they ...,Ashish Joseph,['en'],IGCSE English Podcast (Trailer),,0.904800,show_2lkvIKyO02biZmvyD6aoXE
3136,show_292bqsU54qjpmBLcgFzryi,60lYlwU21MnJoFE5WfGUgg,"Hey everybody, it's your girl Charmy Daniels a...",0.823626,768,Tomi Talks,A podcast hosted by Tomi Daniels. Talking abou...,Tomi Daniels,['en'],Tomi Talks (Trailer),,0.914867,show_292bqsU54qjpmBLcgFzryi
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60799,show_6qaQLSBnxafZ1AfDy45hWs,5HErT257TuR8fHhNQgynZ5,"Hey, this is Katie draft corn and you're liste...",0.739846,696,Rough Draff,a constant work in progress,Katie Draffkorn,['en'],Rough Draff (Trailer),,0.803417,show_6qaQLSBnxafZ1AfDy45hWs
62117,show_6rUa8ruUHI2kl7DjyzxBdw,36uhfvspHI1lsjsdfJ0xlz,Have you ever wondered what it's like to be pr...,0.860390,640,30 and Pregnant,"One woman’s journey through pregnancy, week by...",Abby,['en'],30 and Pregnant (Trailer),,0.770133,show_6rUa8ruUHI2kl7DjyzxBdw
62627,show_67BODpnvXMZG9eQGI73QEN,6Bwsu6SUBv6X1THWW6GAAS,Doggy style or fixation bondage sexual prefere...,0.842189,502,"Sex,Sex&more Sex",Welcome,Lucky Libra Lady,['en'],"Sex,Sex &more Sex talk",,0.698917,show_67BODpnvXMZG9eQGI73QEN
62996,show_6iE6GxXzkcF4JRnzk8jZy7,0gbP8kTwl9g3hDQwgfiQqR,Hey guys. I'm the OE here. Thank you so much f...,0.839478,7914,Em the yogi,Sharing my experiences with yoga and travel am...,Em the yogi,['en-US'],This is just the beginning,,10.467167,show_6iE6GxXzkcF4JRnzk8jZy7


### Correct mistake in word count from directory walk

In [None]:
# The "word_count" column coming from the directory walk actually holds character
# counts, so it is moved to "char_count". The guard keeps the cell safe to re-run:
# without it, a second run would rename the freshly computed word_count as well and
# leave the frame with two columns both named "char_count".
if "char_count" not in full_dataset.columns:
    full_dataset = full_dataset.rename(columns={"word_count": "char_count"})

# Count words as the number of single-space-separated tokens in each transcript.
full_dataset["word_count"] = full_dataset["transcript"].str.split(" ").str.len()

In [12]:
# Persist the joined dataset as a gzip-compressed CSV, leaving the row index out of the file.
full_dataset.to_csv(
    "transcripts_dataset_final.csv.gz",
    index=False,
    compression="gzip",
)

### Function for splitting one document into smaller documents

In [1]:
# Column names, presumably for a frame of per-episode transcript chunks; not used
# in the visible part of the notebook -- TODO confirm against the code that follows.
columns = ["episode_id", "transcript_subset", "words_enum"]
# Empty accumulator list; presumably filled with per-episode chunk rows later -- TODO confirm.
episode_ls = []

def chunk_transcritp(transcript, chunk_size=200):
    """Split a transcript into consecutive chunks of at most ``chunk_size`` words.

    Words are the tokens obtained by splitting on a single space, so joining the
    returned chunks with a single space reproduces the original transcript.

    Parameters
    ----------
    transcript : str
        Text to split.
    chunk_size : int, default 200
        Maximum number of words per chunk. Must be positive.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(transcript_ls, words_enum_ls)``: the chunk texts and, for each chunk, a
        ``"start - end"`` label giving the half-open word-index range it covers.
        The end of the last label is clamped to the real number of words rather
        than overshooting to the next multiple of ``chunk_size``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A zero step makes range() raise and a negative one silently yields no chunks.
        raise ValueError("chunk_size must be a positive integer")

    words = transcript.split(" ")
    total_words = len(words)

    transcript_ls = []
    words_enum_ls = []
    for start in range(0, total_words, chunk_size):
        # Clamp so the label of the final, possibly shorter, chunk stays accurate.
        end = min(start + chunk_size, total_words)
        transcript_ls.append(" ".join(words[start:end]))
        words_enum_ls.append(f"{start} - {end}")
    return (transcript_ls, words_enum_ls)