In [2]:
import pandas as pd

In [3]:
transcriptsv1 = pd.read_csv("transcripts_dataset_v1.csv.gz", compression="gzip")
transcriptsv2 = pd.read_csv("transcripts_dataset_v2.csv.gz", compression="gzip")
metadata = pd.read_csv("metadata.tsv", sep= "\t")

In [7]:
# print('Transcipt v1 shape:', transcriptsv1.shape)
# print('Transcipt v2 shape:', transcriptsv2.shape)
print('Metadata shape:', metadata.shape, '\n')

# print(list(transcriptsv1.columns))
# print(list(transcriptsv2.columns))
print(list(metadata.columns))

# print('\n')

# print('Transcript v1 null check:\n', transcriptsv1.isna().sum(), '\n')
# print('Transcript v2 null check:\n', transcriptsv2.isna().sum(), '\n')
print('Metadata null check:\n', metadata.isna().sum()) # show_name 2 nulls, episode_description 205 nulls


Metadata shape: (105360, 9) 

['show_name', 'show_description', 'publisher', 'language', 'episode_name', 'episode_description', 'duration', 'show_id', 'episode_id']
Metadata null check:
 show_name                0
show_description         2
publisher                0
language                 0
episode_name             0
episode_description    205
duration                 0
show_id                  0
episode_id               0
dtype: int64


In [5]:
# concatenate transcripts dataframes
transcripts = pd.concat([transcriptsv1, transcriptsv2])
print(transcripts.shape)

# subset metadata
metadata = metadata[["show_name", "show_description", "publisher", "language", "episode_name", "episode_description", "duration", "show_filename_prefix", "episode_filename_prefix"]]
metadata = metadata.rename({"episode_filename_prefix": "episode_id", "show_filename_prefix": "show_id"}, axis="columns")  # rename cols

# remove episode_id suffix 
transcripts.episode_id = transcripts.episode_id.apply(lambda x: x.rstrip(".json"))
transcripts = transcripts.drop("Unnamed: 0", axis=1)

(105360, 6)


In [6]:
metadata.nunique()

show_name               18290
show_description        18321
publisher               17490
language                   20
episode_name           103660
episode_description    100878
duration                90916
show_id                 18376
episode_id             105360
dtype: int64

In [8]:
# join data
full_dataset = transcripts.join(metadata.set_index("episode_id"), on="episode_id", rsuffix="_trans")
full_dataset.isna().sum()

# full_dataset.head(5)

Unnamed: 0,show_id,episode_id,transcript,avg_confidence,word_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans
0,show_2uE1HD7Mhar1BXrtbDTYXr,2MlANNCG8ByQl3yKo8YV33,"Hey guys, so like I'm in the middle of editing...",0.862924,20912,The Get Connected Podcast,"Getting Connected on all things Media, Mental ...",Morty,['en-AU'],Am I Liked?,"Am I liked? Well, living a life on social med...",29.238567,show_2uE1HD7Mhar1BXrtbDTYXr
1,show_2uz3xaiifukqKpvLukWcJI,41JbXYp7c2uuJoFB4TcQtD,Hello and welcome to the law review podcast. M...,0.846363,31611,The Lower View,The Lower View aims to provide an informed tak...,Nate Schertz,['en'],"MLB Sign Stealing, Bears Coaching, and Super B...",Welcome into The Lower View Podcast! In our fi...,32.394817,show_2uz3xaiifukqKpvLukWcJI
2,show_2uECdgbgaRvpmToGn0En9T,6aF8uQQZvPgYiEEGoW3JDt,"Hey guys, what's going on have a chat with Mat...",0.80206,5635,Chat With Matt,Have a chat with Matt. Matt’s a funny guy and ...,Matthew Manca,['en'],Adjust your mindset - SEASON 1 - EPISODE 2,What are you focusing on? I know there’s a mil...,6.366517,show_2uECdgbgaRvpmToGn0En9T
3,show_2uECdgbgaRvpmToGn0En9T,1q33HrnjW5R2vxT2lpIMrB,Have a chat with Matt guys. Welcome to series ...,0.840966,2370,Chat With Matt,Have a chat with Matt. Matt’s a funny guy and ...,Matthew Manca,['en'],The Intro - Have A Chat With Matt,Sydney based personal trainer started running ...,2.962867,show_2uECdgbgaRvpmToGn0En9T
4,show_2UcEmfC2NsNA3Eqk0uTa81,6vJuwoSXxdYYoWTmGz2URL,Hello and welcome to shrink Matt a podcast cre...,0.820963,29349,Shrink Rapt,Shrink Rapt is a fortnightly podcast created s...,Thalamos,['en'],Episode 1 - Dr Lesley Haines - PIPSIG Chair,Dr Lesley Haines is a Consultant Psychiatrist ...,31.80825,show_2UcEmfC2NsNA3Eqk0uTa81


In [9]:
# Null inspection
full_dataset[full_dataset.isnull().any(axis=1)] # metadata discrepancies, episode_id present for all instances


Unnamed: 0,show_id,episode_id,transcript,avg_confidence,word_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans
14,show_2UyVUlVZBrZxOPcNeHO59D,0YzMxGglOuWHsxDMLxKox,Hi and welcome to combos with Chloe. So today ...,0.824201,24646,,,,,,,,
30,show_2ULvmJxH0wTNX5CaQJmUyV,42haWkOrz8KxEx2CTalUH6,So I had welcome to the intro to co-hosted was...,0.865214,553,Co-hosted,Just a couple of silly beatches,Had and Todd,['en'],Co-hosted (Trailer),,0.7198,show_2ULvmJxH0wTNX5CaQJmUyV
44,show_2upl2yQZNxJugMYhjLinCY,7iVr2CPSrwhoiv8kGbKPc,"Yes, a recording. Okay, do-do do-do-do-do-do d...",0.853137,737,,,,,,,,
53,show_2UfNx2S4okbvJT8qffy74X,6OTFO8XFjyiUns9cnDcqA,You're tuning in to the badass business podcas...,0.857091,50770,,,,,,,,
63,show_2UfNx2S4okbvJT8qffy74X,5lTPEWcNPtI6LuXwYttQQ,You guys it's my birthday today. Can you belie...,0.841047,17257,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65421,show_6OiCYiv2wC3f4Md3yl5TCR,6Rh4kq2ww5LuTid2eIJN6,If you're listening to this you obviously like...,0.808037,42263,,,,,,,,
65427,show_6OiCYiv2wC3f4Md3yl5TCR,6JPHJPrxOeuAlXTUUGCUu,Inside the birds is back. What's going on? Eve...,0.826048,35175,,,,,,,,
65430,show_6OiCYiv2wC3f4Md3yl5TCR,2UXSLrzvyyVjaERnZEGe1,Inside the birds is back and we are back a lit...,0.827634,41261,,,,,,,,
65442,show_6OTHlO3TMwKLLQL1QZunKh,6njm7CisnRzDIm5iZdxBl,This is the L2 Capital podcast with hedge fund...,0.800108,22741,,,,,,,,


In [10]:
# Save the DataFrame as a gzip-compressed CSV file
full_dataset.to_csv('transcripts_dataset_final.csv.gz', compression='gzip', index=False)

In [None]:
# # Impute missing values
# transcripts.at[39887, "episode_id"] = "3EAicvUXZULEPKfcj6zn.json"
# transcripts.at[39888, "episode_id"] = "4LRKXIzS3cPFOOaSIsuQsB.json"
# transcripts.at[39889, "episode_id"] = "3qc2QKVtz3qLYwnliohJwW.json"
# transcripts.at[39890, "episode_id"] = "1ZaDV9eQKQSI9a20Cxnd9s.json"
# transcripts.at[39891, "episode_id"] = "2QwTaS7Rqcq891jZ3lilFB.json"

# # not needed