In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the two transcript exports and the episode metadata, all gzip-compressed CSVs.
transcriptsv1, transcriptsv2, metadata = (
    pd.read_csv(csv_path, compression="gzip")
    for csv_path in (
        "transcripts_dataset_v1.csv.gz",
        "transcripts_dataset_v2.csv.gz",
        "metadata.csv.gz",
    )
)

In [7]:
# Report the (rows, columns) shape of each raw input frame, then the per-column
# count of missing values in the metadata frame.
print('Transcipt v1 shape:', transcriptsv1.shape)
print('Transcipt v2 shape:', transcriptsv2.shape)
print('Metadata shape:', metadata.shape, '\n')

# Disabled: column listings for each frame.
# print(list(transcriptsv1.columns))
# print(list(transcriptsv2.columns))
# print(list(metadata.columns))

# print('\n')

# Disabled: null checks for the two transcript frames.
# print('Transcript v1 null check:\n', transcriptsv1.isna().sum(), '\n')
# print('Transcript v2 null check:\n', transcriptsv2.isna().sum(), '\n')
print('Metadata null check:\n', metadata.isna().sum()) # recorded run: show_description 2 nulls, episode_description 205 nulls, category 6000, pubdate 20030


Transcipt v1 shape: (39892, 6)
Transcipt v2 shape: (65468, 6)
Metadata shape: (105360, 14) 

Metadata null check:
 show_uri                       0
show_name                      0
show_description               2
publisher                      0
language                       0
rss_link                       0
episode_uri                    0
episode_name                   0
episode_description          205
duration                       0
show_filename_prefix           0
episode_filename_prefix        0
category                    6000
pubdate                    20030
dtype: int64


In [8]:
# Stack the two transcript exports into a single frame. Row labels are carried over
# from each input (ignore_index is not set), so index values repeat across the halves.
transcripts = pd.concat([transcriptsv1, transcriptsv2])
print(transcripts.shape)

# Rename the key columns first, then select by the *new* names. On a re-run the
# rename is a no-op (missing labels are ignored) and the selection still succeeds,
# so this cell can be executed more than once. Selecting by the old
# *_filename_prefix names before renaming raised KeyError on a second run.
metadata = metadata.rename(
    columns={"episode_filename_prefix": "episode_id", "show_filename_prefix": "show_id"}
)
metadata = metadata[
    [
        "show_name",
        "show_description",
        "publisher",
        "language",
        "episode_name",
        "episode_description",
        "duration",
        "show_id",
        "episode_id",
        "category",
        "pubdate",
    ]
]

# Strip the literal ".json" from transcript episode ids (regex=False so the dot is
# not treated as a wildcard) and drop the "Unnamed: 0" column -- presumably an index
# column saved by an earlier to_csv; confirm against the export step.
transcripts = transcripts.assign(
    episode_id=transcripts["episode_id"].str.replace(".json", "", regex=False)
).drop(columns="Unnamed: 0")

# Trim surrounding whitespace from the metadata ids before they are used as a join key.
metadata = metadata.assign(episode_id=metadata["episode_id"].str.strip())

(105360, 6)


In [9]:
# Attach metadata to every transcript row, matching the transcripts' episode_id column
# against metadata indexed by episode_id. Columns that exist on both sides keep their
# name for the transcript copy; the metadata copy gets the "_trans" suffix.
full_dataset = transcripts.join(
    other=metadata.set_index(keys="episode_id"),
    on="episode_id",
    how="left",
    rsuffix="_trans",
)
# Per-column count of missing values after the join.
full_dataset.isnull().sum()

show_id                    0
episode_id                 0
transcript                 0
avg_confidence             0
word_count                 0
show_name                  0
show_description           2
publisher                  0
language                   0
episode_name               0
episode_description      205
duration                   0
show_id_trans              0
category                6000
pubdate                20030
dtype: int64

In [10]:
# Shape of the joined frame. In the recorded run the row count equals that of the
# concatenated transcripts, i.e. the join neither dropped nor duplicated rows.
print(full_dataset.shape)
# full_dataset.head(5)

(105360, 15)


In [11]:
# Display every row that has a missing value in at least one column.
# Finding from the recorded run: the gaps are metadata discrepancies; episode_id is
# present for all instances.
full_dataset.loc[full_dataset.isna().any(axis="columns")]


Unnamed: 0,show_id,episode_id,transcript,avg_confidence,word_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans,category,pubdate
5,show_2uf4dRlrtFgANIXtmsNS7H,22Hee4IumtEZ3PiHp21m8V,Hayward we did it step one done. Hello. Welcom...,0.834999,17174,The Reng,The funniest thing you’ve ever heard- from ver...,Alex Rengers,['en'],The Reng Podcast |Ep. 1|,"First ever podcast by you'rs truly, Alex Renge...",24.119733,show_2uf4dRlrtFgANIXtmsNS7H,,
7,show_2UyVUlVZBrZxOPcNeHO59D,3uZtH30AGm11sJEsBEZNqm,Hello and welcome to combos with Chloe. Today....,0.829958,15372,"Convos With Chloe: God, Faith, Dating & Relati...",Why are relationships so hard? Why do I worry ...,Chloe M. Gooden,['en'],When It Feels Like God Isn't There,Have you ever felt like God wasn't with you in...,15.790733,show_2UyVUlVZBrZxOPcNeHO59D,Religion & Spirituality,
8,show_2UyVUlVZBrZxOPcNeHO59D,7yME8WJllFMuj7rbBtFc5t,Hello and welcome to combos with Chloe. So tod...,0.823783,23793,"Convos With Chloe: God, Faith, Dating & Relati...",Why are relationships so hard? Why do I worry ...,Chloe M. Gooden,['en'],"When He Says ""He Isn't Ready for a Relationshi...",Ugh! That frustrating sentence that we all hat...,22.637900,show_2UyVUlVZBrZxOPcNeHO59D,Religion & Spirituality,
9,show_2UyVUlVZBrZxOPcNeHO59D,0jSpyAIsU7R8BQ5rP5EUgy,Hello and welcome to combos with Chloe. So tod...,0.824384,21046,"Convos With Chloe: God, Faith, Dating & Relati...",Why are relationships so hard? Why do I worry ...,Chloe M. Gooden,['en'],Why We Go Back to Guys We Know Aren't Good For Us,How many times have you gone back to an ex tha...,20.473417,show_2UyVUlVZBrZxOPcNeHO59D,Religion & Spirituality,
11,show_2UyVUlVZBrZxOPcNeHO59D,2pQLJKsfwaUtduKbRl1TrW,Hello and welcome to combos with Chloe. So tod...,0.820536,16028,"Convos With Chloe: God, Faith, Dating & Relati...",Why are relationships so hard? Why do I worry ...,Chloe M. Gooden,['en'],How to Be at Peace with an Unknown Future,I hate not knowing what will happen next in my...,15.770217,show_2UyVUlVZBrZxOPcNeHO59D,Religion & Spirituality,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65346,show_6Ov00yhKCAgR6BLomUF6yY,2fqISbi9SIiBeMTOLf1Rgr,Hello guys and welcome to the Frozen moat podc...,0.809614,27365,Frozen Milk,Welcome to the Frozen Milk podcast! Here we ta...,Cody Larsen,['en'],Doobs Talk Pt. 1,Me and some beautiful goons named Cameron Chri...,29.850800,show_6Ov00yhKCAgR6BLomUF6yY,,
65450,show_6OUisS4N6LZEwpglRdgu4w,6P0Ydyh5JWPPrbUkXS4kFo,Welcome to the sparkling podcast where you wil...,0.836591,18616,Sparklink Podcast,Click play to listen to talented young adults ...,Ashna Reddy,['en'],Episode #1: Improving Memory w/ Humm,Ever wondered if technology could improve our ...,16.199400,show_6OUisS4N6LZEwpglRdgu4w,Education,
65454,show_6o5SbgWOjYKdmButHAQ0Q6,5H3cwiOfAbchuvC1ZRcOPS,"Hello, good day. Welcome to my podcast on this...",0.771048,1440,Charto Podcast,"Well, I’m here to inform you that today is not...",Sixvalo,['en'],Sp 2 Complete Guide to Advertising on Instagram,Instagram advertising is method of paying to p...,2.236467,show_6o5SbgWOjYKdmButHAQ0Q6,,
65455,show_6o5SbgWOjYKdmButHAQ0Q6,0oTaZBWOdMXw2JOrNHwlgg,"Good morning, welcome to my podcast. Happy Sun...",0.823880,2712,Charto Podcast,"Well, I’m here to inform you that today is not...",Sixvalo,['en'],Sp 1 Complete Guide to Advertising on Instagram,So you’re running paid search and display ads ...,4.798783,show_6o5SbgWOjYKdmButHAQ0Q6,,


### Correct mistake in word count from directory walk

In [12]:
# Per the original author's note, the incoming "word_count" column actually holds a
# character count, so relabel it as "char_count". The guard keeps the cell re-runnable:
# without it, a second run would rename the freshly computed word_count as well and
# leave two columns called "char_count".
if "char_count" not in full_dataset.columns:
    full_dataset = full_dataset.rename(columns={"word_count": "char_count"})

# Count words by splitting on any run of whitespace. Splitting on a single " " yields
# empty tokens for repeated/leading/trailing spaces and does not break on newlines or
# tabs, which miscounts words.
full_dataset["word_count"] = full_dataset["transcript"].str.split().str.len()

In [13]:
# Persist the joined dataset as a gzip-compressed CSV; the row index is not written.
full_dataset.to_csv(
    "transcripts_dataset_final.csv.gz",
    index=False,
    compression="gzip",
)

In [1]:
import pandas as pd
# Load the gzip-compressed sample extract, report its shape, and preview the first rows.
# NOTE(review): no cell visible in this notebook writes transcripts_sample.csv.gz --
# confirm where the sample is produced.
sample_df = pd.read_csv("transcripts_sample.csv.gz", compression="gzip")
print(sample_df.shape)
sample_df.head(5)

(15000, 14)


Unnamed: 0,show_id,episode_id,transcript,avg_confidence,char_count,show_name,show_description,publisher,language,episode_name,episode_description,duration,show_id_trans,word_count
0,show_1jUyEaMpjfOYjcwPCdEaec,35xHkzb4wNPpqipwjAkmDI,Hello and welcome along to the property Academ...,0.820831,13516,The Property Academy Podcast,The Property Academy Podcast is a daily show t...,Opes Partners,['en'],The Additional Costs Associated with Airbnb | ...,"In this episode, we discuss the additional cos...",13.906433,show_1jUyEaMpjfOYjcwPCdEaec,2507
1,show_0u6NNu3ZyZHyn888FD3WdE,5jYwyaLp8PDnQondFv77kC,"Good morning, everyone. This is Trinity here a...",0.818924,25849,TheProdcast,The Prodcast - all about the stars behind stel...,TheProdcast,['en'],Episode 6: How tech meets travel—redefining va...,Travel—the very word is enough to instil energ...,25.466783,show_0u6NNu3ZyZHyn888FD3WdE,4834
2,show_6KLpvCAxrVzbsnBnRs8O4I,12UFlPPdjCBpFibZQnnwLe,"Hey guys, it's Peter fry and welcome to the li...",0.805737,4894,Living with Hope Podcast with Peter Frey,Welcome to the Living with Hope podcast with P...,Peter Frey,['en'],IMMEASURABLY MORE | Ephesians 3:14-21 | Living...,Paul's prayer for the Ephesians guides us toda...,6.436567,show_6KLpvCAxrVzbsnBnRs8O4I,998
3,show_1HvChDzJwUYPX4YU7JJ5Aj,3NfJNHjBIW6IMsg8gGN9Th,"Hey afterbuzzers, before we move on to your ne...",0.819311,20993,The Good Place After Show Podcast,If philosophical discussions on life and the a...,AfterBuzz TV,['en-US'],"""The Funeral to End All Funerals"" Season 4 Epi...",Good Janet and Bad Janet unite?! And the Judge...,23.0818,show_1HvChDzJwUYPX4YU7JJ5Aj,4080
4,show_6rUa8ruUHI2kl7DjyzxBdw,36uhfvspHI1lsjsdfJ0xlz,Have you ever wondered what it's like to be pr...,0.86039,640,30 and Pregnant,"One woman’s journey through pregnancy, week by...",Abby,['en'],30 and Pregnant (Trailer),,0.770133,show_6rUa8ruUHI2kl7DjyzxBdw,120


In [20]:
# Language tags to drop from the sample. Values in the language column are
# stringified lists (e.g. "['en']"), so the tags are written in that same form.
languages_not_keep = [
    "['nl-BE']",
    "['hi']",
    "['id']",
    "['ml']",
    "['ms']",
    "['pt']",
    "['en-JM']",
    "['en-IN']",
    "['ga']",
    "['ta']",
    "['es']",
]
# Every tag that occurs in the sample and is not on the exclusion list.
languages = list(set(sample_df["language"].unique()).difference(languages_not_keep))

In [21]:
# Keep only the rows whose language tag is in the `languages` allow-list, then
# report the remaining shape and how many rows each tag contributes.
sample_df = sample_df.loc[sample_df.language.isin(languages)]
print(sample_df.shape)
sample_df["language"].value_counts()

(14977, 14)


['en']       11326
['en-US']     2909
['en-GB']      341
['en-AU']      173
['en-CA']      139
['en-PH']       43
['en-IE']       26
['en-NZ']       12
['en-ZA']        8
Name: language, dtype: int64