In [53]:
import os
from slugify import slugify

def retrive_file_name(file_name):
    """Return the bare name of ``file_name``.

    Directory parts are removed (both backslash and forward slash count as
    separators) and everything from the first dot onwards is dropped.
    """
    # strip directories: first up to the last backslash, then up to the last slash
    without_dirs = file_name.rsplit("\\", 1)[-1].rsplit("/", 1)[-1]
    # keep only what precedes the first dot
    stem, _, _ = without_dirs.partition(".")
    return stem

def get_slugify(file_name):
    """Slugify the bare name of ``file_name`` and extract the episode number.

    Returns a ``(slug, episode_number)`` tuple; the episode number is the third
    dash-separated part of the slug.
    """
    slug = slugify(retrive_file_name(file_name))
    # the third dash-separated part of the slug is taken as the episode number
    slug_parts = slug.split("-")
    return slug, slug_parts[2]

def create_output_path(output_path):
    """Make sure the directory ``output_path`` exists and return the path unchanged.

    Missing parent directories are created as well. Calling this for a
    directory that already exists does nothing.
    """
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the directory between the check and makedirs().
    os.makedirs(output_path, exist_ok=True)
    return output_path

# Make sure the top-level "episodes" output directory exists.
create_output_path("episodes")

def get_clean_episode_number(title):
    """Return the episode number found in the third space-separated word of ``title``.

    The word is slugified first. A few slugs that are words rather than numbers
    are mapped to hard-coded episode numbers.
    """
    # slugs that are not numbers -> the episode number to use instead
    overrides = {
        "forward": "208",
        "is": "300",
        "isn-t": "340",
        "luckiest": "311",
        "rules": "341",
        "strategy": "210",
    }
    slug = slugify(title.split(" ")[2])
    return str(overrides.get(slug, slug))

### Step 2.1: Rename the downloaded files (clean up their titles) and copy them into a `clean/` subdirectory, for both the caption and full_text files
Source notebook: cleaning-titles.ipynb

In [18]:
import unicodedata
import re
import glob
import shutil

def create_clean_title(main_dir, extension=".vtt"):
    """Copy every file in ``main_dir`` ending in ``extension`` into
    ``main_dir + "clean/"`` under its slugified name.

    ``main_dir`` is joined to the other path parts by plain string
    concatenation, so it has to end with a path separator.
    """
    clean_dir = create_output_path(main_dir + "clean/")
    pattern = r'{}*{}'.format(main_dir, extension)

    for source_path in glob.glob(pattern):
        # only the slug is needed here; the episode number is ignored
        slug, _ = get_slugify(source_path)
        shutil.copy2(source_path, clean_dir + slug + extension)

    banner = "=================================================="
    print(banner)
    print("DONE: CREATED CLEAN FILES IN ", clean_dir)
    print(banner)

# Copy the caption (.vtt) and full-text (.txt) files into their clean/ subdirectories.
create_clean_title(main_dir="captions/", extension= ".vtt")
create_clean_title(main_dir="full_text/", extension= ".txt")

DONE: CREATED CLEAN FILES IN  captions/clean/
DONE: CREATED CLEAN FILES IN  full_text/clean/


### Step 2.2: For captions: take the renamed files from the previous step and clean up their contents by removing empty lines and putting each timestamp and what was said on a single line
Source notebook: combine_lines_from_cleaned.ipynb

In [65]:
import glob
import os

# episode number -> list of cleaned caption lines; filled by create_clean_caption_files()
dic = {}
# directory that is searched for *.vtt caption files
paths_to_files = "./captions/clean/"
# directory the per-episode caption text files are written to (created if missing)
output_path = create_output_path("episodes/captions/")

def _format_caption_cue(block):
    """Format one blank-line-separated caption block as ``[start] text``.

    The first line of the block is dropped, the remaining lines are joined
    with spaces, the start time loses its fractional seconds and the end time
    is removed. Returns ``None`` for a block that has no timing line or no
    text after it, so the caller can skip it instead of failing.
    """
    first_line_and_rest = block.split("\n", 1)
    if len(first_line_and_rest) < 2:
        return None
    parts = first_line_and_rest[1].replace("\n", " ").split(" --> ")
    if len(parts) < 2:
        return None
    end_time_and_text = parts[1].split(" ", 1)
    if len(end_time_and_text) < 2:
        return None
    # add [ and ] around the start timestamp, without the part after the dot
    parts[0] = "[" + parts[0].split(".")[0] + "]"
    # drop the end timestamp, keep only the text
    parts[1] = end_time_and_text[1]
    return " ".join(parts)


def create_clean_caption_files():
    """Convert every ``*.vtt`` file in ``paths_to_files`` into a text file with one line per cue.

    Reads the module-level ``paths_to_files`` and ``output_path`` and stores
    ``episode number -> list of cue lines`` in the module-level ``dic``. The
    lines are written to ``<output_path>/<episode number>.txt``.
    """
    for file_path in glob.glob(paths_to_files + "*.vtt"):
        # glob can return backslash-separated paths on Windows, so normalise the
        # separators before taking the file name; the episode number is the
        # third dash-separated part of that name
        base_name = file_path.replace("\\", "/").split("/")[-1]
        file_number = base_name.split("-")[2]

        # WebVTT files are UTF-8 by specification, so do not rely on the locale
        with open(file_path, encoding="utf-8") as f:
            # blocks are separated by blank lines; the first one is skipped
            blocks = f.read().strip().split("\n\n")[1:]

        arr = []
        for block in blocks:
            line = _format_caption_cue(block)
            if line is not None:
                arr.append(line)

        # the with-block closes the output file even if the write fails
        with open(output_path + "/" + file_number + ".txt", "w", encoding="utf-8") as text_file:
            text_file.write("\n".join(arr))
        dic[file_number] = arr
            
# Write the per-episode caption text files and fill `dic`.
create_clean_caption_files()

In [14]:
import shutil
def create_episode_folder_full_text(main_dir="full_text/", extension=".txt"):
    """Copy the cleaned files of ``main_dir`` into ``episodes/<main_dir>``, named by episode number.

    Args:
        main_dir: source directory, ending with a path separator; files are
            read from its ``clean/`` subdirectory.
        extension: extension of the files to copy, reused for the copies.
    """
    output_dir = create_output_path("episodes/" + main_dir)

    # Build the pattern from the arguments. It used to be hard-coded to
    # "./full_text/clean/*.txt", so non-default arguments changed only the
    # destination and never the files that were read.
    for file_name in glob.glob(main_dir + "clean/*" + extension):
        _, ep_number = get_slugify(file_name)
        shutil.copy2(file_name, output_dir + ep_number + extension)
# Copy the cleaned full-text files into episodes/full_text/, one file per episode number.
create_episode_folder_full_text()

### Step 3: Create hyperlinks to each episode for the React website
from create_hrefs.ipynb

In [20]:
import unicodedata
import re
import glob
import shutil
from slugify import slugify
import os

def create_hrefs(main_dir="./captions/", extension=".vtt"):
    """Write ``episodes/hrefs.txt`` with one ``<a>`` link per episode, in ascending episode order.

    Args:
        main_dir: directory, ending with a path separator, whose files are listed.
        extension: only files ending in this extension are considered.
    """
    links_by_episode = {}
    for file_name in glob.glob(r'{}*{}'.format(main_dir, extension)):
        # bare file name (no directories, nothing after the first dot), slugified,
        # with a standalone "w" spelled out as "with"
        base_name = file_name.split("\\")[-1].split("/")[-1].split(".")[0]
        new_file_name = slugify(base_name).replace("-w-", "-with-")
        slug_parts = new_file_name.split("-")
        # the third part of the slug is the episode number, the rest is the title
        ep_number = slug_parts[2]
        new_title = "Jocko Podcast Episode #" + ep_number + " - " + " ".join(slug_parts[3:])
        # keyed by int so that the sort below is numeric rather than alphabetic
        links_by_episode[int(ep_number)] = "<a href='episode/" + ep_number + "'>" + new_title + "</a>"

    # the files were not visited in episode order, so sort by episode number
    ordered_links = [link for _, link in sorted(links_by_episode.items())]
    # the with-block closes the file even if the write fails
    with open("episodes/hrefs.txt", "w") as text_file:
        text_file.write("\n".join(ordered_links))
    
# Generate episodes/hrefs.txt from the caption file names.
create_hrefs()

### Create title files

In [21]:
import unicodedata
import re
import glob
import shutil
from slugify import slugify
import os

# Create the directory for the per-episode title files (rebinds the notebook-global output_path).
output_path = create_output_path("episodes/titles")

def create_title_files(main_dir="./full_text/", extension=".txt"):
    """Write one ``episodes/titles/<episode number>.txt`` file per input file, holding the episode title.

    Args:
        main_dir: directory, ending with a path separator, whose files are listed.
        extension: only files ending in this extension are considered.
    """
    # Resolve the target directory here rather than reading the notebook-global
    # ``output_path``: other cells rebind that name to different directories, so
    # the result used to depend on which cell ran last.
    titles_dir = create_output_path("episodes/titles")

    for file_name in glob.glob(r'{}*{}'.format(main_dir, extension)):
        new_file_name, ep_number = get_slugify(file_name)
        # the title is everything in the slug after the episode number
        new_title = "Jocko Podcast Episode #" + ep_number + " - " + " ".join(new_file_name.split("-")[3:])
        # the with-block closes the file even if the write fails
        with open(titles_dir + "/" + ep_number + ".txt", "w") as text_file:
            text_file.write(new_title)

# Write the per-episode title files.
create_title_files()

### Create links to original video

In [22]:
import unicodedata
import re
import glob
import shutil
from slugify import slugify
import os

# Create the directory for the per-episode link files (rebinds the notebook-global output_path).
output_path = create_output_path("episodes/links")

def create_link_to_youtube_episode(main_dir="./full_text/", extension= ".txt"):
    """Write a placeholder link file ``episodes/links/<episode number>.txt`` for every input file.

    Each file receives the same empty anchor/image markup. Existing files of
    the same name are overwritten.

    Args:
        main_dir: directory, ending with a path separator, whose files are listed.
        extension: only files ending in this extension are considered.
    """
    # Resolve the target directory here rather than reading the notebook-global
    # ``output_path``: other cells rebind that name to different directories, so
    # the result used to depend on which cell ran last.
    links_dir = create_output_path("episodes/links")

    for file_name in glob.glob(r'{}*{}'.format(main_dir, extension)):
        _, ep_number = get_slugify(file_name)
        # empty link template with no href and no image source
        placeholder = "<a href=''><img src=''></a>"
        # the with-block closes the file even if the write fails
        with open(links_dir + "/" + ep_number + ".txt", "w") as text_file:
            text_file.write(placeholder)

## Only uncomment this if you are absolutely sure: running it overwrites any manual edits to the link files
# create_link_to_youtube_episode()

### Step 4: Summarize text

In [4]:
import glob
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
import os
from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# spaCy's English stop words, as a list
stopwords = list(STOP_WORDS)
# spaCy pipeline that get_episode_summary() runs over each transcript
nlp = spacy.load('en_core_web_sm')
# treat the newline character as punctuation too, so it is not counted as a word
punctuation = punctuation + '\n' #x

def create_word_cloud(episode_number, text):
  """Render a word cloud for ``text`` and save it as ``episodes/wordcloud/<episode_number>.jpg``.

  The cloud is also drawn on a new matplotlib figure with the axes hidden.
  """
  cloud = WordCloud(max_font_size=40).generate(text)

  # draw the cloud on a fresh figure, without axes
  plt.figure()
  plt.imshow(cloud, interpolation="bilinear")
  plt.axis("off")

  # the saved image comes from the cloud itself, not from the figure
  rendered = cloud.to_image()
  target_dir = create_output_path("episodes/wordcloud/")
  rendered.save(target_dir + episode_number + ".jpg")


def get_episode_summary(main_dir="./full_text/", extension=".txt", length=0.03):
  """Write an extractive summary of every transcript to ``episodes/summarize/<episode number>.txt``.

  Each sentence is scored by the summed, normalised frequency of its words
  (stop words and punctuation excluded). The highest-scoring sentences are
  joined with spaces, best first.

  Args:
    main_dir: directory, ending with a path separator, whose files are summarised.
    extension: only files ending in this extension are considered.
    length: fraction of a transcript's sentences to keep in the summary.
  """
  for file_name in glob.glob(r'{}*{}'.format(main_dir, extension)):
    # the with-block closes the transcript as soon as it has been read
    with open(file_name, "r") as transcript:
      text = transcript.read()
    doc = nlp(text)

    # Count words case-insensitively. The sentence scoring below looks words up
    # by their lower-cased form, so the keys have to be lower-cased as well;
    # keys that kept an upper-case letter were never found and scored nothing.
    word_frequencies = {}
    for word in doc:
      lowered = word.text.lower()
      if lowered not in stopwords and lowered not in punctuation:
        word_frequencies[lowered] = word_frequencies.get(lowered, 0) + 1

    # Normalise so the most frequent word weighs 1. The default keeps max() from
    # raising on a transcript without any countable word.
    max_frequency = max(word_frequencies.values(), default=1)
    for word in word_frequencies:
      word_frequencies[word] = word_frequencies[word] / max_frequency

    sentence_tokens = list(doc.sents)

    # a sentence's score is the sum of the weights of its counted words
    sentence_scores = {}
    for sent in sentence_tokens:
      for word in sent:
        lowered = word.text.lower()
        if lowered in word_frequencies:
          sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[lowered]

    # keep the requested fraction of sentences, highest score first
    select_length = int(len(sentence_tokens) * length)
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    summary = ' '.join([sent.text for sent in best_sentences])

    _, ep_number = get_slugify(file_name)
    output_path = create_output_path("episodes/summarize/")
    # the with-block closes the file even if the write fails
    with open(output_path + "/" + ep_number + ".txt", "w") as text_file:
      text_file.write(summary)

    # Creating a word cloud files
    #create_word_cloud(episode_number=ep_number, text=text)

# Summarize every episode, keeping 1.5% of its sentences (the word cloud call inside the function is commented out).
get_episode_summary(length=0.015)

### Link to youtube

In [50]:
import json
# Load the YouTube video metadata. The with-block closes the file right after
# reading; the bare open(...).read() left the handle open.
with open("archive/yt_metadata/videos.json", "r") as metadata_file:
    f = metadata_file.read()
res = json.loads(f)
# collects the video ids seen by create_link_files_to_youtube()
arr = []
def create_link_files_to_youtube():
    """Walk the loaded video metadata in ``res`` and append every video id to the global ``arr``.

    The episode number and the "high" thumbnail URL are looked up for each
    video, but the code that writes the link files is commented out.
    """
    for video in res:
        video_id = video["id"]
        arr.append(video_id)

        snippet = video["snippet"]
        episode_number = get_clean_episode_number(snippet["title"])
        thumbnail = snippet["thumbnails"]["high"]["url"]

        # Disabled: write the link file for this episode number.
        # new_title = "<a href='https://www.youtube.com/watch?v="+video_id+"'><img src='"+thumbnail+"'></a>"
        # output_path = create_output_path("episodes/links")
        # text_file = open(output_path+"/"+episode_number+".txt", "w")
        # text_file.write(new_title)
        # text_file.close()

# Collect the video ids into `arr`.
create_link_files_to_youtube()

## Create JSON link

In [54]:
import json
# Load the YouTube video metadata. The with-block closes the file right after
# reading; the bare open(...).read() left the handle open.
with open("archive/yt_metadata/videos.json", "r") as metadata_file:
    f = metadata_file.read()
res = json.loads(f)
# collects the video ids seen by create_json_link()
arr = []
def create_json_link():
    """Write the thumbnail base URL and the snippet of every video in ``res``.

    Two files are written per video, both named by episode number:
      * ``episodes/thumbnails/<n>.txt`` - the default thumbnail URL without its
        last path segment, as a JSON string
      * ``episodes/json/<n>.json`` - the video's ``snippet`` object as JSON
    Every video id is also appended to the global ``arr``.
    """
    for i in res:
        video_id = i["id"]
        arr.append(video_id)
        item = i["snippet"]
        episode_number = get_clean_episode_number(item["title"])
        # e.g. ".../vi/<id>/default.jpg" -> ".../vi/<id>"
        new_thumbnail = item["thumbnails"]["default"]["url"].rsplit('/', 1)[0]

        # the with-blocks close each file even if a write fails
        output_path = create_output_path("episodes/thumbnails")
        with open(output_path + "/" + episode_number + ".txt", "w") as text_file:
            text_file.write(json.dumps(new_thumbnail))

        output_path = create_output_path("episodes/json")
        with open(output_path + "/" + episode_number + ".json", "w") as text_file:
            text_file.write(json.dumps(item))

# Write the per-episode thumbnail and snippet files.
create_json_link()

In [24]:
# Inspect the top-level keys of the first video entry.
res[0].keys()

dict_keys(['kind', 'etag', 'id', 'snippet', 'contentDetails', 'status', 'statistics', 'topicDetails', 'recordingDetails'])

In [43]:
# Inspect the statistics of the first video entry.
res[0]["statistics"]

{'viewCount': '286829',
 'likeCount': '5997',
 'favoriteCount': '0',
 'commentCount': '410'}

# Fixing broken url for website when using videos.json

In [76]:
import json
# Load the YouTube video metadata. The with-block closes the file right after
# reading; the bare open(...).read() left the handle open.
with open("archive/yt_metadata/videos.json", "r") as metadata_file:
    f = metadata_file.read()
res = json.loads(f)
# collects the video entries once fix_url_episode_link() has tagged them
arr = []
def fix_url_episode_link():
    """Add an integer ``episode_number`` to every video in ``res`` and append the video to the global ``arr``.

    The entries of ``res`` are modified in place.
    """
    for video in res:
        number_text = get_clean_episode_number(video["snippet"]["title"])
        video["episode_number"] = int(number_text)
        arr.append(video)

        # # # add this to file with correct episode number
        # output_path = create_output_path("episodes/thumbnails")
        # text_file = open(output_path+"/"+episode_number+".txt", "w")
        # text_file.write(json.dumps(new_thumbnail))
        # text_file.close()

        # output_path = create_output_path("episodes/json")
        # text_file = open(output_path+"/"+episode_number+".json", "w")
        # text_file.write(json.dumps(item))
        # text_file.close()

# Tag every video with its episode number, then save the whole list as JSON.
fix_url_episode_link()
output_path = create_output_path("episodes/")
videos_json_path = output_path+"/videos.json"
with open(videos_json_path, "w") as videos_file:
    json.dump(arr, videos_file)
#text_file.write(str(arr))
#text_file.close()

In [74]:
# Reload the original metadata for inspection. The with-block closes the file
# right after reading; the bare open(...).read() left the handle open.
with open("archive/yt_metadata/videos.json", "r") as metadata_file:
    f = metadata_file.read()
res = json.loads(f)
res

[{'kind': 'youtube#video',
  'etag': 'Y6kyUMohsZwFEL6uzaT_yAbp_P8',
  'id': 'dv-OqLsV2T8',
  'snippet': {'publishedAt': '2015-12-28T22:02:15Z',
   'channelId': 'UCkqcY4CAuBFNFho6JgygCnA',
   'title': 'Jocko Podcast #1 - With Echo Charles | Leadership, Ownership, Mental Toughness',
   'description': 'Navy SEAL, Jocko Willink and Director, Echo Charles discuss leadership, ownership, and mental toughness.  Extreme Ownership.\n\nAvailable on iTunes and Stitcher.',
   'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/default.jpg',
     'width': 120,
     'height': 90},
    'medium': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/mqdefault.jpg',
     'width': 320,
     'height': 180},
    'high': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/hqdefault.jpg',
     'width': 480,
     'height': 360},
    'standard': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/sddefault.jpg',
     'width': 640,
     'height': 480},
    'maxres': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/maxresdef

In [77]:
# Load the generated episodes/videos.json for inspection. The with-block closes
# the file right after reading; the bare open(...).read() left the handle open.
with open("episodes/videos.json", "r") as metadata_file:
    f = metadata_file.read()
res = json.loads(f)
res

[{'kind': 'youtube#video',
  'etag': 'Y6kyUMohsZwFEL6uzaT_yAbp_P8',
  'id': 'dv-OqLsV2T8',
  'snippet': {'publishedAt': '2015-12-28T22:02:15Z',
   'channelId': 'UCkqcY4CAuBFNFho6JgygCnA',
   'title': 'Jocko Podcast #1 - With Echo Charles | Leadership, Ownership, Mental Toughness',
   'description': 'Navy SEAL, Jocko Willink and Director, Echo Charles discuss leadership, ownership, and mental toughness.  Extreme Ownership.\n\nAvailable on iTunes and Stitcher.',
   'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/default.jpg',
     'width': 120,
     'height': 90},
    'medium': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/mqdefault.jpg',
     'width': 320,
     'height': 180},
    'high': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/hqdefault.jpg',
     'width': 480,
     'height': 360},
    'standard': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/sddefault.jpg',
     'width': 640,
     'height': 480},
    'maxres': {'url': 'https://i.ytimg.com/vi/dv-OqLsV2T8/maxresdef