In [2]:
import wget
import feedparser
import glob
import os
from pydub import AudioSegment
import io
import os
import wave
from google.cloud import storage
from google.cloud import storage
from google.cloud import speech_v1 as speech
from google.oauth2 import service_account
from collections import defaultdict
import numpy as np
import psutil
import ray
import multiprocessing
from ray.util.multiprocessing.pool import Pool # NOTE: Only the import statement is changed.


In [3]:
# RSS feed URLs that the download loop iterates over, in processing order.
RSS_Target = [
    "http://feed.thisamericanlife.org/talpodcast",
    "https://feeds.simplecast.com/54nAGcIl",
    "https://feeds.megaphone.fm/stuffyoushouldknow",
    "https://feeds.simplecast.com/kwWc0lhf",
    "https://feeds.npr.org/381444908/podcast.xml",
    "https://feeds.npr.org/510318/podcast.xml",
    "https://feeds.npr.org/510289/podcast.xml",
    "https://feeds.npr.org/510313/podcast.xml",
    "https://feeds.npr.org/510338/podcast.xml",
    "https://feeds.buzzsprout.com/258327.rss",
]

In [4]:
# Shared Cloud Storage handles used by the cells below.
# No credentials are passed explicitly -- presumably they are resolved from the
# environment (TODO confirm how this notebook is authenticated).
storage_client = storage.Client()
# Bucket handle that the download and transcript functions below read and write through.
bucket = storage_client.bucket('ee6893')

### Step 1 - Pull RSS feeds

In [19]:
# Value passed to Download_Files as target_dir by the driver loop.
# NOTE(review): despite the "local" name this is a gs:// URI, and Download_Files
# does not read its target_dir argument -- it uploads to 'final/<title>' in the
# module-level bucket instead.
local_target_dir = "gs://ee6893/final/"

def get_input(rss_len):
    """Report how many episodes a feed has and ask how many to download.

    Args:
        rss_len: Number of entries in the feed.

    Returns:
        The raw string typed by the user. It is not validated or converted
        here; see validate_input for that.
    """
    # Only a count of exactly one takes the singular form. Every other count,
    # including zero, reads correctly as "are N episodes"; the previous
    # zero-case produced "There no 0 episodes" and a negative count crashed
    # because neither word was ever assigned.
    if rss_len == 1:
        rss_text = "is"
        rss_episodes_text = "episode"
    else:
        rss_text = "are"
        rss_episodes_text = "episodes"

    # Prompt for how many files to download
    print("There " + rss_text + " " + str(rss_len) + " " + rss_episodes_text + " in this feed.")
    print("RSS feeds are ordered from most recent entries first, descending order in.")
    print("If an episode exists in the target directory the download will skip that file.")
    print("-------------------------------")
    how_many_downloaded = input("How many recent episodes to download? Enter 0 for them all. - ")
    print("-------------------------------")

    return how_many_downloaded


#Error checking for the input on How_Many and if How_Many is GT rss_length
def validate_input(how_many_num, rss_len):
    """Check that the user's answer is a whole number no larger than the feed size.

    Args:
        how_many_num: The raw string the user typed.
        rss_len: Number of entries in the feed (the largest acceptable answer).

    Returns:
        1 if the input is a non-negative integer in the range 0..rss_len,
        otherwise 0 (after printing an explanation).
    """
    # str.isdecimal() only accepts characters that int() can parse.
    # str.isnumeric() also accepts characters such as superscripts and vulgar
    # fractions, for which int() raises ValueError.
    if not how_many_num.isdecimal():
        print("INPUT ERROR - Please Enter A Number Only.")
        return 0

    # A decimal-only string cannot carry a minus sign, so only the upper bound
    # needs checking.
    if int(how_many_num) > int(rss_len):
        print("INPUT ERROR - Please Enter A Number Between 0 And " + str(rss_len))
        return 0

    return 1

def Get_Feed(rss_to_load):
    """Parse an RSS feed and count its entries.

    Args:
        rss_to_load: Feed location handed to feedparser.parse.

    Returns:
        A (parsed_feed, entry_count) tuple.
    """
    print("Getting feed - " + rss_to_load + " please wait...")
    parsed_feed = feedparser.parse(rss_to_load)
    entry_count = len(parsed_feed.entries)
    print("Feed loaded.")

    return (parsed_feed, entry_count)


def mp3_to_wav(audio_file_name):
    """Convert an MP3 file to a WAV file written next to it.

    Args:
        audio_file_name: Path of the audio file to convert.

    Returns:
        The path of the WAV file that was written, or audio_file_name
        unchanged when the file does not have an .mp3 extension (nothing is
        converted in that case).
    """
    # splitext looks only at the final extension, so names with extra dots
    # ("show.ep.12.mp3") or no dot at all are handled; indexing the result of
    # split('.') raised IndexError or skipped the conversion for those.
    stem, extension = os.path.splitext(audio_file_name)
    if extension.lower() != '.mp3':
        return audio_file_name

    wav_file_name = stem + '.wav'
    sound = AudioSegment.from_mp3(audio_file_name)
    sound.export(wav_file_name, format="wav")
    return wav_file_name
        
        
def _find_audio_href(links):
    """Return the download URL from an entry's links, or None if there is none."""
    # Prefer the link explicitly marked as the enclosure.
    for link in links:
        if link.get('rel') == 'enclosure' and link.get('href'):
            return link['href']
    # Fall back to the second link, which is where the download was assumed to be.
    if len(links) > 1:
        return links[1].get('href')
    return None


def Download_Files(target_dir, rss_feed):
    """Download every episode in a feed, convert it to WAV and upload it.

    Each episode is fetched with wget into the working directory, converted
    with mp3_to_wav, and the resulting WAV is uploaded to 'final/<title>.wav'
    in the module-level bucket. Local files are removed afterwards.

    Args:
        target_dir: Unused; kept so existing callers keep working.
        rss_feed: Parsed feed whose .entries each provide .title and .links.
    """
    for episode in rss_feed.entries:
        title = episode.title + '.wav'
        target = 'final/' + title

        audio_href = _find_audio_href(episode.links)
        if audio_href is None:
            # Entries without a download link used to raise IndexError.
            print("Skipping - " + str(title) + " (no audio link found)")
            continue

        print("Downloading - " + str(title))

        filename = wget.download(audio_href)
        # Where the converted audio is expected to land for a "<name>.mp3" download.
        wav_filename = os.path.splitext(filename)[0] + '.wav'
        try:
            mp3_to_wav(filename)
            if not os.path.exists(wav_filename):
                print("Skipping upload - " + str(filename) + " was not converted to WAV")
                continue
            # Upload the converted WAV. Previously the untouched MP3 was
            # uploaded under a .wav name and audio/x-wav content type.
            blob = bucket.blob(target)
            blob.upload_from_filename(wav_filename, content_type='audio/x-wav')
        finally:
            # Remove both the download and the converted copy so nothing
            # accumulates on local disk, even when conversion or upload fails.
            for local_path in (filename, wav_filename):
                if os.path.exists(local_path):
                    os.remove(local_path)


In [21]:
# Pull every configured feed and hand its entries to Download_Files.
for target in RSS_Target:
    # Get_Feed returns (parsed_feed, entry_count); only the parsed feed is used here.
    RSS_Feed = Get_Feed(target)
    RSS_Feed_Items = RSS_Feed[0]

    # No user prompt is involved: the whole parsed feed is passed on as-is.
    Download_Files(local_target_dir, RSS_Feed_Items)


Getting feed - https://feeds.npr.org/510318/podcast.xml please wait...
Feed loaded.
Downloading - Friday, December 3, 2021.wav
Downloading - Thursday, December 2, 2021.wav
Downloading - Wednesday, December 1, 2021.wav
Downloading - Tuesday, November 30, 2021.wav
Downloading - Monday, November 29, 2021.wav
Downloading - Investigations: CTE And Desperate Patients' "Last Hope".wav


### Step 2 - Create Transcripts

In [22]:
def write_transcripts(transcript_filename,transcript):
    """Upload a transcript to 'transcript/<transcript_filename>' in the module-level bucket.

    Args:
        transcript_filename: Object name to use under the 'transcript/' prefix.
        transcript: Transcript text to store.
    """
    print('writing...')
    blob = bucket.blob('transcript/'+transcript_filename)
    # 'txt' is not a valid MIME type; store the object as plain text so that
    # browsers and tools reading the object's metadata treat it as text.
    blob.upload_from_string(transcript, content_type='text/plain; charset=utf-8')


def transcribe(gcs_uri, language_code="en-US", sample_rate_hertz=16000):
    """Transcribe an audio object with a long-running speech recognition request.

    Args:
        gcs_uri: URI of the audio to transcribe, passed to RecognitionAudio.
        language_code: Language passed to the recognizer.
        sample_rate_hertz: Sample rate sent in the config. Pass None to leave
            the field out of the request.
            NOTE(review): the encoding is left unspecified, so the service
            presumably reads the format from the file header; confirm that a
            fixed 16000 agrees with the uploaded audio.

    Returns:
        The top alternative of every result, concatenated into one string.
        Results that carry no alternatives are skipped.
    """
    print('transcribing...')
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config_options = {
        "encoding": speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        "language_code": language_code,
        "enable_automatic_punctuation": True,
    }
    if sample_rate_hertz is not None:
        config_options["sample_rate_hertz"] = sample_rate_hertz
    config = speech.RecognitionConfig(**config_options)

    operation = client.long_running_recognize(config=config, audio=audio)

    # Blocks until the recognition operation has finished.
    response = operation.result()

    # Guard against results with an empty alternatives list, which would
    # otherwise raise IndexError and discard the whole transcript.
    return ''.join(
        format(result.alternatives[0].transcript)
        for result in response.results
        if result.alternatives
    )

def transcribe_and_write(file):
    """Transcribe one uploaded audio object and store its transcript.

    Args:
        file: Object name in the bucket, e.g. 'final/<episode>.wav'. Names
            with nothing after the first '/' (or with no '/' at all) are
            ignored.
    """
    # Keep everything after the first '/', so names without a folder no longer
    # raise IndexError and names containing further '/' are not cut short.
    audio_file_name = file.partition('/')[2]
    if not audio_file_name:
        return

    # Strip only the final extension: splitting on the first '.' turned titles
    # such as "U.S. News.wav" into "U.txt" and made transcripts collide.
    transcript_filename = os.path.splitext(audio_file_name)[0] + '.txt'
    print('producing: ', transcript_filename)
    audio_uri = "gs://ee6893/final/{}".format(audio_file_name)
    transcript = transcribe(audio_uri)
    write_transcripts(transcript_filename,transcript)
        
# Names of the uploaded audio objects to transcribe; the prefix currently
# matches a single episode.
all_files_list = [
    blob.name
    for blob in bucket.list_blobs(prefix = 'final/Friday, December 3, 2021.wav')
]

# Transcribe each listed object in turn and upload its transcript.
for file in all_files_list:
    transcribe_and_write(file)

producing:  Friday, December 3, 2021.txt
transcribing...
writing...
