In [1]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os
import pandas as pd
import json
import regex as re
from urllib import request
import zipfile
import glob

In [2]:
VOCAB_SIZE = 10000
SEQUENCE_LENGTH = 2500
BATCH_SIZE = 12

In [3]:
dataset_path = os.path.join(os.path.abspath(""), 'podcasts-no-audio-13GB')


In [4]:
metadata_path_train = os.path.join(dataset_path, 'metadata.tsv')
metadata_train = pd.read_csv(metadata_path_train, sep='\t')
print("Columns: ", metadata_train.columns)
print("Shape: ", metadata_train.shape)

Columns:  Index(['show_uri', 'show_name', 'show_description', 'publisher', 'language',
       'rss_link', 'episode_uri', 'episode_name', 'episode_description',
       'duration', 'show_filename_prefix', 'episode_filename_prefix'],
      dtype='object')
Shape:  (105360, 12)


In [5]:
i = 0
episode_example_train = metadata_train.iloc[i]
print(episode_example_train)
print("\nCopy this uri into the browser to listen to the episode:\n",
      episode_example_train['episode_uri'])


show_uri                                 spotify:show:2NYtxEZyYelR6RMKmjfPLB
show_name                                               Kream in your Koffee
show_description           A 20-something blunt female takes on the world...
publisher                                                        Katie Houle
language                                                              ['en']
rss_link                            https://anchor.fm/s/11b84b68/podcast/rss
episode_uri                           spotify:episode:000A9sRBYdVh66csG2qEdj
episode_name                                         1: It’s Christmas Time!
episode_description        On the first ever episode of Kream in your Kof...
duration                                                           12.700133
show_filename_prefix                             show_2NYtxEZyYelR6RMKmjfPLB
episode_filename_prefix                               000A9sRBYdVh66csG2qEdj
Name: 0, dtype: object

Copy this uri into the browser to listen to the epis

In [6]:
def get_path(episode):
    # extract the 2 reference number/letter to access the episode transcript
    show_filename = episode['show_filename_prefix']
    episode_filename = episode['episode_filename_prefix'] + ".json"
    dir_1, dir_2 = re.match(r'show_(\d)(\w).*', show_filename).groups()

    interval_folders = [range(0, 3), range(3, 6), range(6, 8)]

    # check which is the main folder containing the transcript
    main_dir = ""
    for interval in interval_folders:
        if int(dir_1) in interval:
            main_dir = "podcasts-transcripts-{}to{}".format(interval[0],
                                                            interval[-1])
    assert main_dir != ""

    # check if the transcript file in all the derived subfolders exist
    transcipt_path = os.path.join(dataset_path, "spotify-podcasts-2020",
                                "podcasts-transcripts", dir_1, dir_2,
                                show_filename, episode_filename)

    return transcipt_path

In [7]:
i = 0
while not os.path.isfile(get_path(metadata_train.iloc[i])):
    i = random.randint(0,10000)
    print(get_path(metadata_train.iloc[i]))

In [8]:
episode = metadata_train.iloc[i]
episode

show_uri                                 spotify:show:2NYtxEZyYelR6RMKmjfPLB
show_name                                               Kream in your Koffee
show_description           A 20-something blunt female takes on the world...
publisher                                                        Katie Houle
language                                                              ['en']
rss_link                            https://anchor.fm/s/11b84b68/podcast/rss
episode_uri                           spotify:episode:000A9sRBYdVh66csG2qEdj
episode_name                                         1: It’s Christmas Time!
episode_description        On the first ever episode of Kream in your Kof...
duration                                                           12.700133
show_filename_prefix                             show_2NYtxEZyYelR6RMKmjfPLB
episode_filename_prefix                               000A9sRBYdVh66csG2qEdj
Name: 0, dtype: object

In [9]:
def get_transcription(episode):
    with open(get_path(episode), 'r') as f:
        episode_json = json.load(f)
        # seems that the last result in each trastcript is a repetition of the first one, so we ignore it
        transcripts = [
            result["alternatives"][0]['transcript'] if 'transcript' in result["alternatives"][0] else ""
            for result in episode_json["results"][:-1]
        ]
        return " ".join(transcripts)

In [10]:
link_removal_pattern = re.compile(
    r"([\w\s:\-\p{So}]*((https:|www\.).*)$|---.*$)")

metadata_train.show_description = metadata_train.iloc[:10].show_description.apply(lambda desc: link_removal_pattern.sub("", str(desc)))
for row, val in metadata_train.show_description.iloc[:10].iteritems():
    print(row, val)

0 A 20-something blunt female takes on the world and gives you her take on it. Enjoy visits from special guests and friends to give insight and input into interesting situations.
1 Ever wonder what murder took place on today in true crime history? If so, sit back and grab a cup of coffee as you enjoy your daily dose of TC goodness. Your host, Korina Biemesderfer, guides you through history with tales of murder, abduction, serial killers, crimes of passion, cults and more in this short form daily true crime podcast.
2 Inside the 18 is your source for all things Goalkeeping! Each week we are joined by guests from around the world. We recap the weeks events, discuss new training techniques and have candid conversations with professional goalkeepers and goalkeeper coaches. The show is a must listen for the goalkeeping enthusiast!    #insidethe18 #goalkeeperpodcast #thegoalkeepers #goalkicks #ederson #NWSL #ashlynharris #UWSNT #MLS #USMNT #goalkeepercoaches #degea #neuer #navas #areola #obl

In [11]:
class EmbeddingMatrix():
    """Generates an embedding matrix using GloVE, given a vocabulary/
    """
    def __init__(
        self,
        glove_url="http://nlp.stanford.edu/data/glove.6B.zip",
        embedding_dim=100,
        embedding_folder="glove"
    ):
        self.embedding_dim = embedding_dim
        self.download_glove_if_needed(
            glove_url=glove_url, embedding_folder=embedding_folder
        )

        # create the embeddings vocabulary
        self.glove_dict = self.parse_glove(embedding_folder)

    def download_glove_if_needed(self, glove_url, embedding_folder):
        """
        Downloads the glove embeddings from the internet

        Parameters
        ----------
        glove_url : The url of the GloVe embeddings.
        embedding_folder: folder where the embedding will be downloaded
        """
        # create embedding folder if it does not exist
        if not os.path.exists(embedding_folder):
            os.makedirs(embedding_folder)

        # extract the embedding if it is not extracted
        if not glob.glob(
            os.path.join(embedding_folder, "**/glove*.txt"), recursive=True
        ):

            # download the embedding if it does not exist
            embedding_zip = os.path.join(embedding_folder, glove_url.split("/")[-1])
            if not os.path.exists(embedding_zip):
                print("Downloading the GloVe embeddings...")
                request.urlretrieve(glove_url, embedding_zip)
                print("Successful download!")

            # extract the embedding
            print("Extracting the embeddings...")
            with zipfile.ZipFile(embedding_zip, "r") as zip_ref:
                zip_ref.extractall(embedding_folder)
                print("Successfully extracted the embeddings!")
            os.remove(embedding_zip)

    def parse_glove(self, embedding_folder):
        """
        Parses the GloVe embeddings from their files, filling the vocabulary.

        Parameters
        ----------
        embedding_folder : folder where the embedding files are stored

        Returns
        -------
        dictionary representing the vocabulary from the embeddings
        """
        print("Creating glove vocabulary...")
        vocabulary = {"<pad>": np.zeros(self.embedding_dim)}
        embedding_file = os.path.join(
            embedding_folder, "glove.6B." + str(self.embedding_dim) + "d.txt"
        )
        with open(embedding_file, encoding="utf8") as f:
            for line in f:
                word, coefs = line.split(maxsplit=1)
                coefs = np.fromstring(coefs, "f", sep=" ")
                vocabulary[word] = coefs
        return vocabulary

    def create_embedding_matrix(self, vocabulary):
        """
        Creates the embedding matrix from the vocabulary.

        Parameters
        ----------
        vocabulary : dictionary representing the vocabulary from the vectorizer

        Returns
        -------
        embedding_matrix : numpy array representing the embedding matrix
        """
        print("Creating embedding matrix...")
        embedding_matrix = np.zeros((len(vocabulary), self.embedding_dim))
        for i, word in enumerate(vocabulary):
            if word in self.glove_dict:
                embedding_matrix[i] = self.glove_dict[word]
            elif word not in ["", "[UNK]"]:
                embedding_matrix[i] = np.random.uniform(size=self.embedding_dim)
        return np.array(embedding_matrix)

In [12]:
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    no_punct = tf.strings.regex_replace(lowercase,
                                        "[%s]" % re.escape(strip_chars), "")
    no_links = tf.strings.regex_replace(no_punct,
                                        "[\w\s:\-\p{So}]*((https:|www\.).*)$", "")
    return no_links

X_vectorizer = keras.layers.TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int', output_sequence_length=SEQUENCE_LENGTH)
y_vectorizer = keras.layers.TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int', output_sequence_length=SEQUENCE_LENGTH + 1, standardize=custom_standardization)

2022-04-26 15:42:23.550139: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
X, y = [], []
for _, row in metadata_train.iloc[:2000].iterrows():
    if os.path.isfile(get_path(row)) and type(row['episode_description'])==str:
        X.append(get_transcription(row))
        y.append("[start]"+row['episode_description']+"[end]")

In [14]:
def format_strings(transcription, summary):
    transcription = X_vectorizer(transcription)
    summary = y_vectorizer(summary)
    return ({"encoder_inputs": transcription, "decoder_inputs": summary[:, :-1],}, summary[:, 1:])

In [15]:
X_vectorizer.adapt(X)
y_vectorizer.adapt(y)
dataset = tf.data.Dataset.from_tensor_slices((np.array(X),np.array(y)))
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.map(format_strings)
dataset = dataset.shuffle(2048).prefetch(16).cache()

In [16]:
class TransformerEncoder(keras.layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [keras.layers.Dense(dense_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm_1 = keras.layers.LayerNormalization()
        self.layernorm_2 = keras.layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(keras.layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, token_embedding_matrix, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = keras.layers.Embedding(input_dim=vocab_size,
                                                       output_dim=embed_dim,
                                                       weights=[token_embedding_matrix])
        self.position_embeddings = keras.layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(keras.layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [keras.layers.Dense(latent_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm_1 = keras.layers.LayerNormalization()
        self.layernorm_2 = keras.layers.LayerNormalization()
        self.layernorm_3 = keras.layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [17]:
embed_dim = 300
latent_dim = 2048
num_heads = 8
e = EmbeddingMatrix(embedding_dim=embed_dim)

X_embedding_matrix = e.create_embedding_matrix(
    vocabulary=X_vectorizer.get_vocabulary())
y_embedding_matrix = e.create_embedding_matrix(
    vocabulary=y_vectorizer.get_vocabulary())


Creating glove vocabulary...
Creating embedding matrix...
Creating embedding matrix...


In [18]:
encoder_inputs = keras.Input(shape=(None,),
                             dtype="int64",
                             name="encoder_inputs")
x = PositionalEmbedding(SEQUENCE_LENGTH, VOCAB_SIZE, embed_dim, X_embedding_matrix)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,),
                             dtype="int64",
                             name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim),
                                 name="decoder_state_inputs")
x = PositionalEmbedding(SEQUENCE_LENGTH, VOCAB_SIZE, embed_dim, X_embedding_matrix)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs, name="decoder")

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model([encoder_inputs, decoder_inputs],
                          decoder_outputs,
                          name="transformer")

In [19]:

transformer.summary()
transformer.compile("rmsprop",
                    loss="sparse_categorical_crossentropy",
                    metrics=["accuracy"])
transformer.fit(dataset, epochs=10)

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 300)   3750000     ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 300)   4119848     ['positional_embedding[

In [None]:
y_vocab = y_vectorizer.get_vocabulary()
y_index_lookup = dict(zip(range(len(y_vocab)), y_vocab))
max_decoded_sentence_length = 30
min_length = 10

def decode_sequence(input_sentence):
    tokenized_input_sentence = X_vectorizer([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = y_vectorizer([decoded_sentence
                                                      ])[:, :-1]
        predictions = transformer(
            [tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(
            predictions[0,
                        i, :]) if i >min_length else np.argsort(predictions[0,
                                                                     i, :])[-2] # Cannot take [end] right after [start]
        sampled_token = y_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


for _ in range(5):
    input_sentence = random.choice(X)
    summarized = decode_sequence(input_sentence)
    print(summarized)

[start] and the and the and the and the and the and [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
[start] and the and the and the and the and the and [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
[start] and the and the and the and the and the and [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
[start] and the and the and the and the and the and [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
[start] and the and the and the and the and the and [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
