In [15]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os
import pandas as pd
import json

In [62]:
VOCAB_SIZE = 10000
SEQUENCE_LENGTH = 250
BATCH_SIZE = 32

In [80]:
dataset_path = os.path.join(os.path.abspath(""), 'podcasts-no-audio-13GB')


In [81]:
metadata_path_train = os.path.join(dataset_path, 'metadata.tsv')
metadata_train = pd.read_csv(metadata_path_train, sep='\t')
print("Columns: ", metadata_train.columns)
print("Shape: ", metadata_train.shape)

Columns:  Index(['show_uri', 'show_name', 'show_description', 'publisher', 'language',
       'rss_link', 'episode_uri', 'episode_name', 'episode_description',
       'duration', 'show_filename_prefix', 'episode_filename_prefix'],
      dtype='object')
Shape:  (105360, 12)


In [82]:
i = 0
episode_example_train = metadata_train.iloc[i]
print(episode_example_train)
print("\nCopy this uri into the browser to listen to the episode:\n",
      episode_example_train['episode_uri'])


show_uri                                 spotify:show:2NYtxEZyYelR6RMKmjfPLB
show_name                                               Kream in your Koffee
show_description           A 20-something blunt female takes on the world...
publisher                                                        Katie Houle
language                                                              ['en']
rss_link                            https://anchor.fm/s/11b84b68/podcast/rss
episode_uri                           spotify:episode:000A9sRBYdVh66csG2qEdj
episode_name                                         1: It’s Christmas Time!
episode_description        On the first ever episode of Kream in your Kof...
duration                                                           12.700133
show_filename_prefix                             show_2NYtxEZyYelR6RMKmjfPLB
episode_filename_prefix                               000A9sRBYdVh66csG2qEdj
Name: 0, dtype: object

Copy this uri into the browser to listen to the epis

In [83]:
def get_path(episode):
    # extract the 2 reference number/letter to access the episode transcript
    show_filename = episode['show_filename_prefix']
    episode_filename = episode['episode_filename_prefix'] + ".json"
    dir_1, dir_2 = re.match(r'show_(\d)(\w).*', show_filename).groups()

    interval_folders = [range(0, 3), range(3, 6), range(6, 8)]

    # check which is the main folder containing the transcript
    main_dir = ""
    for interval in interval_folders:
        if int(dir_1) in interval:
            main_dir = "podcasts-transcripts-{}to{}".format(interval[0],
                                                            interval[-1])
    assert main_dir != ""

    # check if the transcript file in all the derived subfolders exist
    transcipt_path = os.path.join(dataset_path, main_dir, "spotify-podcasts-2020",
                                "podcasts-transcripts", dir_1, dir_2,
                                show_filename, episode_filename)

    return transcipt_path

In [84]:
i = 0
while not os.path.isfile(get_path(metadata_train.iloc[i])):
    i = random.randint(0,10000)
    print(get_path(metadata_train.iloc[i]))

/Users/simone/UniBO/Lab/NLP/PodcastSummarization/podcasts-no-audio-13GB/podcasts-transcripts-6to7/spotify-podcasts-2020/podcasts-transcripts/6/r/show_6r0bfT6lOLqDMh9ujnT1BW/0VCNFvmtEUvMEHQIEmt8O4.json
/Users/simone/UniBO/Lab/NLP/PodcastSummarization/podcasts-no-audio-13GB/podcasts-transcripts-0to2/spotify-podcasts-2020/podcasts-transcripts/0/Q/show_0Q4ThOFNfLmFPESqAjFctV/0P6iqbGXRjpjbcaOCYpbHY.json


In [85]:
episode = metadata_train.iloc[i]
episode

show_uri                                 spotify:show:0Q4ThOFNfLmFPESqAjFctV
show_name                                                       AstrologyNow
show_description           Since ancient times, sages and mystics have ut...
publisher                                                Christine Rodriguez
language                                                           ['en-US']
rss_link                             https://anchor.fm/s/7efcfc0/podcast/rss
episode_uri                           spotify:episode:0P6iqbGXRjpjbcaOCYpbHY
episode_name               The Third House in Vedic Astrology: Courage, C...
episode_description        The Third House represents short distance trav...
duration                                                            11.70595
show_filename_prefix                             show_0Q4ThOFNfLmFPESqAjFctV
episode_filename_prefix                               0P6iqbGXRjpjbcaOCYpbHY
Name: 5544, dtype: object

In [86]:
def get_transcription(episode):
    with open(get_path(episode), 'r') as f:
        episode_json = json.load(f)
        # seems that the last result in each trastcript is a repetition of the first one, so we ignore it
        transcripts = [
            result["alternatives"][0]['transcript'] if 'transcript' in result["alternatives"][0] else ""
            for result in episode_json["results"][:-1]
        ]
        return " ".join(transcripts)


print(f"Episode description:\n{episode_example_train['episode_description']}")
print(f"\nEpisode transcription:\n{' '.join(transcripts)}")

Episode description:
On the first ever episode of Kream in your Koffee, Katie talks about tips for Christmas shopping. We also get a little insight into who and what we’ll be hearing about in next weeks episode! 

Episode transcription:
 One of my professors once asked me and the rest of my class one. Is it that you feel the most connected to the world and I still remember my answer. So I grew up on top of a hill that overlooked the Blue Ridge Mountains. And the Shenandoah Valley my house is right at the top and one of my favorite things to do growing up was to run from the top of the hill all the way down to the bottom where the stream was. I was always Barefoot my hair let loose and my arms would flail around like crazy and I would do this most often in the pouring down rain and sometimes at night to which added a little bit of extra risk, but it just made me feel so alive. I felt totally free.  Just letting myself go and be carried by gravity to the bottom of the hill and usually lo

In [99]:
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

X_vectorizer = keras.layers.TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int', output_sequence_length=SEQUENCE_LENGTH)
y_vectorizer = keras.layers.TextVectorization(max_tokens=VOCAB_SIZE, output_mode='int', output_sequence_length=SEQUENCE_LENGTH + 1, standardize=custom_standardization)

In [None]:
X, y = [], []
for _, row in metadata_train.iloc[:1000].iterrows():
    if os.path.isfile(get_path(row)):
        X.append(get_transcription(row))
        y.append(row['episode_description'])
y[0], X[0]

In [103]:
def format_strings(transcription, summary):
    transcription = X_vectorizer(transcription)
    summary = y_vectorizer(summary)
    return ({"encoder_inputs": transcription, "decoder_inputs": summary[:, :-1],}, summary[:, 1:])

In [104]:
X_vectorizer.adapt(X)
y_vectorizer.adapt(y)
dataset = tf.data.Dataset.from_tensor_slices((np.array(X),np.array(y)))
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.map(format_strings)
dataset = dataset.shuffle(2048).prefetch(16).cache()

In [63]:
class TransformerEncoder(keras.layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [keras.layers.Dense(dense_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm_1 = keras.layers.LayerNormalization()
        self.layernorm_2 = keras.layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(keras.layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = keras.layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(keras.layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [keras.layers.Dense(latent_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm_1 = keras.layers.LayerNormalization()
        self.layernorm_2 = keras.layers.LayerNormalization()
        self.layernorm_3 = keras.layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [93]:
embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,),
                             dtype="int64",
                             name="encoder_inputs")
x = PositionalEmbedding(SEQUENCE_LENGTH, VOCAB_SIZE, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,),
                             dtype="int64",
                             name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim),
                                 name="decoder_state_inputs")
x = PositionalEmbedding(SEQUENCE_LENGTH, VOCAB_SIZE, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs, name="decoder")

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model([encoder_inputs, decoder_inputs],
                          decoder_outputs,
                          name="transformer")

In [106]:

transformer.summary()
transformer.compile("rmsprop",
                    loss="sparse_categorical_crossentropy",
                    metrics=["accuracy"])
transformer.fit(dataset, epochs=10)

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_10 (Posit  (None, None, 256)   2624000     ['encoder_inputs[0][0]']         
 ionalEmbedding)                                                                                  
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder_5 (Transfo  (None, None, 256)   3155456     ['positional_embedding_

<keras.callbacks.History at 0x7fe220e0f070>