# Load Dependencies

In [None]:
%%capture
!pip install kaggle
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import math
import copy
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt
import random
import re
from collections import Counter
import nltk
nltk.download("punkt")
nltk.download('stopwords')

In [None]:
!mkdir /root/.kaggle/
!cp -f ./kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c facebook-recruiting-iii-keyword-extraction
!rm -f SampleSubmission.csv.zip
!rm -f Test.zip
!unzip Train.zip
!rm -f Train.zip

Downloading SampleSubmission.csv.zip to /content
100% 4.81M/4.81M [00:00<00:00, 30.0MB/s]

Downloading Train.zip to /content
100% 2.19G/2.19G [00:26<00:00, 38.7MB/s]
100% 2.19G/2.19G [00:26<00:00, 89.6MB/s]
Downloading Test.zip to /content
 98% 707M/725M [00:18<00:00, 168MB/s]
100% 725M/725M [00:18<00:00, 41.2MB/s]
Archive:  Train.zip
  inflating: Train.csv               


# Load and Preprocess the Data

In [None]:
# Read only the first 15000 rows of the training CSV.
DataFrame = pd.read_csv(
  "./Train.csv",
  nrows = 15000,
)

In [None]:
# Materialize the two columns as plain Python lists; row order is preserved, so they stay aligned.
titles = list(DataFrame['Title'])
tags = list(DataFrame['Tags'])

In [None]:
def prepare_tags(sentences, list_tags, min_count):
  '''
  Builds multi-hot tag targets and drops every example left without a retained tag.

  sentences: sequence of examples (e.g. titles), aligned index-by-index with list_tags.
  list_tags: sequence of whitespace-separated tag strings, one entry per example.
             Entries that are not strings (e.g. NaN for a missing value) are treated as "no tags".
  min_count: a tag is retained only if it occurs strictly more than min_count times.
             This keeps the Ground Truth from becoming way too sparse.

  Returns (sentence_dataset, tokenized_tags, tags_idx):
    sentence_dataset: the examples that carry at least one retained tag, in input order.
    tokenized_tags:   integer array of shape (len(sentence_dataset), len(tags_idx)); 1 marks a present tag.
    tags_idx:         dict mapping each retained tag to its column in tokenized_tags.
  '''
  split_tags = [
    entry.split() if isinstance(entry, str) else []
    for entry in tqdm.tqdm(list_tags)
  ]
  count_tags = Counter(tag for example_tags in split_tags for tag in example_tags)
  # Sorting makes the tag -> column assignment reproducible; iterating a set of strings
  # can yield a different order in every interpreter session.
  kept_tags = sorted(tag for tag, seen in count_tags.items() if seen > min_count)
  # Hash map to map tag to index
  tags_idx = {tag: column for column, tag in enumerate(kept_tags)}
  len_of_tags = len(tags_idx)

  tokenized_tags = []
  sentence_dataset = []
  for example_idx, example_tags in enumerate(tqdm.tqdm(split_tags)):
    columns = [tags_idx[tag] for tag in example_tags if tag in tags_idx]
    if not columns:
      # None of this example's tags survived the frequency filter.
      continue
    token_sent = [0] * len_of_tags
    for column in columns:
      token_sent[column] = 1
    tokenized_tags.append(token_sent)
    sentence_dataset.append(sentences[example_idx])

  if not tokenized_tags:
    # Keep the result two-dimensional so callers can still read .shape[1].
    return sentence_dataset, np.zeros((0, len_of_tags), dtype = int), tags_idx
  return sentence_dataset, np.array(tokenized_tags), tags_idx

In [None]:
# Build the multi-hot targets; the number of classes is the width of the target matrix.
title_dataset, tag_dataset, tags_idx = prepare_tags(titles, tags, min_count = 25)
num_classes = tag_dataset.shape[1]

HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))




In [None]:
# Hold out the final 50 examples for validation; everything before them stays as training data.
title_dataset, val_title_dataset = title_dataset[:-50], title_dataset[-50:]
tag_dataset, val_tag_dataset = tag_dataset[:-50], tag_dataset[-50:]

In [None]:
def prepare_title(title_dataset):
  '''
  Tokenizes every title into a list of normalized words.

  Each title is split with nltk.word_tokenize. Every token is lower-cased, dropped if it is
  an English stopword, and then stripped of all characters that are neither word characters
  nor whitespace (so a token such as "c++" is reduced to "c"). Tokens that end up empty are
  discarded.

  title_dataset: iterable of title strings.
  Returns a list with one list of tokens per title, in input order.
  '''
  # A set gives constant-time membership tests; the stopword list is consulted for every token.
  stopwords = set(nltk.corpus.stopwords.words("english"))
  tokenized_titles = []
  for title in tqdm.tqdm(title_dataset):
    tokenized_title = []
    for word in nltk.word_tokenize(title):
      lowered = word.lower()
      # Test the lower-cased form: NLTK's English stopwords are lower case, so checking the
      # raw token would let capitalized ones such as "How" or "The" slip through.
      if lowered in stopwords:
        continue
      tokenized_word = re.sub(r"[^\w\s]", "", lowered)
      if tokenized_word != "":
        tokenized_title.append(tokenized_word)
    tokenized_titles.append(tokenized_title)
  return tokenized_titles

In [None]:
# Tokenize the training titles first, then the held-out validation titles, with the same routine.
tokenized_titles, tokenized_val = (
  prepare_title(split) for split in (title_dataset, val_title_dataset)
)

HBox(children=(FloatProgress(value=0.0, max=12572.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [None]:
class StackOverflowDataset(keras.utils.Sequence):
  '''
  Serves (titles, tags) mini-batches of a fixed size.

  titles:     sliceable collection of examples.
  tags:       sliceable collection of targets, aligned index-by-index with titles.
  batch_size: number of examples per batch.

  Batch idx covers items [idx * batch_size, (idx + 1) * batch_size). A trailing partial
  batch is never served, so every batch holds exactly batch_size items.
  '''
  def __init__(self, titles, tags, batch_size):
    super().__init__()
    self.titles = titles
    self.tags = tags
    self.batch_size = batch_size
    # Kept for backward compatibility: index of the batch following the one served last.
    self.cur_idx = 0
  def __len__(self):
    # Floor division drops the trailing partial batch.
    return len(self.titles) // self.batch_size
  def __getitem__(self, idx):
    '''
    Returns batch number idx as a (titles, tags) pair.
    The requested index is honoured, so the same idx always yields the same batch no matter
    how many batches were fetched before. Negative indices count from the end.
    Raises IndexError when idx is out of range.
    '''
    num_batches = len(self)
    if idx < 0:
      idx += num_batches
    if not 0 <= idx < num_batches:
      raise IndexError(f"batch index {idx} out of range for {num_batches} batches")
    start = idx * self.batch_size
    stop = start + self.batch_size
    self.cur_idx = (idx + 1) % num_batches
    return self.titles[start:stop], self.tags[start:stop]

In [None]:
# The dataset receives its own deep copy of the tokenized titles and serves batches of 32.
SODataset = StackOverflowDataset(
  titles = copy.deepcopy(tokenized_titles),
  tags = tag_dataset,
  batch_size = 32,
)

# Embedding (no pretrained vectors, since terms like C++ and other programming vocabulary won't exist in GloVe)

In [None]:
class WordEmbeddings(keras.layers.Layer):
  '''
  Vocabulary plus trainable embedding table for tokenized sentences.

  corpus: iterable of sentences (each a list of words) from which the vocabulary is built.
  dim:    size of each embedding vector.

  Token id 0 is reserved for "<PAD>" and id 1 for "<UNK>"; corpus words start at id 2.
  '''
  def __init__(self, corpus, dim = 256):
    super().__init__()
    self.dim = dim
    self.unique_words = self._compute_unique_words(corpus)
    # Two extra slots for the padding and unknown-word tokens.
    self.vocab_len = len(self.unique_words) + 2

    self.word_2_idx = {word: idx for idx, word in enumerate(self.unique_words, start = 2)}
    self.word_2_idx["<PAD>"] = 0
    self.word_2_idx["<UNK>"] = 1

    self.idx_2_word = {idx: word for idx, word in enumerate(self.unique_words, start = 2)}
    self.idx_2_word[0] = "<PAD>"
    self.idx_2_word[1] = "<UNK>"

    self.embeddings = keras.layers.Embedding(self.vocab_len, self.dim)
  def _compute_unique_words(self, corpus):
    '''
    Computes all unique words inside of the corpus.
    The result is sorted so that word ids are identical on every run; iterating a set of
    strings can yield a different order in every interpreter session.
    '''
    return sorted({word for sentence in corpus for word in sentence})
  def _compute_max_length(self, x):
    '''
    Length of the longest sentence in x (0 when x is empty).
    '''
    return max((len(sentence) for sentence in x), default = 0)
  def _prep_sentences(self, x, max_sent_length):
    '''
    Converts sentences to lists of token ids of length max_sent_length.
    Longer sentences are truncated, shorter ones are right-padded with the "<PAD>" id,
    and words missing from the vocabulary map to the "<UNK>" id.
    '''
    pad_id = self.word_2_idx['<PAD>']
    unk_id = self.word_2_idx["<UNK>"]
    tokenized_sentences = []
    for sentence in x:
      token_ids = [self.word_2_idx.get(word, unk_id) for word in sentence[:max_sent_length]]
      token_ids += [pad_id] * (max_sent_length - len(token_ids))
      tokenized_sentences.append(token_ids)
    return tokenized_sentences
  def call(self, x, max_sent_length = None):
    '''
    Tokenizes and embeds a batch of sentences.
    x: list of sentences(list of words)
    max_sent_length: fixed output length; when omitted (or 0) the longest sentence in x is used.
    Returns the embedding lookup for the padded id matrix, one vector per token position.
    '''
    if not max_sent_length:
      max_sent_length = self._compute_max_length(x)
    # Pad Sentences up to max_sentence length
    tokenized = np.array(self._prep_sentences(x, max_sent_length))
    return self.embeddings(tokenized)

# Build the Transformer

In [None]:
class QANetConv(keras.layers.Layer):
  '''
  Residual convolution unit of a QANet encoder.
  The input is layer-normalized, passed through a kernel-size-7 "same"-padded 1D convolution
  with in_features output channels, and the untouched input is added back to the result.
  '''
  def __init__(self, in_features):
    super().__init__()
    self.in_features = in_features
    self.layer_norm = keras.layers.LayerNormalization()
    self.conv = keras.layers.Conv1D(filters = self.in_features, kernel_size = 7, padding = 'same')
  def call(self, x):
    normalized = self.layer_norm(x)
    convolved = self.conv(normalized)
    # Residual connection around the normalized convolution.
    return convolved + x

In [None]:
class MultiHeadSelfAttention(keras.layers.Layer):
  '''
  MultiHead Self Attention, implemented in TensorFlow + Keras.

  in_dim:    channel size of the layer output.
  inner_dim: channel size of each attention head.
  num_heads: number of attention heads.
  '''
  def __init__(self, in_dim, inner_dim, num_heads):
    super().__init__()
    self.in_dim = in_dim
    self.inner_dim = inner_dim
    self.num_heads = num_heads
    self.K = keras.layers.Dense(self.inner_dim * self.num_heads)
    self.V = keras.layers.Dense(self.inner_dim * self.num_heads)
    self.Q = keras.layers.Dense(self.inner_dim * self.num_heads)
    self.Linear = keras.layers.Dense(self.in_dim)
  def call(self, x):
    '''
    x: Tensor(B, L, C)
    Returns Tensor(B, L, in_dim).
    '''
    # Use the static dimensions when they are known and fall back to the runtime shape
    # otherwise, so an unknown (None) batch or length dimension no longer breaks the reshapes.
    static_shape = x.shape
    runtime_shape = tf.shape(x)
    B = static_shape[0] if static_shape[0] is not None else runtime_shape[0]
    L = static_shape[1] if static_shape[1] is not None else runtime_shape[1]

    def split_heads(projected):
      # (B, L, num_heads * inner_dim) -> (B, num_heads, L, inner_dim)
      projected = tf.reshape(projected, (B, L, self.num_heads, self.inner_dim))
      return tf.transpose(projected, perm = (0, 2, 1, 3))

    Keys = split_heads(self.K(x))
    Values = split_heads(self.V(x))
    Queries = split_heads(self.Q(x))
    # Dot Product Attention. Scores are Keys x Queries^T, normalized over the last axis:
    # the rows come from the K projection and the weights index the Q projection.
    att_mat = tf.matmul(Keys, Queries, transpose_b = True) / math.sqrt(self.inner_dim)
    att_scores = tf.nn.softmax(att_mat, axis = -1) # (B, num_heads, L, L)

    vals = tf.matmul(att_scores, Values) # (B, num_heads, L, inner_dim)
    # Merge the heads back into the channel axis
    vals = tf.transpose(vals, perm = (0, 2, 1, 3))
    vals = tf.reshape(vals, (B, L, self.num_heads * self.inner_dim))
    return self.Linear(vals)


In [None]:
class EncoderBlock(keras.layers.Layer):
  '''
  A QANet Encoder Block: positional encoding, a stack of residual convolutions,
  multi-head self attention, and a ReLU dense layer.

  in_dim:    channel size of the block input and output.
  inner_dim: channel size of each attention head.
  num_heads: number of attention heads.
  num_convs: number of QANetConv units in the convolution stack.
  '''
  def __init__(self, in_dim, inner_dim, num_heads, num_convs):
    super().__init__()
    # Conv Blocks
    self.in_dim = in_dim
    self.inner_dim = inner_dim
    self.num_heads = num_heads
    self.num_convs = num_convs
    self.convBlocks = keras.Sequential([
      QANetConv(self.in_dim) for i in range(self.num_convs)
    ])
    # Part 2: MultiHead Attention + Layer Norm
    self.MHA = MultiHeadSelfAttention(self.in_dim, self.inner_dim, self.num_heads)
    self.MHAnorm = keras.layers.LayerNormalization()
    # Part 3: Linear + Layer Norm
    self.Linear = keras.layers.Dense(self.in_dim, activation = "relu")
    self.LinearNorm = keras.layers.LayerNormalization()
  def call(self, x):
    '''
    x: Tensor(B, L, C)
    Returns a tensor of the same shape.
    '''
    positional_encoded = self._add_positional_embeddings(x)
    conv = self.convBlocks(positional_encoded)
    # Each sub-layer output is layer-normalized and then added to that sub-layer's input.
    attended = self.MHAnorm(self.MHA(conv)) + conv
    linear = self.LinearNorm(self.Linear(attended)) + attended
    return linear
  def _add_positional_embeddings(self, x):
    '''
    Adds in the positional_embeddings using the sinusoidal functions
    x: Tensor(B, L, C); L and C must be statically known.

    Channel pair (2k, 2k + 1) at position pos receives
      sin(pos / 10000 ** (2k / in_dim)) and cos(pos / 10000 ** (2k / in_dim)),
    so both members of a pair share one frequency.
    Raises ValueError when L or C is unknown.
    '''
    L, C = x.shape[1], x.shape[2]
    if L is None or C is None:
      raise ValueError("positional embeddings need statically known length and channel dimensions")
    positions = np.arange(L, dtype = np.float64)[:, np.newaxis]             # (L, 1)
    even_channels = np.arange(0, C, 2, dtype = np.float64)[np.newaxis, :]   # (1, ceil(C / 2)): 0, 2, 4, ...
    angles = positions / np.power(10000.0, even_channels / self.in_dim)
    positional_embeddings = np.zeros((L, C))
    positional_embeddings[:, 0::2] = np.sin(angles)
    # With an odd C there is one cosine column fewer than sine columns.
    positional_embeddings[:, 1::2] = np.cos(angles[:, :C // 2])
    # A leading axis of size 1 broadcasts over the batch, so B may be unknown and no per-example copy is made.
    return x + tf.constant(positional_embeddings[np.newaxis, :, :], dtype = x.dtype)


In [None]:
class QANet(keras.Model):
  '''
  Multi-label classifier over tokenized sentences: word embeddings, a stack of
  EncoderBlocks, mean pooling over the sequence axis, and a dense layer producing logits.

  corpus:          tokenized sentences used to build the embedding vocabulary.
  in_dim:          embedding size and channel size of every encoder block.
  inner_dim:       channel size of each attention head.
  num_encoders:    number of stacked EncoderBlocks.
  num_classes:     number of output logits.
  max_sent_length: every sentence is padded or truncated to this many tokens.
  num_heads:       attention heads per encoder block (default 4).
  num_convs:       convolution units per encoder block (default 4).
  '''
  def __init__(self, corpus, in_dim, inner_dim, num_encoders, num_classes, max_sent_length, num_heads = 4, num_convs = 4):
    super().__init__()
    self.max_sent_length = max_sent_length
    self.num_classes = num_classes
    self.embeddings = WordEmbeddings(corpus, dim = in_dim)
    self.in_dim = in_dim
    self.inner_dim = inner_dim
    self.num_encoders = num_encoders
    self.num_heads = num_heads
    self.num_convs = num_convs
    self.Encoders = keras.Sequential([
      EncoderBlock(in_dim, inner_dim, self.num_heads, self.num_convs) for i in range(self.num_encoders)
    ])
    self.Dense = keras.layers.Dense(self.num_classes)
  def call(self, x):
    '''
    x: List of Sentences(List of Words)
    Returns one row of num_classes logits per sentence (no sigmoid applied).
    '''
    embeddings = self.embeddings(x, max_sent_length = self.max_sent_length) # (B, L, C)
    # Encode the Embeddings
    encoded = self.Encoders(embeddings) # (B, L, C)
    # Average the encoded vectors over the sequence axis (every position, padding included)
    avg = tf.reduce_mean(encoded, axis = 1) # (B, C)
    return self.Dense(avg)


# Training

In [None]:
# Connect to the TPU cluster, initialize it, and build the distribution strategy.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(cluster_resolver = tpu)
strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver= tpu)
# Read the replica count from the strategy instead of hard-coding 8, so that code
# depending on num_devices stays correct on a TPU with a different number of cores.
num_devices = strategy.num_replicas_in_sync

INFO:tensorflow:Initializing the TPU system: grpc://10.96.35.202:8470
INFO:tensorflow:Clearing out eager caches
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)
INFO:tensorflow:*

In [None]:
with strategy.scope():
  # Model, optimizer, loss and metric are all created inside the distribution strategy's scope.
  model = QANet(
    corpus = tokenized_titles,
    in_dim = 256,
    inner_dim = 128,
    num_encoders = 6,
    num_classes = num_classes,
    max_sent_length = 20,
  )
  # Learning rate starts at 1e-4 and is multiplied by 0.95 every 500 * num_devices optimizer steps.
  optimizer = tf.keras.optimizers.Adam(
    learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
      initial_learning_rate = 1e-4,
      decay_steps = 500 * num_devices,
      decay_rate = 0.95,
      staircase = True,
    )
  )
  # The model outputs raw logits; per-example losses are summed rather than averaged.
  criterion = tf.keras.losses.BinaryCrossentropy(
    from_logits = True,
    reduction = tf.keras.losses.Reduction.SUM,
  )
  training_loss = tf.keras.metrics.Mean(name = "training_loss")

Train the Model using TPU.

In [None]:
@tf.function
def _distributed_train_step(token_ids, labels):
  '''
  Runs one optimizer update on every replica.
  token_ids: int32 Tensor(B, L) of padded token ids.
  labels:    float32 Tensor(B, num_classes) of multi-hot targets.
  The batch arrives as tensor arguments, so one traced graph serves every batch.
  '''
  def step(token_ids, labels):
    '''
    Computation to run on each TPU device
    '''
    with tf.GradientTape() as tape:
      # Same forward pass as QANet.call, minus the Python-side tokenization,
      # which has already been done on the host.
      embedded = model.embeddings.embeddings(token_ids)
      encoded = model.Encoders(embedded)
      predicted = model.Dense(tf.reduce_mean(encoded, axis = 1))
      loss = criterion(labels, predicted)
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    training_loss.update_state(loss * strategy.num_replicas_in_sync)
  strategy.run(step, args = (token_ids, labels))

def training_fn():
  '''
  Performs one training step on the next batch of SODataset.

  The batch is fetched and tokenized in plain Python on every call and then handed to the
  compiled step as tensors. Python iteration inside a tf.function only executes while the
  function is being traced, which would freeze a single batch into the graph.
  '''
  num_batches = len(SODataset)
  if num_batches == 0:
    return
  # Own counter, so consecutive calls walk through the batches and wrap around at the end.
  titles, labels = SODataset[training_fn.batches_served % num_batches]
  training_fn.batches_served += 1
  max_len = model.max_sent_length or model.embeddings._compute_max_length(titles)
  token_ids = model.embeddings._prep_sentences(titles, max_len)
  _distributed_train_step(
    tf.constant(token_ids, dtype = tf.int32),
    tf.constant(labels, dtype = tf.float32),
  )
training_fn.batches_served = 0

In [None]:
def test_fn(titles):
  predicted = tf.keras.activations.sigmoid(model(titles, training = False)).numpy()
  ones = predicted >= 0.5
  predicted[:, :] = 0
  predicted[ones] = 1
  return predicted

In [None]:
def training_loop(NUM_EPOCHS, NUM_STEPS):
  '''
  Trains for NUM_EPOCHS epochs of NUM_STEPS training steps each.
  After every epoch it prints the validation error followed by the mean training loss
  of that epoch.
  '''
  # reset_state is the current method name; older TF 2.x releases only provide reset_states.
  reset_training_loss = getattr(training_loss, "reset_state", None) or training_loss.reset_states
  for EPOCH in tqdm.tqdm(range(NUM_EPOCHS)):
    # Start every epoch from an empty metric. Without the reset the reported value is a
    # running mean over all epochs so far, which hides how the loss actually evolves.
    reset_training_loss()
    for STEP in range(NUM_STEPS):
      training_fn()
    # Validation Run
    predicted = test_fn(tokenized_val)
    # Rather than counting the number of correct(Numerous since sparse GT), we count the number of incorrect per batch item
    print(np.sum((predicted != val_tag_dataset).astype(np.int32)) / predicted.shape[0])
    print(f"EPOCH: {EPOCH}, total_loss: {training_loss.result()}")

In [None]:
# 50 epochs of 1024 training steps each.
training_loop(NUM_EPOCHS = 50, NUM_STEPS = 1024)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

2.04
EPOCH: 0, total_loss: 4.305730819702148
2.04
EPOCH: 1, total_loss: 2.1563608646392822
2.04
EPOCH: 2, total_loss: 1.4382967948913574
2.04
EPOCH: 3, total_loss: 1.0789644718170166
2.04
EPOCH: 4, total_loss: 0.8632692098617554
2.04
EPOCH: 5, total_loss: 0.7194433212280273
2.04
EPOCH: 6, total_loss: 0.6166657209396362
2.04
EPOCH: 7, total_loss: 0.5395824909210205
2.04
EPOCH: 8, total_loss: 0.47962889075279236
2.04
EPOCH: 9, total_loss: 0.43166598677635193
2.04
EPOCH: 10, total_loss: 0.3924236297607422
2.04
EPOCH: 11, total_loss: 0.35972166061401367
2.04
EPOCH: 12, total_loss: 0.33205077052116394
2.04
EPOCH: 13, total_loss: 0.3083328604698181
2.04
EPOCH: 14, total_loss: 0.2877773344516754
2.04
EPOCH: 15, total_loss: 0.26979124546051025
2.06
EPOCH: 16, total_loss: 0.25392118096351624
2.04
EPOCH: 17, total_loss: 0.23981444537639618
2.04
EPOCH: 18, total_loss: 0.22719262540340424
2.04
EPOCH: 19, total_loss: 0.21583299338817596
2.04
EPOCH: 20, total_loss: 0.20555523037910461
2.04
EPOCH: 21