# Load Dependencies

In [None]:
%%capture
!pip install kaggle
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import sklearn
import re
import collections
import math
import copy
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt
import random
import nltk
from sklearn.model_selection import StratifiedKFold
!pip install transformers
import transformers
!pip install livelossplot
import livelossplot 
nltk.download("punkt")
nltk.download('stopwords')

In [None]:
!mkdir /root/.kaggle/
!cp -f ./kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c nlp-getting-started

# Load in the Dataset

In [None]:
# HYPER PARAMETERS
NUM_CORES = 8
BATCH_SIZE = 16
LEARNING_RATE = 1e-3
NUM_ATT_HEADS = 4
TEST_SIZE = 32

In [66]:
sample_submission = pd.read_csv("./sample_submission.csv")

In [None]:
train_pd = pd.read_csv("./train.csv")
test_pd = pd.read_csv("./test.csv")

In [None]:
train_tweets = [text for text in train_pd['text']]
train_targets = [target for target in train_pd['target']]
unique_tweets = []
unique_targets = []
for idx in range(len(train_tweets)):
  if train_tweets[idx] not in unique_tweets:
    unique_tweets += [train_tweets[idx]]
    unique_targets += [train_targets[idx]]
# Filter out not unique Tweets
train_tweets = unique_tweets
train_targets = unique_targets

test_tweets = [text for text in test_pd['text']]
test_ids = [id for id in test_pd['id']]

In [None]:
collections.Counter(train_targets)

Counter({0: 4305, 1: 3198})

In [None]:
splitter = sklearn.model_selection.StratifiedShuffleSplit(n_splits = 1, test_size = 0.01, train_size = 0.99)

In [None]:
for train_idx, test_idx in splitter.split(train_tweets, train_targets):
  continue

In [None]:
count = 0
training_tweets_tmp = []
training_targets_tmp = []
for idx in train_idx:
  training_tweets_tmp += [train_tweets[idx]]
  training_targets_tmp += [train_targets[idx]]
val_tweets_tmp = []
val_targets_tmp = []
for idx in test_idx:
  val_tweets_tmp += [train_tweets[idx]]
  val_targets_tmp += [train_targets[idx]]
val_tweets = val_tweets_tmp
val_targets = val_targets_tmp
train_tweets = training_tweets_tmp
train_targets = training_targets_tmp

In [None]:
def process_tweets(corpus):
  stopwords = nltk.corpus.stopwords.words('english')
  tokenized_corpus = []
  for tweets in tqdm.tqdm(corpus):
    processed_tweets = re.sub(r'[^\w\s]', "", str.lower(tweets))
    tokenized_corpus += [processed_tweets]
  return tokenized_corpus


In [None]:
processed_train = process_tweets(train_tweets)
processed_val = process_tweets(val_tweets)

In [62]:
processed_test = process_tweets(test_tweets)

HBox(children=(FloatProgress(value=0.0, max=3263.0), HTML(value='')))




In [None]:
class TrainDataset(keras.utils.Sequence):
  def __init__(self, tweets, targets, batch_size):
    self.tweets = tweets
    self.targets = targets
    self.batch_size = batch_size
    self.cur_idx = 0
  def __len__(self):
    return len(self.tweets) // self.batch_size
  def __getitem__(self, idx):
    tweets = self.tweets[self.cur_idx * self.batch_size: (self.cur_idx + 1) * self.batch_size]
    targets = np.array(self.targets[self.cur_idx * self.batch_size: (self.cur_idx + 1) * self.batch_size])
    self.cur_idx += 1
    if self.cur_idx >= self.__len__():
      self.cur_idx = 0
    return tweets, targets

In [129]:
class TestDataset(keras.utils.Sequence):
  def __init__(self, tweets, ids, batch_size):
    self.tweets = tweets
    self.ids = ids
    self.batch_size = batch_size
    self.cur_idx = 0
  def reset(self):
    '''
    Resets the current_index
    '''
    self.cur_idx = 0
  def __len__(self):
   return len(self.tweets) // self.batch_size + 1
  def __getitem__(self, idx):
    if self.cur_idx == self.__len__():
      tweets = self.tweets[self.cur_idx * self.batch_size:]
      ids = np.array(self.ids[self.cur_idx * self.batch_size: ])
    else:  
      tweets = self.tweets[self.cur_idx * self.batch_size: (self.cur_idx + 1) * self.batch_size]
      ids = np.array(self.ids[self.cur_idx * self.batch_size: (self.cur_idx + 1) * self.batch_size])
    self.cur_idx += 1
    if self.cur_idx >= self.__len__():
      print("Iterated through Test Dataset more than once. Uh oh.")
      self.cur_idx = 0
      print("WARNING: Iterating again.")
    return tweets, ids
    



In [130]:
train_dataloader = TrainDataset(processed_train, train_targets, BATCH_SIZE)
test_dataloader = TestDataset(processed_test, test_ids, BATCH_SIZE)

# QANet Transformer From Scratch

In [None]:
class QAConv(keras.layers.Layer):
  def __init__(self, in_features, regularizer):
    super().__init__()
    self.regularizer = regularizer
    self.in_features = in_features
    self.conv = keras.layers.Conv1D(self.in_features, 7, padding = 'same', activation= 'relu', kernel_regularizer = tf.keras.regularizers.l2(self.regularizer))
    self.layer_norm = keras.layers.LayerNormalization()
  def call(self, x):
    return self.conv(self.layer_norm(x)) + x

In [None]:
class MultiHeadAttention(keras.layers.Layer):
  def __init__(self, in_dim, inner_dim, num_heads, regularizer):
    super().__init__()
    self.regularizer = regularizer
    self.in_dim = in_dim
    self.inner_dim = inner_dim
    self.num_heads = num_heads
    self.K = keras.layers.Dense(self.inner_dim * self.num_heads, kernel_regularizer = tf.keras.regularizers.l2(self.regularizer)) 
    self.V = keras.layers.Dense(self.inner_dim * self.num_heads, kernel_regularizer = tf.keras.regularizers.l2(self.regularizer))
    self.Q = keras.layers.Dense(self.inner_dim * self.num_heads, kernel_regularizer = tf.keras.regularizers.l2(self.regularizer))
    self.Linear = keras.layers.Dense(self.in_dim, kernel_regularizer = tf.keras.regularizers.l2(self.regularizer))
  def call(self, x):
    B, L, _ = x.shape

    Keys = self.K(x)
    Values = self.V(x)
    Queries = self.Q(x) # (B, L, self.inner_dim * self.num_heads)
    # Reshape Tensors 
    Keys = tf.reshape(Keys, (B, L, self.num_heads, self.inner_dim))
    Values = tf.reshape(Values, (B, L, self.num_heads, self.inner_dim))
    Queries = tf.reshape(Queries, (B, L, self.num_heads, self.inner_dim))
    # Transpose Tensors
    Keys = tf.transpose(Keys, perm = (0, 2, 1, 3))
    Values = tf.transpose(Values, perm = (0, 2, 1, 3))
    Queries = tf.transpose(Queries, perm = (0, 2, 1, 3))
    # Reshape Again
    Keys = tf.reshape(Keys, (B * self.num_heads, L, self.inner_dim))
    Values = tf.reshape(Values, (B * self.num_heads, L, self.inner_dim))
    Queries = tf.reshape(Queries, (B * self.num_heads, L, self.inner_dim)) # (BH, L, I)
    # Att Mat Dot Product
    att_mat = tf.keras.activations.softmax(tf.matmul(Keys, tf.transpose(Queries, perm = (0, 2, 1))) / math.sqrt(self.inner_dim))
    att_scores = tf.matmul(att_mat, Values) # (BH, L, I)
    # Reshape Tensors 
    att_scores = tf.reshape(att_scores, (B, self.num_heads, L, self.inner_dim))
    att_scores = tf.transpose(att_scores, perm = (0, 2, 1, 3))
    att_scores = tf.reshape(att_scores, (B, L, self.num_heads * self.inner_dim))
    return self.Linear(att_scores) 

In [None]:
class QANetEncoder(keras.layers.Layer):
  def __init__(self, in_dim, inner_dim, num_heads, num_convs, regularizer, drop_prob = 0.1):
    super().__init__()
    self.regularizer = regularizer
    self.drop_prob = drop_prob
    self.in_dim = in_dim
    self.inner_dim = inner_dim 
    self.num_heads = num_heads
    self.num_convs = num_convs
    # Prepare Convolution Layers 
    self.conv = keras.Sequential([
        QAConv(self.in_dim, self.regularizer) for i in range(self.num_convs)
    ])
    # Prepare MultiHead Attentions
    self.MHA = MultiHeadAttention(self.in_dim, self.inner_dim, self.num_heads, self.regularizer)
    self.MHALayerNorm = keras.layers.LayerNormalization()
    # Prepare Linear Layer
    self.Linear = keras.layers.Dense(self.in_dim, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(self.regularizer))
    self.Dropout = keras.layers.Dropout(self.drop_prob)
    self.LinearLayerNorm = keras.layers.LayerNormalization()
    
  def call(self, x):
    '''
    Run through the attentiion mechanism
    '''
    pos_encoded = self._add_pos_embed(x)
    # Convolve Features 
    convolved = self.conv(pos_encoded)
    # MHA
    attended = self.Dropout(self.MHA(self.MHALayerNorm(convolved)) + convolved)
    # Linear 
    processed = self.Linear(self.LinearLayerNorm(attended)) + attended
    return processed
  def _add_pos_embed(self, x):
    '''
    Adds Positional embeddings to a given tensor
    x: Tensor(B, L, C)
    '''
    B, L, C = x.shape
    pos_embeddings = np.zeros((L, C), dtype = np.float32)
    for pos in range(L):
      for i in range(0, C, 2):
        pos_embeddings[pos, i] = math.sin(pos / 10000 ** (2 * i / self.in_dim))
        pos_embeddings[pos, i + 1] = math.cos(pos / 10000 ** (2 * (i + 1) / self.in_dim))
    # Batch inputs
    batch_pos = []
    for b in range(B):
      batch_pos += [pos_embeddings]
    batch_pos = np.stack(batch_pos)
    return batch_pos + x

In [None]:
class QANet(keras.Model):
  def __init__(self, in_dim, inner_dim, num_heads, num_convs, num_enc, num_classes, drop_prob = 0.3, drop_att = 0.2, regularization = 1e-3):
    super().__init__()
    self.regularization = regularization
    self.drop_prob = drop_prob
    self.drop_att = drop_att
    self.num_classes = num_classes
    self.in_dim = in_dim
    self.inner_dim = inner_dim
    self.num_heads = num_heads
    self.num_convs = num_convs
    self.num_enc = num_enc
    self.tokenizer = transformers.DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    self.embedding = keras.layers.Embedding(self.tokenizer.vocab_size, self.in_dim, embeddings_regularizer= tf.keras.regularizers.l2(self.regularization))
    self.encoders = keras.Sequential([
      QANetEncoder(self.in_dim, self.inner_dim, self.num_heads, self.num_convs, self.regularization, drop_prob = self.drop_att)  for i in range(self.num_enc)
    ])
    self.Dropout = keras.layers.Dropout(self.drop_prob)
    self.Dense = keras.layers.Dense(self.num_classes, kernel_regularizer = tf.keras.regularizers.l2(self.regularization)) 
  def call(self, x):
    tokenized = self.tokenizer(x, return_tensors = 'tf', padding = True, truncation = True, add_special_tokens = False)['input_ids']
    embeddings = self.embedding(tokenized)
    processed = self.encoders(embeddings)
    # Average Logits
    mean = self.Dropout(tf.reduce_mean(processed, axis = 1)) # (B, C)
    return self.Dense(mean)


# Training the Model on GPU
TPU training loop implemented below

In [None]:
model = QANet(256, 128, NUM_ATT_HEADS, 2, 4, 1)

In [None]:
def test_fn(tweets):
  logits = model(tweets, training = False)
  sigmoid = tf.squeeze(tf.keras.activations.sigmoid(logits)).numpy()
  print(f"Logits: {sigmoid}")
  ones = sigmoid >= 0.5
  sigmoid[:] = 0
  sigmoid[ones] = 1
  return sigmoid

In [None]:
def test_loss(tweets, labels):
  logits = model(tweets, training = False)
  return tf.keras.losses.binary_crossentropy(labels, tf.squeeze(logits), from_logits = True)

In [53]:
def train_GPU(NUM_EPOCHS, NUM_STEPS):
  best_val_loss = 9999
  liveloss = livelossplot.PlotLosses()
  optimizer = tf.keras.optimizers.Adam(tf.keras.optimizers.schedules.ExponentialDecay(1e-3, NUM_STEPS, 0.99, staircase = True))
  for EPOCH in range(NUM_EPOCHS):
    logs = {}
    total_loss = 0 
    for STEP in tqdm.tqdm(range(NUM_STEPS)):
      for text, labels in train_dataloader:
        with tf.GradientTape() as tape:
          logits = tf.squeeze(model(text, training = True), axis = 1)
          loss = tf.keras.losses.binary_crossentropy(labels, logits, from_logits = True)
        grads = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        total_loss = total_loss + loss
        break
    logs['loss'] = total_loss / NUM_STEPS
    print(f"EPOCH: {EPOCH}, total_loss: {total_loss / NUM_STEPS}")
    # Evaluate on Train Set(make sure the loss is at least matching.)
    tweets = train_dataloader.tweets[0:TEST_SIZE]
    targets = train_dataloader.targets[0: TEST_SIZE]
    predicted = test_fn(tweets)
    incorrect_train = np.sum((predicted != np.array(targets)).astype(np.int32))
    logs['accuracy'] = incorrect_train
    # Test on Validation Set
    tweets = processed_val[0: TEST_SIZE]
    targets = val_targets[0: TEST_SIZE]
    predicted = test_fn(tweets)
    incorrect = np.sum((predicted != np.array(targets)).astype(np.int32))
    val_loss = test_loss(tweets, targets)
    if val_loss < best_val_loss:
      model.save_weights("./BestModel/model")
    logs['val_loss'] = val_loss
    logs['val_accuracy'] = incorrect # We want this to decrease
    liveloss.update(logs)
    liveloss.send()

Train the Model



In [None]:
train_GPU(100, 100)

In [58]:
model.load_weights("./BestModel/model")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f96e0874e10>

Evaluate and Predict using the model

In [143]:
def predict(tweets):
  logits = model(tweets, training = False)
  sigmoid = tf.squeeze(tf.keras.activations.sigmoid(logits)).numpy()
  ones = sigmoid >= 0.5
  sigmoid[:] = 0
  sigmoid[ones] = 1
  return sigmoid.astype(np.int32)

In [144]:
def make_predictions(test_dataloader):
  predicted = {'id': [], 'target': []}
  for tweets, ids in tqdm.tqdm(test_dataloader):
    logits = predict(tweets)
    predicted['id'] += ids.tolist()
    predicted['target'] += logits.tolist()
  return predicted

In [145]:
predictions = make_predictions(test_dataloader)

HBox(children=(FloatProgress(value=0.0, max=204.0), HTML(value='')))

Iterated through Test Dataset more than once. Uh oh.



In [146]:
dataframe = pd.DataFrame(predictions)

In [149]:
dataframe.to_csv("./submission.csv", index_label = 'id', index = False)

In [85]:
len(test_dataloader.tweets)

3263

Train the Model on TPU 

In [None]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.TPUStrategy(tpu_cluster_resolver = tpu)

In [None]:
%%capture
with strategy.scope():
  model = DistilBert(1)
  optimizer = tf.keras.optimizers.Adam(learning_rate = LEARNING_RATE)
  training_loss = tf.keras.metrics.Mean(name = "training_loop")

In [None]:
@tf.function
def training_step():
  def step():
    tweets = train_dataloader.tweets[0:32]
    labels = train_dataloader.targets[0:32]
    with tf.GradientTape() as tape:
      logits = model(tweets, training = True)
      loss = tf.keras.losses.binary_crossentropy(labels, tf.squeeze(logits), from_logits = True)
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss
  return strategy.run(step)

In [None]:
@tf.function
def test_loss():
  for tweets, labels in train_dataloader:
    break
  logits = model(tweets, training = False)
  loss = tf.keras.losses.binary_crossentropy(labels, tf.squeeze(logits), from_logits = True)
  return loss

In [None]:
def training_loop(NUM_EPOCHS, NUM_STEPS):
  liveloss = livelossplot.PlotLosses()
  for EPOCH in tqdm.tqdm(range(NUM_EPOCHS)):
    logs = {}
    total_loss = 0.0
    for STEP in range(NUM_STEPS):
      loss = training_step();
      for i in loss.values:
        total_loss = total_loss + i
    # Test the Model on the training set
    loss = strategy.run(test_loss)
    print(f"EPOCH: {EPOCH}, total_loss: {loss.values[0]}")
    logs['loss'] = loss.values[0]
    # Evaluate Model on Validation
    predicted_logits = test_fn(val_tweets)
    # Compute AVG number of incorrect predictions
    print(f"Ground Truths: {np.array(val_targets)}")
    print(f"Predicted Vals: {predicted_logits}")
    incorrect_predictions = np.sum((predicted_logits != np.array(val_targets)).astype(np.int32)) / predicted_logits.shape[0]
    print(f"incorrect_predictions: {incorrect_predictions}")
    logs['accuracy'] = incorrect_predictions # Not actually accuracy, but livelossplot only supports 2 keywords, so we want this value to go down.
    liveloss.update(logs)
    liveloss.send()

In [None]:
training_loop(200, 500)