# Installing packages

In [None]:
# Install the third-party packages via pip shell commands.
# `-q` silences pip's progress output; `--user` installs into the per-user
# site-packages directory instead of the environment-wide one.
# NOTE(review): versions are not pinned, and only some packages use `--user`
# — confirm whether that mix is intentional for the target runtime.
# NOTE(review): tensorflow, numpy and h5py are imported further down but are not
# installed here — presumably the runtime already provides them; confirm.
!pip install -q pandas
!pip install --user -q torch
!pip install -q transformers
!pip install --user -q pytest 
!pip install --user -q tqdm
!pip install gdown

***Warning***: Depending on the runtime used, you might have to restart the kernel in order for the new libraries to be located properly.

# Import libraries

In [None]:
import itertools
import functools as fu
import pandas as pd
import numpy as np
import h5py
from tqdm import tqdm
from pathlib import Path
import pickle
import tensorflow as tf
import tensorflow.keras as keras
import torch
import transformers
from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel,DistilBertTokenizer,DistilBertConfig
from transformers import AutoModel, AutoTokenizer
from tensorflow.keras.layers import Dense,Dropout, Input
from tensorflow.keras import regularizers

Download the tokenized tweets from Google Drive. If the automatic download fails, copy each link from the cell below into a browser, download the files manually, and upload them to the notebook's working directory.

In [None]:
import gdown
# Local file name -> Google Drive URL it is downloaded from.
files = {
    "attention_masks.pkl": "https://drive.google.com/uc?id=1xxv8TWjycS3xRt-9_QrjuGeCsn5lMjge",
    "sentiments_encoded.pkl": "https://drive.google.com/uc?id=1W4bPBy0AO9RKVa1dTiUwp7KIi3LHTIcQ",
    "token_types.pkl": "https://drive.google.com/uc?id=17aTXUlBWWA7eCy2lb7Zwg7APoe0VVqyY",
    "tweets_encoded.pkl": "https://drive.google.com/uc?id=1ZKrDAtIIf5nXeRkAQ9o0wThwwJMPGylX",
}

# Fetch every file, saving it under its dictionary key in the working directory.
for fname, url in files.items():
    gdown.download(url=url, output=fname)

# Load the tokenized tweets

In [None]:
def load_from_pickle():
    """Load the four pickled arrays from the current working directory.

    Reads, in this order, 'tweets_encoded.pkl', 'attention_masks.pkl',
    'token_types.pkl' and 'sentiments_encoded.pkl', printing a confirmation
    line after each one has been unpickled.

    Returns:
        tuple: (tweets_encoded, attention_masks, token_types, sentiments_encoded)
        exactly as stored in the pickle files.

    Raises:
        OSError: if any of the files cannot be opened.
    """
    # (path, message printed once the file has been unpickled)
    sources = (
        ('tweets_encoded.pkl', "Loaded tweets_encoded"),
        ('attention_masks.pkl', "Loaded attention_masks"),
        ('token_types.pkl', "Loaded token_types"),
        ('sentiments_encoded.pkl', "Loaded sentiments_encoded"),
    )

    loaded = []
    for path, message in sources:
        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # use this with files from a trusted source.
        with open(path, 'rb') as handle:
            loaded.append(pickle.load(handle))
            print(message)

    tweets_encoded, attention_masks, token_types, sentiments_encoded = loaded
    return tweets_encoded, attention_masks, token_types, sentiments_encoded

print("Pickle!")

# Unpickle everything once and keep the "all_*" names as the untouched originals.
(
    all_tweets_encoded,
    all_attention_masks,
    all_token_types,
    all_sentiments_encoded,
) = load_from_pickle()

# Working aliases: these refer to the same objects as the "all_*" names.
tweets_encoded = all_tweets_encoded
attention_masks = all_attention_masks
token_types = all_token_types
sentiments_encoded = all_sentiments_encoded

# Model inputs are the pair [input ids, attention masks]; token types are not used.
tweets = [tweets_encoded, attention_masks]
sentiments = sentiments_encoded

# Define the number of GPUs and the batch size you can support


In [None]:
# Sequence length used for the model's `input_ids` / `attention_mask` inputs.
# NOTE(review): presumably matches the padded length of the pickled tweets — confirm.
max_length = 512
# Number of GPUs assumed when sizing the batch. This is a hard-coded value; it is
# only used to scale `batch_size` below.
num_gpu = 8
# Batch size passed to `Dataset.batch`: 30 examples per assumed GPU.
batch_size = 30 * num_gpu

# Maximum number of training epochs passed to `model.fit`.
EPOCHS=10

# Helper function

In [None]:
def map_example_to_dict(input_ids, attention_masks, label):
    """Repackage one example as a (features, label) pair.

    Args:
        input_ids: value stored under the "input_ids" feature key.
        attention_masks: value stored under the "attention_mask" feature key.
        label: target value, passed through unchanged.

    Returns:
        tuple: (features dict with keys "input_ids" and "attention_mask", label).
    """
    features = {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
    }
    return features, label

# Make train, validation, and test data
The split is random and stratified by label, giving approximately 90% train, 5% validation, and 5% test.

In [None]:
def _stratified_split(tweets, sentiments, ratio):
    """Randomly split examples into a selected part and a remainder, per label block.

    The first `sentiments.count(1)` positions are treated as the positive block
    and the following `sentiments.count(0)` positions as the negative block, so
    the split is only truly stratified when the labels are ordered that way.
    Positions beyond those two blocks are ignored.

    The NumPy global RNG is reseeded (10 for the positive block, 11 for the
    negative block), so the result is deterministic for a given input.

    Args:
        tweets: pair ``[input_ids, attention_masks]`` of equally indexed sequences.
        sentiments: list of labels aligned with the entries of `tweets`.
        ratio: probability that an example goes to the selected part.

    Returns:
        ``((selected_tweets, selected_sentiments), (rest_tweets, rest_sentiments))``
        where each ``*_tweets`` is again a pair of lists.
    """
    # Use the counts directly. Slicing with `sentiments[-count:]` returns the
    # WHOLE list when count == 0, which over-counted the negative block and
    # caused an IndexError for inputs without 0-labels.
    n_pos = sentiments.count(1)
    n_neg = sentiments.count(0)

    np.random.seed(10)
    pos_mask = np.random.rand(n_pos) < ratio
    np.random.seed(11)
    neg_mask = np.random.rand(n_neg) < ratio

    selected_tweets, selected_sentiments = [[], []], []
    rest_tweets, rest_sentiments = [[], []], []

    # Positives occupy indices [0, n_pos), negatives [n_pos, n_pos + n_neg);
    # walking the concatenated mask keeps the original output ordering.
    for idx, keep in enumerate(np.concatenate([pos_mask, neg_mask])):
        if keep:
            dst_tweets, dst_sentiments = selected_tweets, selected_sentiments
        else:
            dst_tweets, dst_sentiments = rest_tweets, rest_sentiments
        dst_tweets[0].append(tweets[0][idx])
        dst_tweets[1].append(tweets[1][idx])
        dst_sentiments.append(sentiments[idx])

    return (selected_tweets, selected_sentiments), (rest_tweets, rest_sentiments)


def _to_dataset(tweets, sentiments):
    """Build a tf.data.Dataset of (features dict, label) from a tweets pair and labels."""
    slices = (tweets[0], tweets[1], sentiments)
    return tf.data.Dataset.from_tensor_slices(slices).map(map_example_to_dict)


def make_train_data(start_tweets, start_sentiments):
    """Split the data into train / validation / test tf.data datasets.

    Each example goes to train with probability 0.9; the remainder is then split
    with probability 0.5 between validation and test. Both splits are drawn
    separately for the positive and negative label blocks (see
    `_stratified_split`) with fixed seeds, so the outcome is reproducible.

    Args:
        start_tweets: pair ``[input_ids, attention_masks]``.
        start_sentiments: list of 0/1 labels aligned with `start_tweets`.
            Stratification assumes all 1-labels come first, followed by the
            0-labels.

    Returns:
        ``((train_ds, train_labels), (val_ds, val_labels), (test_ds, test_labels))``.
        The train and validation datasets are shuffled with a buffer covering
        the whole split; the test dataset keeps its order. Datasets are unbatched.
    """
    # Show both ends of the label list so the expected 1...0 ordering can be eyeballed.
    print(start_sentiments[:10])
    print(start_sentiments[-10:])

    (train_tweets, train_sentiments), (rest_tweets, rest_sentiments) = _stratified_split(
        start_tweets, start_sentiments, 0.9
    )

    print(rest_sentiments[:10])
    print(rest_sentiments[-10:])

    # The hold-out pool is halved into validation and test.
    (validation_tweets, validation_sentiments), (test_tweets, test_sentiments) = _stratified_split(
        rest_tweets, rest_sentiments, 0.5
    )

    print("I have: Train:"+str(len(train_sentiments)) + " Validation:" +str(len(validation_sentiments)) + " Test:" +str(len(test_sentiments)))

    train_tweets_ds = _to_dataset(train_tweets, train_sentiments).shuffle(len(train_sentiments))
    print("Train loaded")
    validation_tweets_ds = _to_dataset(validation_tweets, validation_sentiments).shuffle(len(validation_sentiments))
    print("Validation loaded")
    test_tweets_ds = _to_dataset(test_tweets, test_sentiments)
    print("Test loaded")
    return (train_tweets_ds, train_sentiments), (validation_tweets_ds, validation_sentiments), (test_tweets_ds, test_sentiments)
    
# Unbatched datasets ("_unb") together with the plain label lists of each split.
(
    (ds_train_encoded_unb, ts),
    (ds_val_encoded_unb, vs),
    (ds_test_encoded_unb, tss),
) = make_train_data(tweets, sentiments)

# Batch the data
To change the batch size, edit `batch_size` in the configuration cell above.

In [None]:
# Group each split into batches of `batch_size` examples (last batch may be smaller).
ds_train_encoded, ds_test_encoded, ds_val_encoded = (
    unbatched.batch(batch_size)
    for unbatched in (ds_train_encoded_unb, ds_test_encoded_unb, ds_val_encoded_unb)
)

# Create the classifier which includes the DistilBERT model

In [None]:
def create_model():
    """Build the DistilBERT-based two-class classifier.

    The pretrained 'distilbert-base-uncased' encoder receives `input_ids` and
    `attention_mask` inputs of length `max_length`. The vector at sequence
    position 0 of the encoder's first output (presumably the [CLS] token
    representation) feeds a 512-unit ReLU layer, dropout of 0.5 and a final
    softmax layer; both dense layers use L2 kernel regularisation of 0.01.

    Returns:
        tf.keras.Model: uncompiled model whose output is a softmax distribution
        over the two classes (probabilities, not logits).
    """
    num_classes = 2
    l2_strength = 0.01

    encoder = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

    token_ids = Input(shape=(max_length,), dtype='int64', name='input_ids')
    attn_mask = Input(shape=(max_length,), dtype='int64', name='attention_mask')

    # First element of the encoder output, then position 0 along the sequence axis.
    encoder_output = encoder(token_ids, attention_mask=attn_mask)[0]
    first_position = encoder_output[:, 0, :]

    hidden = Dense(
        512,
        activation='relu',
        kernel_regularizer=regularizers.l2(l2_strength),
    )(first_position)
    hidden = Dropout(0.5)(hidden)
    probabilities = Dense(
        num_classes,
        activation='softmax',
        kernel_regularizer=regularizers.l2(l2_strength),
    )(hidden)

    return tf.keras.Model(inputs=[token_ids, attn_mask], outputs=probabilities)

# Define strategy for training in parallel

In [None]:
# Log which device every op is placed on (verbose, but useful to verify GPU usage).
tf.debugging.set_log_device_placement(True)

gpus = tf.config.list_logical_devices('GPU')
if len(gpus) > 1:
    # Replicate the model across all visible GPUs.
    strategy = tf.distribute.MirroredStrategy(gpus)
else:
    # Zero or one GPU: use TensorFlow's default (no-op) strategy so that `model`
    # is still created. Previously nothing was built in this case and every
    # later cell failed with a NameError.
    strategy = tf.distribute.get_strategy()

# Variables (model weights, optimizer slots, metric state) must be created
# inside the strategy scope.
with strategy.scope():
    model = create_model()
    # The model's last layer already applies softmax, so the loss must be told
    # it receives probabilities, not logits.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

    model.compile(loss=loss,optimizer=optimizer, metrics=[metric])


In [None]:
# Print the layer-by-layer overview of the compiled classifier.
model.summary()

# Fine-tune the model
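Early stopping is used to prevent overfitting: training stops once the validation accuracy has not improved for 3 consecutive epochs, and the best weights are restored.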

In [None]:
# Stop once validation accuracy has not improved for 3 epochs and roll back to
# the best weights seen so far.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

bert_history = model.fit(
    ds_train_encoded,
    validation_data=ds_val_encoded,
    epochs=EPOCHS,
    callbacks=[stop_early],
)

# Save the model

In [None]:
# Persist only the weights (HDF5 file); the architecture must be rebuilt with
# create_model() before these weights can be loaded again.
model.save_weights("dbert_b30.h5")

In [None]:
# Rebuild the architecture from scratch and restore the fine-tuned weights.
loaded_model = create_model()

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# The model's last layer already applies softmax, so the loss receives
# probabilities; from_logits=True would apply softmax a second time and
# distort the reported loss.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

loaded_model.load_weights("dbert_b30.h5")

# Compiling is required before evaluate() can report loss and accuracy.
loaded_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Evaluate the model

In [None]:
# Evaluate the model restored from the saved weights on the held-out test batches.
loaded_model.evaluate(ds_test_encoded)

In [None]:
# Evaluate the in-memory trained model on the same test batches, for comparison
# with the reloaded model.
model.evaluate(ds_test_encoded)