In [None]:
# Load the multilingual Universal Sentence Encoder (large, v3) from TensorFlow Hub.
# The resulting callable `use` is invoked on individual tweets by MUSE() in this file.
# NOTE(review): the TF Hub page for this model says `tensorflow_text` must be imported
# before loading so its custom ops are registered — confirm it is imported above.
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

In [None]:
# input_ids are the numeric representations of the tokens.
# attention_mask is needed when padding is added to the input tokens: it tells which input_ids correspond to padding.
# Padding is added because all input sentences must have the same length (at least within a batch) so that they can be stacked into tensor objects properly.
# Use the tokenizer.encode_plus function to obtain input_ids and attention_mask.

def encode(data, tokenizer, maxlength):
    """Tokenize texts into fixed-length id and attention-mask tensors.

    Every text is truncated and padded to exactly ``maxlength`` tokens so the
    per-text lists can be stacked into rectangular tensors.

    Args:
        data: Iterable of raw text strings.
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword interface (``truncation``, ``padding``, ...).
        maxlength (int): Target sequence length after truncation/padding.

    Returns:
        tuple: ``(input_ids, attention_mask)``, both ``torch.long`` tensors
        with one row per input text.
    """
    input_ids = []
    attention_mask = []
    for text in data:
        tokenized_text = tokenizer.encode_plus(
            text,
            truncation=True,
            max_length=maxlength,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            # `padding='max_length'` is the supported spelling of the
            # deprecated `pad_to_max_length=True` flag.
            padding='max_length',
            return_attention_mask=True,
        )
        input_ids.append(tokenized_text['input_ids'])
        attention_mask.append(tokenized_text['attention_mask'])

    return (
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(attention_mask, dtype=torch.long),
    )

In [None]:
def get_batches(df, tokenizer, maxlength, batchsize):
    """Build a randomly-sampled DataLoader over the tweets in ``df``.

    Args:
        df: DataFrame with a 'Tweet' text column and a 'Stance' label column.
        tokenizer: Tokenizer forwarded to ``encode``.
        maxlength (int): Sequence length forwarded to ``encode``.
        batchsize (int): Number of examples per batch.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, label)`` batches,
        where each label is the position of the stance in the module-level
        ``dataset['labels']`` list.
    """
    tweets = list(df['Tweet'].values)
    label_positions = [dataset['labels'].index(stance) for stance in df['Stance']]
    targets = torch.tensor(label_positions, dtype=torch.long)

    token_ids, masks = encode(tweets, tokenizer, maxlength)
    examples = TensorDataset(token_ids, masks, targets)
    return DataLoader(
        examples,
        sampler=RandomSampler(examples),
        batch_size=batchsize,
    )

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per epoch with the validation loss and the model.
    Whenever the loss improves, the model's ``state_dict`` is written to
    ``path``; after ``patience`` consecutive calls without improvement the
    ``early_stop`` flag is set to True.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stopping state.

        Args:
            val_loss (float): Validation loss of the epoch just finished.
            model: Object exposing ``state_dict()``; checkpointed on improvement.
        """
        # Scores are negated losses, so "higher is better".
        score = -val_loss

        if self.best_score is None:
            # First call: always take it as the baseline.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best score by at least `delta`.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max over axis 1 equals its label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of integer labels; flattened before comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [None]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as the string form of a
    ``datetime.timedelta`` (h:mm:ss, prefixed with "N day(s), " for
    durations of a day or more).
    '''
    # Whole seconds only: round first, then build the timedelta.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def evaluation_summary(description, true_labels, predictions, target_classes):
    """Print a titled classification report.

    Args:
        description (str): Title printed after "Evaluation for: ".
        true_labels: Ground-truth labels.
        predictions: Predicted labels.
        target_classes: Display names passed as ``target_names``.
    """
    header = "Evaluation for: " + description
    print(header)
    report = classification_report(
        true_labels,
        predictions,
        target_names=target_classes,
        digits=3,
        zero_division=0,
    )
    print(report)

In [None]:
def build_optimizer(model, optimizer, learning_rate, epsilon, weight_decay):
    """Create the optimizer selected by name.

    Args:
        model: Model whose ``parameters()`` are optimized.
        optimizer: ``"adam"`` to build a ``transformers.AdamW`` instance.
            A non-string object is returned unchanged, as before.
        learning_rate (float): Passed as ``lr``.
        epsilon (float): Passed as ``eps``.
        weight_decay (float): Passed as ``weight_decay``.

    Returns:
        The constructed optimizer, or the object passed in.

    Raises:
        ValueError: If ``optimizer`` is a string other than ``"adam"``.
            Previously the bare name was returned and only failed later,
            when the training loop called a method on it.
    """
    if optimizer == "adam":
        # NOTE(review): recent transformers releases deprecate their own AdamW in
        # favour of torch.optim.AdamW, which has no `correct_bias` switch —
        # confirm against the pinned transformers version before migrating.
        return transformers.AdamW(
            model.parameters(),
            lr=learning_rate,
            correct_bias=False,
            weight_decay=weight_decay,
            eps=epsilon,
        )
    if isinstance(optimizer, str):
        raise ValueError(f"Unsupported optimizer name: {optimizer!r} (expected 'adam')")
    return optimizer

In [None]:
def build_scheduler(model, optimizer, epochs, train_batch_len):
    """Create a linear learning-rate schedule with a 10% warm-up phase.

    Args:
        model: Unused; kept for interface compatibility.
        optimizer: Optimizer whose learning rate is scheduled.
        epochs (int): Number of training epochs.
        train_batch_len (int): Number of optimizer steps per epoch.

    Returns:
        The scheduler built by ``transformers.get_linear_schedule_with_warmup``.
    """
    total_steps = train_batch_len * epochs
    warmup_steps = int(0.1 * total_steps)
    return transformers.get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

In [None]:
def build_model(model_name, label_count):
    """Load a pretrained sequence-classification model onto the global ``device``.

    Args:
        model_name: Identifier passed to ``from_pretrained``.
        label_count (int): Number of output labels.

    Returns:
        The model, configured to also return hidden states and attentions,
        after being moved to ``device``.
    """
    load_options = {
        'num_labels': label_count,
        'output_hidden_states': True,
        'output_attentions': True,
    }
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name, **load_options)
    return classifier.to(device)

In [None]:
def train_model(batch, val_batch, model, optimizer, scheduler, epochs, device, max_grad_norm, patience=5):
    """Train ``model`` with per-epoch validation and early stopping.

    Args:
        batch: Sized iterable of training batches, each an
            ``(input_ids, attention_mask, labels)`` tuple of tensors.
        val_batch: Sized iterable of validation batches in the same format.
        model: Callable taking ``input_ids``, ``attention_mask`` and ``labels``
            keywords and returning an object with ``loss`` and ``logits``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        epochs (int): Maximum number of epochs.
        device: Device every batch tensor is moved to.
        max_grad_norm (float): Gradient-norm clipping threshold.
        patience (int): Epochs without validation-loss improvement tolerated
            before stopping. Default: 5.

    Returns:
        tuple: ``(train_loss_arr, val_loss, val_accuracy, training_stats, model)``.
        The three lists and ``training_stats`` hold one entry per completed
        epoch, including the epoch that triggered early stopping. ``model``
        carries the weights of the best checkpoint if at least one epoch ran.
    """
    # Function-scope import: gridsearch() in this file calls a bare `time()`,
    # so the module-level name `time` cannot be relied on to be the module.
    import time

    train_loss_arr = []
    val_loss = []
    val_accuracy = []
    training_stats = []

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=patience, verbose=True)

    for e in range(epochs):
        train_start = time.time()
        print("\nepoch : ", e+1)
        # len(batch) is the number of batches per epoch; the label text is kept as-is.
        print("batch size : ",len(batch))
        running_loss = 0.0
        model.train()  # Set the mode to training
        for i, batch_tuple in enumerate(batch):
            input_ids, attention_mask, labels = (t.to(device) for t in batch_tuple)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            running_loss += outputs.loss.item()
            if i % 500 == 0:
                print("Training loss - {0}, iteration - {1}/{2}".format(outputs.loss, e + 1, i))
            model.zero_grad()
            optimizer.zero_grad()
            outputs.loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
        avg_train_loss = running_loss / len(batch)
        train_loss_arr.append(avg_train_loss)
        training_time = format_time(time.time() - train_start)

        # Restart the clock so "Validation Time" covers the validation pass only
        # (it used to be measured from the start of the epoch).
        validation_start = time.time()
        model.eval()
        total_eval_loss = 0
        total_eval_accuracy = 0
        # Evaluate data for one epoch
        for i, batch_cpu in enumerate(val_batch):
            input_ids_gpu, attention_mask, labels = (t.to(device) for t in batch_cpu)
            with torch.no_grad():
                outputs = model(input_ids=input_ids_gpu, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()
            if i % 100 == 0:
                print("Validation loss - {0}, iteration - {1}/{2}".format(outputs.loss, e + 1, i))
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)
        avg_val_accuracy = total_eval_accuracy / len(val_batch)
        val_accuracy.append(avg_val_accuracy)
        avg_val_loss = total_eval_loss / len(val_batch)
        val_loss.append(avg_val_loss)
        validation_time = format_time(time.time() - validation_start)

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        early_stopping(avg_val_loss, model)

        # Record the epoch before a possible break so training_stats stays in
        # step with train_loss_arr / val_loss / val_accuracy.
        training_stats.append(
            {
                'epoch': e + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

        if early_stopping.early_stop:
            print("Early stopping")
            break

    # Load the best checkpoint from the path EarlyStopping actually wrote to.
    # Skipped when no epoch ran, because no checkpoint exists for this run.
    if early_stopping.best_score is not None:
        model.load_state_dict(torch.load(early_stopping.path))

    return train_loss_arr, val_loss, val_accuracy, training_stats, model

In [None]:
def prediction(batch, model, device):
    """Run ``model`` over every batch and collect per-example results.

    Args:
        batch: Iterable of ``(input_ids, attention_mask, labels)`` tensor tuples.
        model: Callable returning an object with ``logits`` and ``attentions``.
        device: Device each batch tensor is moved to before the forward pass.

    Returns:
        tuple: four lists accumulated across all batches —
        input ids, arg-max predicted labels, true labels, and the slice
        ``[:, -1, 0]`` of the last entry of ``outputs.attentions``.
    """
    all_input_ids, all_predictions, all_true_labels, all_attentions = [], [], [], []
    model.eval()
    for batch_cpu in batch:
        ids_dev, mask_dev, labels_dev = (t.to(device) for t in batch_cpu)
        with torch.no_grad():
            outputs = model(input_ids=ids_dev, attention_mask=mask_dev, labels=labels_dev)
            predicted = torch.argmax(outputs.logits.cpu(), dim=1)
            # Last attention entry; keep the last index on axis 1 and index 0 on axis 2.
            last_attention = outputs.attentions[-1].cpu()
            all_input_ids.extend(ids_dev.cpu().tolist())
            all_predictions.extend(predicted.tolist())
            all_true_labels.extend(labels_dev.cpu().tolist())
            all_attentions.extend(last_attention[:, -1, 0].tolist())
    return all_input_ids, all_predictions, all_true_labels, all_attentions

In [None]:
# Fetch the NLTK stop-word corpus; this must run before the lists below are read.
nltk.download('stopwords')
# One stop-word list per language; clean() and clean_mul() pick one by language code.
stopword_es = nltk.corpus.stopwords.words('spanish')
stopword_it = nltk.corpus.stopwords.words('italian')
stopword_fr = nltk.corpus.stopwords.words('french')
stopword_en = nltk.corpus.stopwords.words('english')
stopword_ca = nltk.corpus.stopwords.words('catalan')

# Punctuation patterns used by the cleaning functions.
# Raw strings are used so the regex escapes (\. \; \s ...) are not parsed as
# Python string escapes, which is an invalid-escape warning on current Python.
# Characters matched here are deleted outright.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
# Sequences matched here are replaced by a single space.
# NOTE(review): the last alternative `(:).` matches a colon plus the character
# after it, and the first alternative is not a well-formed `<br />` pair — this
# looks like a mistyped pattern; confirm the intent before changing it.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")

def _normalize_tweet(text, lang):
    """Lower-case ``text``, strip punctuation and drop stop words for ``lang``."""
    text = REPLACE_NO_SPACE.sub("", text.lower())  # convert all tweets to lower case
    text = REPLACE_WITH_SPACE.sub(" ", text)

    # Looked up at call time so the module-level lists are read as they are
    # when the function runs.
    stopwords_by_lang = {
        'EN': stopword_en,
        'IT': stopword_it,
        'FR': stopword_fr,
        'CA': stopword_ca,
        'SP': stopword_es,
    }
    # Unknown language codes fall back to "no stop words" instead of failing
    # on an unassigned local, as the original if/elif chain did.
    stopword_set = set(stopwords_by_lang.get(lang, ()))

    # The filtered text must be returned: the original built this string and
    # discarded it, so stop words were never actually removed.
    return " ".join(word for word in text.split() if word not in stopword_set)


def clean(text, lang):
    """Clean one tweet.

    Args:
        text (str): Raw tweet text.
        lang (str): Language code: 'EN', 'IT', 'FR', 'CA' or 'SP'. Any other
            value skips stop-word removal.

    Returns:
        str: Lower-cased text with punctuation removed, stop words of the
        given language dropped, and words separated by single spaces.
    """
    return _normalize_tweet(text, lang)


def clean_mul(row):
    """Clean one row of a multilingual frame.

    Args:
        row: Mapping with a 'Tweet' text entry and a 'Language' code entry
            (for example a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns:
        str: The cleaned tweet, exactly as ``clean(row['Tweet'], row['Language'])``.
    """
    return _normalize_tweet(row['Tweet'], row['Language'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def _embed_tweets(tweets):
    """Call ``use`` on each text in turn and stack the flattened embeddings."""
    return np.array([tf.reshape(use(tweet), [-1]).numpy() for tweet in tqdm(tweets)])


def MUSE(train, val, test):
    """Embed the tweets of three data splits with the module-level ``use`` model.

    Args:
        train, val, test: Frames exposing ``Tweet`` and ``Stance`` columns.

    Returns:
        tuple: ``(X_train, y_train, X_val, y_val, X_test, y_test)`` where each
        ``X_*`` is an array with one flattened embedding per tweet and each
        ``y_*`` is the split's ``Stance`` values.
    """
    X_train = _embed_tweets(train.Tweet.values)
    y_train = train.Stance.values

    X_val = _embed_tweets(val.Tweet.values)
    y_val = val.Stance.values

    X_test = _embed_tweets(test.Tweet.values)
    y_test = test.Stance.values

    return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects the entry stored under a fixed key.

    For data grouped by feature, ``transform`` returns ``data_dict[key]``,
    so a pipeline branch can work on that one entry.
    """
    def __init__(self, key):
        # Key looked up in the object passed to transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        return data_dict[self.key]

In [None]:
def f1score(truelabel, prediction):
    """Print micro- and macro-averaged precision, recall and F1-score.

    Args:
        truelabel: Ground-truth labels.
        prediction: Predicted labels.
    """
    metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    for position, averaging in enumerate(("micro", "macro")):
        if position:
            # Separator line between the two averaging modes.
            print("............................")
        heading = averaging.capitalize()
        for metric_name, metric_fn in metrics:
            print(f"{heading} Average {metric_name} : ", metric_fn(truelabel, prediction, average=averaging))

In [None]:
def plotEarlyStopCheckpoint(train_loss_arr, val_loss, imgName):
    """Plot both loss curves and mark the epoch with the lowest validation loss.

    Args:
        train_loss_arr (list): Training loss per epoch.
        val_loss (list): Validation loss per epoch.
        imgName: Destination passed to ``fig.savefig``.
    """
    fig = plt.figure(figsize=(8, 5))

    n_train_epochs = len(train_loss_arr)
    train_epochs = range(1, n_train_epochs + 1)
    val_epochs = range(1, len(val_loss) + 1)
    plt.plot(train_epochs, train_loss_arr, label='Training Loss')
    plt.plot(val_epochs, val_loss, label='Validation Loss')

    # 1-based epoch at which the validation loss was lowest
    best_epoch = val_loss.index(min(val_loss)) + 1
    plt.axvline(best_epoch, linestyle='--', color='r', label='Early Stopping Checkpoint')

    plt.xlabel('epochs')
    plt.ylabel('loss')
    # fixed axis limits keep plots from different runs comparable
    plt.ylim(0, 1.5)
    plt.xlim(0, n_train_epochs + 1)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
    fig.savefig(imgName, bbox_inches='tight')

In [None]:
def lossCurve(train_loss_arr, val_loss, imgName):
    """Plot training and validation loss per epoch, show it and save it.

    Args:
        train_loss_arr: Training loss per epoch; its length sets the x-axis.
        val_loss: Validation loss per epoch; must match that length.
        imgName: Destination passed to ``fig.savefig``.
    """
    epoch_range = range(1, len(train_loss_arr) + 1)
    # Keep a handle on the figure: it was previously discarded, so the
    # `fig.savefig` call below referred to an undefined name.
    fig = plt.figure(figsize=(10, 5))
    plt.plot(epoch_range, train_loss_arr, '-o', label="train")
    plt.plot(epoch_range, val_loss, '-o', label="validation")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Loss change over epoch")
    plt.legend()
    plt.show()
    fig.savefig(imgName, bbox_inches='tight')

In [None]:
def accuracyCurve(train_loss_arr, val_accuracy, imgName):
    """Plot validation accuracy per epoch, show it and save it.

    Args:
        train_loss_arr: Training loss per epoch; only its length is used,
            to build the x-axis.
        val_accuracy: Validation accuracy per epoch; must match that length.
        imgName: Destination passed to ``fig.savefig``.
    """
    epoch_range = range(1, len(train_loss_arr) + 1)
    # Keep a handle on the figure: it was previously discarded, so the
    # `fig.savefig` call below referred to an undefined name.
    fig = plt.figure(figsize=(10, 5))
    plt.plot(epoch_range, val_accuracy, '-o', label="validation")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title("Accuracy over epoch")
    plt.legend()
    plt.show()
    fig.savefig(imgName, bbox_inches='tight')

In [None]:
def timer(training_stats):
    """Print the per-epoch training statistics as a table indexed by epoch.

    Args:
        training_stats: List of per-epoch dicts, each containing an 'epoch'
            key (as produced by ``train_model``).

    Side effects:
        Sets the global pandas option ``display.precision`` to 2.
    """
    # Display floats with two decimal places. The fully-qualified option name
    # is required: the short form 'precision' matches more than one option on
    # current pandas and is rejected.
    pd.set_option('display.precision', 2)

    # Create a DataFrame from our training statistics.
    df_stats = pd.DataFrame(data=training_stats)

    # Use the 'epoch' as the row index.
    df_stats = df_stats.set_index('epoch')

    # Display the table.
    print(df_stats)

In [None]:
def roc(true_labels, predictions, label):
    """Plot the ROC curve for one positive class.

    Args:
        true_labels: Ground-truth labels.
        predictions: Scores or predicted labels passed to ``roc_curve``.
        label: Value treated as the positive class (``pos_label``).
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(true_labels, predictions, pos_label=label)
    display = RocCurveDisplay(fpr=false_pos_rate, tpr=true_pos_rate)
    display.plot()

In [None]:
def gridsearch(prediction_pipeline, parameter, train, Y_train):
    """Grid-search ``prediction_pipeline`` over ``parameter`` and report the result.

    Runs a 5-fold ``GridSearchCV`` scored by weighted F1 on a single job,
    printing the pipeline steps, the grid, the elapsed time, the best score
    and the best value found for every searched parameter.

    Args:
        prediction_pipeline: Pipeline exposing ``steps``.
        parameter (dict): Parameter grid passed as ``param_grid``.
        train: Training features.
        Y_train: Training labels.

    Returns:
        The fitted ``GridSearchCV`` object (previously nothing was returned,
        so the fitted search could not be reused by the caller).
    """
    # Function-scope import under an alias: train_model() in this file uses
    # `time.time()`, so the module-level name `time` cannot be relied on to be
    # the function this code originally called bare.
    import time as time_module

    # Tuning parameters to improve model performance
    grid_search = GridSearchCV(prediction_pipeline, param_grid=parameter, n_jobs=1, verbose=1, scoring='f1_weighted', cv=5)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in prediction_pipeline.steps])
    print("parameters:")
    pprint(parameter)
    t0 = time_module.time()
    grid_search.fit(train, Y_train)
    print("done in %0.3fs" % (time_module.time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameter.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    return grid_search