Code adapted from:
* https://www.youtube.com/watch?v=U51ranzJBpY [ TOKENISER ]
* https://www.kaggle.com/abhishek/bert-base-uncased-using-pytorch [ TRAINING ]
* https://www.kaggle.com/abhishek/roberta-inference-5-folds [ INFERENCE ] 
* https://www.kaggle.com/masterscrat/detect-if-notebook-is-running-interactively [ CHECK WHERE NOTEBOOK IS RUNNING ]
* https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/141502 [ SUBMISSION ]

Notes

This model is the basic XLNet implementation 0508_1, with the following changes:
* Run with train_folds_2.csv, based on updated input data (June 2nd)
* Seeding of random number generator
* Generates prediction for training data

In [1]:
!pip install transformers
!pip install tokenizers
!pip install protobuf



In [2]:
try:
    from google.colab import drive
    IN_COLAB = True
    drive.mount('/content/drive')
    !wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py
    !wget https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model



except:
    IN_COLAB = False
    
    import sys
    sys.path.append('/kaggle/input/sentencepiece-pb2/')

## Import libraries

In [3]:
from pathlib import Path
import numpy as np
import pandas as pd
import os
import tokenizers
import string
import torch
import transformers
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import re
import sentencepiece as spm
import sentencepiece_pb2
import random
from sklearn import model_selection
import gc

In [4]:
# Choose the compute device once for the whole notebook.
if not torch.cuda.is_available():
    # No CUDA device: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable: run on the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [5]:
class SentencePieceTokenizer:
    """Wrapper around a SentencePiece model that returns piece ids together with their offsets."""

    def __init__(self, model_name):
        # model_name is handed straight to SentencePieceProcessor.load
        processor = spm.SentencePieceProcessor()
        processor.load(model_name)
        self.sp = processor

    def encode(self, sentence):
        """
        Tokenise ``sentence``.

        Returns a dict with 'ids' (list of piece ids) and 'offsets'
        (list of (begin, end) pairs as reported by SentencePiece for each piece).
        """
        proto = sentencepiece_pb2.SentencePieceText()
        proto.ParseFromString(self.sp.encode_as_serialized_proto(sentence))
        pieces = list(proto.pieces)
        return {
            'ids': [piece.id for piece in pieces],
            'offsets': [(piece.begin, piece.end) for piece in pieces],
        }

In [6]:
class config:
    """Hyper-parameters, model classes and file locations for the Colab and Kaggle runtimes."""

    # sequence length, batch sizes and number of epochs
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32 #64
    VALID_BATCH_SIZE = 16
    EPOCHS = 10

    # HuggingFace classes used to build the model
    MODEL_CONFIG = transformers.XLNetConfig
    MODEL = transformers.XLNetForQuestionAnswering

    # runtime-specific locations
    if not IN_COLAB:
        BASE_PATH = Path('/kaggle')
        MODEL_PATH = BASE_PATH / "input" / "xlnetmodel06022"
        FOLDED_TRAINING_FILE = BASE_PATH / "working" / "train_folds.csv"
        TRAINING_FILE = BASE_PATH / "input" / "tweet-sentiment-extraction" / "train.csv"
        TESTING_FILE = BASE_PATH / "input" / "tweet-sentiment-extraction" / "test.csv"
        SAMPLE_SUBMISSION_FILE = BASE_PATH / "input" / "tweet-sentiment-extraction" / "sample_submission.csv"
        SUBMISSION_FILE = BASE_PATH / "working" / "submission.csv"
    else:
        BASE_PATH = Path.cwd() / "drive" / "My Drive" / "kaggle" / "tweet_sentiment_extraction"
        MODEL_PATH = BASE_PATH / "model_save" / "model_0602_2"
        FOLDED_TRAINING_FILE = BASE_PATH / "input" / "train-5fold" / "train_folds.csv"
        TRAINING_FILE = BASE_PATH / "input" / "train.csv"
        TESTING_FILE = BASE_PATH / "input" / "test.csv"
        SAMPLE_SUBMISSION_FILE = BASE_PATH / "input" / "sample_submission.csv"
        SUBMISSION_FILE = BASE_PATH / "input" / "submission.csv"

    # the pretrained weights and the tokenizer live under the same relative path on both runtimes
    PRETRAINED_MODEL_DIR = BASE_PATH / "input" / "xlnetbasecased"
    TOKENIZER = SentencePieceTokenizer(str(PRETRAINED_MODEL_DIR / "xlnet-base-cased-spiece.model"))

In [7]:
# Sanity check: show the pieces behind vocabulary ids 0-9. process_data hard-codes
# ids 3 (<cls>), 4 (<sep>) and 5 (<pad>), so they must match this listing.
[config.TOKENIZER.sp.id_to_piece(x) for x in range(0,10)]

['<unk>',
 '<s>',
 '</s>',
 '<cls>',
 '<sep>',
 '<pad>',
 '<mask>',
 '<eod>',
 '<eop>',
 '.']

In [8]:
# Look up the vocabulary ids of the three sentiment labels; process_data hard-codes
# these values in its sentiment_id table.
[config.TOKENIZER.sp.piece_to_id(x) for x in ['positive', 'negative', 'neutral']]

[19036, 25976, 24734]

In [9]:
def seed_everything(seed_value):
    """
    Seed every random number generator used in this notebook.

    Args:
        seed_value (int): seed applied to Python's ``random``, NumPy and PyTorch
                          (CPU and, when available, all CUDA devices).
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Only affects Python processes started after this point (e.g. subprocesses);
    # the hash seed of the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # cuDNN autotuning (benchmark=True) may select different kernels from run to run,
    # which defeats `deterministic`; it has to be off for reproducible results.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed = 42
seed_everything(seed)

In [10]:
def create_train_folds():
    """
    Read config.TRAINING_FILE, drop rows with missing values, shuffle, assign each row
    to one of 5 sentiment-stratified folds (column ``kfold``) and write the result to
    config.FOLDED_TRAINING_FILE.
    """
    df = pd.read_csv(config.TRAINING_FILE)
    df = df.dropna().reset_index(drop=True)
    df["kfold"] = -1

    # Seed the shuffle explicitly so the folds do not depend on how much of the
    # global NumPy random state earlier cells have consumed.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # The rows are already shuffled above. Passing random_state to StratifiedKFold
    # without shuffle=True has no effect and is rejected by recent scikit-learn.
    kf = model_selection.StratifiedKFold(n_splits=5)

    for fold, (trn_, val_) in enumerate(kf.split(X=df, y=df.sentiment.values)):
        print(len(trn_), len(val_))
        df.loc[val_, 'kfold'] = fold

    df.to_csv(config.FOLDED_TRAINING_FILE, index=False)

create_train_folds()



21984 5496
21984 5496
21984 5496
21984 5496
21984 5496


## Utils

In [11]:
class AverageMeter:
    """
    Keeps the latest value together with the running weighted sum, count and mean
    """
    def __init__(self):
        self.reset()

    def reset(self):
        # every statistic starts again from zero
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        # `val` is recorded as the current value and counted `n` times in the mean
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def jaccard(str1, str2):
    """
    Word-level Jaccard similarity of two strings (case-insensitive, whitespace split).

    Returns len(intersection) / len(union) of the two word sets. When neither string
    contains a word the two sets are identical, so 1.0 is returned instead of
    raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size


class EarlyStopping:
    # https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf is the canonical spelling; the np.Inf alias was removed in NumPy 2.0
        self.val_loss_min = np.inf
        self.delta = delta

    def __call__(self, val_loss, model, name):
        """
        Record one validation result.

        Args:
            val_loss (float): validation loss of the current epoch (lower is better).
            model: object whose ``state_dict()`` is saved on improvement.
            name: destination passed to ``torch.save`` for the checkpoint.

        Sets ``self.early_stop`` to True once ``patience`` consecutive calls
        have failed to improve on the best score.
        """
        # the score is the negated loss, so "higher is better"
        score = -val_loss

        if self.best_score is None:
            # first call: always checkpoint
            self.best_score = score
            self.save_checkpoint(val_loss, model, name)
        elif score < self.best_score + self.delta:
            # no sufficient improvement
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # improvement: checkpoint and restart the patience counter
            self.best_score = score
            self.save_checkpoint(val_loss, model, name)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, name):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), name)
        self.val_loss_min = val_loss

## Data processing

In [12]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    """
    Preprocessing the data to the XLNet model formatting

    Args:
        tweet: raw tweet text (converted with ``str`` and whitespace-normalised).
        selected_text: the span of ``tweet`` to extract (normalised the same way).
        sentiment (str): one of 'positive', 'negative', 'neutral'.
        tokenizer: object whose ``encode(text)`` returns a dict with 'ids' and 'offsets'.
        max_len (int): sequences shorter than this are padded up to it.
                       Longer sequences are NOT truncated by this function.

    Returns:
        dict with 'ids', 'mask', 'token_type_ids', 'offsets' (lists), 'targets_start',
        'targets_end' (token positions within 'ids'), and the normalised 'orig_tweet',
        'orig_selected' and the 'sentiment' string.

    Raises:
        KeyError: if ``sentiment`` is not one of the three known labels.
    """
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    # find the first occurrence of selected_text (without its artificial leading space) in tweet
    needle = selected_text[1:]
    idx0 = tweet.find(needle) if needle else -1

    # create character mask for selected_text in tweet
    char_targets = [0] * len(tweet)
    if idx0 >= 0:
        for ct in range(idx0, idx0 + len(needle)):
            char_targets[ct] = 1

    # replace the mis-encoded sequences with same-length ASCII placeholders ("@@@" / "Ai"),
    # presumably so that the offsets returned by the tokenizer line up with positions in tweet
    tok_tweet = tokenizer.encode(re.sub(r'(Â¡)', 'Ai', re.sub(r'(ï¿½)', "@@@", tweet)))

    input_ids_orig = tok_tweet['ids']
    tweet_offsets = tok_tweet['offsets']

    # tokens that overlap at least one selected character
    target_idx = [
        j for j, (offset1, offset2) in enumerate(tweet_offsets)
        if sum(char_targets[offset1: offset2]) > 0
    ]

    if target_idx:
        targets_start = target_idx[0]
        targets_end = target_idx[-1]
    else:
        # The selection could not be located in the tweet (or is empty). Fall back to
        # the whole tweet instead of failing with an IndexError inside a DataLoader worker.
        targets_start = 0
        targets_end = max(len(input_ids_orig) - 1, 0)

    # vocabulary ids of the sentiment words
    sentiment_id = {
        'positive': 19036,
        'negative': 25976,
        'neutral': 24734
    }

    # https://huggingface.co/transformers/model_doc/xlnet.html#transformers.XLNetTokenizer.build_inputs_with_special_tokens
    # layout: sentiment <sep> tweet tokens <sep> <cls>   (4 = <sep>, 3 = <cls>)
    input_ids = [sentiment_id[sentiment]] + [4] + input_ids_orig + [4] + [3]
    token_type_ids = [0]*2 + [1] * (len(input_ids_orig)+1) + [2]
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 2 + tweet_offsets + [(0, 0)] * 2
    # the two prefix tokens shift every tweet token position by 2
    targets_start += 2
    targets_end += 2

    # right-pad with <pad> (id 5) up to max_len
    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([5] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

## Data loader

In [13]:
class TweetDataset:
    """Indexable dataset that turns (tweet, sentiment, selected_text) rows into model-ready tensors."""

    def __init__(self, tweet, sentiment, selected_text):
        self.tweet, self.sentiment, self.selected_text = tweet, sentiment, selected_text
        # tokenizer and sequence length come from the notebook-wide config
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        """Process row ``item`` with process_data and wrap the numeric fields in long tensors."""
        data = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        def as_long(key):
            return torch.tensor(data[key], dtype=torch.long)

        return {
            'ids': as_long("ids"),
            'mask': as_long("mask"),
            'token_type_ids': as_long("token_type_ids"),
            'targets_start': as_long("targets_start"),
            'targets_end': as_long("targets_end"),
            # text fields are passed through unchanged
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': as_long("offsets")
        }

## Loss function

In [14]:
# def loss_fn(start_logits, end_logits, start_positions, end_positions):
#     loss_fct = nn.CrossEntropyLoss()
#     start_loss = loss_fct(start_logits, start_positions)
#     end_loss = loss_fct(end_logits, end_positions)
#     total_loss = (start_loss + end_loss)
#     return total_loss

def loss_fn(start_logprobs, end_logprobs, start_positions, end_positions):
    """
    Sum of the negative log-likelihood losses of the start and the end position.

    Args:
        start_logprobs, end_logprobs: log-probabilities over positions, one row per example.
        start_positions, end_positions: target position index per example.

    Returns:
        Scalar tensor: NLL(start) + NLL(end), each averaged over the batch (NLLLoss default).
    """
    loss_fct = nn.NLLLoss()
    # the body previously referenced undefined names (start_logits / end_logits)
    start_loss = loss_fct(start_logprobs, start_positions)
    end_loss = loss_fct(end_logprobs, end_positions)
    total_loss = (start_loss + end_loss)
    return total_loss
    

## Training function

In [15]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """
    Train ``model`` for one pass over ``data_loader``.

    Args:
        data_loader: yields batches (dicts) with 'ids', 'token_type_ids', 'mask',
                     'targets_start' and 'targets_end' tensors.
        model: called with start/end positions; the first element of its output is used as the loss.
        optimizer: stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: optional learning-rate scheduler, stepped once per batch when given.
    """
    model.train()
    losses = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))

    for d in tk0:
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets_start = d["targets_start"].to(device, dtype=torch.long)
        targets_end = d["targets_end"].to(device, dtype=torch.long)

        model.zero_grad()

        # with start/end positions supplied, the loss is the first output
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            start_positions=targets_start,
            end_positions=targets_end
        )

        loss = outputs[0]
        loss.backward()

        optimizer.step()
        # the scheduler is optional (default None), so only step it when one was passed
        if scheduler is not None:
            scheduler.step()

        # running average weighted by batch size, shown in the progress bar
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg)

## Evaluation function

In [16]:
def calculate_jaccard_score(
    original_tweet,
    target_string,
    sentiment_val,
    idx_start,
    idx_end,
    offsets,
    verbose=False):
    """
    Rebuild the predicted text from token positions and score it against the target.

    The text is assembled from the ``offsets`` of tokens idx_start..idx_end (inclusive);
    neutral tweets and tweets with fewer than two words are returned whole.

    Returns:
        (jaccard score between target_string and the predicted text, predicted text)
    """
    # an inverted span collapses to the start token
    if idx_end < idx_start:
        idx_end = idx_start

    n_offsets = len(offsets)
    pieces = []
    for ix in range(idx_start, idx_end + 1):
        begin, end = offsets[ix][0], offsets[ix][1]
        pieces.append(original_tweet[begin: end])

        # a gap before the next token means a space has to be re-inserted
        if (ix + 1) < n_offsets and end < offsets[ix + 1][0]:
            pieces.append(" ")
    filtered_output = "".join(pieces)

    is_neutral = sentiment_val == "neutral"
    if is_neutral or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    # optionally print the mismatches of non-neutral tweets
    if not is_neutral and verbose == True:
        if filtered_output.strip().lower() != target_string.strip().lower():
            print("********************************")
            print(f"Output= {filtered_output.strip()}")
            print(f"Target= {target_string.strip()}")
            print(f"Tweet= {original_tweet.strip()}")
            print("********************************")

    return jaccard(target_string.strip(), filtered_output.strip()), filtered_output


def eval_fn(data_loader, model, device):
    """
    Evaluate ``model`` on ``data_loader`` without gradient tracking.

    For every batch the model is called twice: once with the target positions (the first
    output is taken as the loss) and once without them to obtain the top start/end
    candidates, from which the most probable (start, end) pair is decoded and scored
    with calculate_jaccard_score.

    Returns:
        (jaccards.avg, losses.avg): batch-size weighted averages over the loader.
    """
    model.eval()
    losses = AverageMeter()
    jaccards = AverageMeter()
    
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)

            # first pass: with the target positions supplied, the first output is used as the loss
            outputs = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids,
                start_positions=targets_start,
                end_positions=targets_end
            )
            loss = outputs[0]
            
            # run it again to get the probabilities
            # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
            outputs = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids
            )
            # start_top_probs contains the probabilities of the model.start_n_top most probable start positions
            # the documentation claims that the values are log probabilities, which seems to be incorrect given that the values are in [0-1]
            start_top_probs = outputs[0]  
          
            # start_top_index contains the model.start_n_top highest probability starting sequence positions, in decreasing order of probability
            start_top_index = outputs[1] 

            # the i-th element of start_top_index, start_top_probs are associated with elements j*model.start_n_top+i (j=1...model.end_n_top) of end_top_index, end_top_probs, where j represents the j-th highest probability end position  
            # and NOT with the i*END_N_TOP+j elements as used here https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
            # this can be verified by checking summation to unity
            end_top_probs = outputs[2] 
            end_top_index = outputs[3] 
            
            # calculate joint probability of start, end position tuples
            start_end_probs = (start_top_probs.repeat(1, model.end_n_top)*end_top_probs)

            # reshape so that probabilities are ordered by sequence position rather than probability so that we can combine with output of other models
            # flat position = end position * model.start_n_top + start position
            mapping_to_flat_sequence_position = (end_top_index*torch.tensor(model.start_n_top)).add(start_top_index.repeat(1, model.end_n_top))
            _, indices = torch.sort(mapping_to_flat_sequence_position, dim=1)

            # per-row gather: element [b, k] of the result is start_end_probs[b, indices[b, k]]
            start_end_probs_sorted = start_end_probs[torch.repeat_interleave(torch.arange(start_end_probs.shape[0]), start_end_probs.shape[1]).view(start_end_probs.shape),
                      indices]

            # get (flat) position in sequence of highest probability tuple
            top_start_end_probs_sorted = start_end_probs_sorted.argmax(dim=1)

            # convert flat position to separate start and end positions
            # NOTE(review): decodes with config.MAX_LEN while the flat position was built with
            # model.start_n_top; the two agree only because init_model sets start_n_top = MAX_LEN
            start_top_positions = (top_start_end_probs_sorted % torch.tensor(config.MAX_LEN)).cpu().detach().numpy()
            end_top_positions = (top_start_end_probs_sorted // torch.tensor(config.MAX_LEN)).cpu().detach().numpy()
            
            # score every example of the batch
            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                start_top_position = start_top_positions[px]
                end_top_position = end_top_positions[px]
            
                jaccard_score, _ = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=start_top_position,
                    idx_end=end_top_position,
                    offsets=offsets[px]
                )
                jaccard_scores.append(jaccard_score)

            # running averages weighted by batch size
            jaccards.update(np.mean(jaccard_scores), ids.size(0))
            losses.update(loss.item(), ids.size(0))
            tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
    
    print(f"Jaccard = {jaccards.avg}")
    print(f"Loss = {losses.avg}")
    return jaccards.avg, losses.avg

## Training 

In [17]:
def init_model(config):
    """
    Build the model from the pretrained directory named in ``config``.

    The model configuration is loaded first and adjusted so that hidden states are
    returned and both start_n_top and end_n_top equal config.MAX_LEN.
    """
    pretrained_dir = config.PRETRAINED_MODEL_DIR
    model_config = config.MODEL_CONFIG.from_pretrained(pretrained_dir)

    overrides = (
        ("output_hidden_states", True),
        ("start_n_top", config.MAX_LEN),
        ("end_n_top", config.MAX_LEN),
    )
    for attribute, value in overrides:
        setattr(model_config, attribute, value)

    return config.MODEL.from_pretrained(pretrained_dir, config=model_config)

In [18]:
def run_fold(fold):
    """
    Train and validate one cross-validation fold.

    Rows with ``kfold == fold`` form the validation set, all other rows the training
    set. The best model (lowest validation loss) is saved by EarlyStopping to
    config.MODEL_PATH / "model_{fold}.bin".

    Returns:
        The lowest validation loss reached for this fold.
    """
    dfx = pd.read_csv(config.FOLDED_TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    # NOTE(review): the training loader does not shuffle, so every epoch sees the
    # batches in the same order; confirm whether that is intended.
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    # use the GPU when there is one instead of unconditionally requiring CUDA
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # initialise model
    model = init_model(config)
    model.to(device)

    # the scheduler is stepped once per batch, so the schedule must span
    # (batches per epoch) * epochs steps, including the last partial batch
    num_train_steps = len(train_data_loader) * config.EPOCHS
    param_optimizer = list(model.named_parameters())
    # biases and LayerNorm parameters are excluded from weight decay
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    es = EarlyStopping(patience=2, verbose=True)
    print(f"Training is Starting for fold={fold}")

    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        # local names chosen so they do not shadow the module-level jaccard() function
        jaccard_score, valid_loss = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard_score}")
        print(f"Loss score = {valid_loss}")
        es(valid_loss, model, name=config.MODEL_PATH / f"model_{fold}.bin")

        if es.early_stop:
            print("Early stopping")
            break

    return es.val_loss_min

## Run training

In [19]:
def run_training(folds=range(2, 5)):
    """
    Train the given folds one after another and print the mean of their best validation losses.

    Args:
        folds: iterable of fold numbers passed to run_fold. The default (2, 3, 4)
               matches the range that was previously hard-coded here.
    """
    # makedirs also creates missing parent directories and tolerates an existing one
    os.makedirs(config.MODEL_PATH, exist_ok=True)
    val_loss = [run_fold(ifold) for ifold in folds]
    print(f'Mean val loss: {np.mean(val_loss)}')

## Predict test set

In [20]:
def predict_test():
    """
    Predict the selected text for the test set with an ensemble of all saved fold models.

    Every '.bin' checkpoint in config.MODEL_PATH is loaded; for each batch the joint
    (start, end) probabilities of all models are brought into sequence-position order
    and averaged, and the most probable pair is decoded into text. The result is written
    to "predictions_voting.csv" using the layout of the sample submission file.

    Raises:
        FileNotFoundError: if config.MODEL_PATH contains no '.bin' checkpoint.

    NOTE(review): predict_test_for_voting writes to the same file name, and
    config.SUBMISSION_FILE is not used here - confirm which path is intended.
    """
    df_test = pd.read_csv(config.TESTING_FILE)
    # the dataset needs a selected_text column; the full text serves as a placeholder
    df_test.loc[:, "selected_text"] = df_test.text.values

    models = []

    # only checkpoints are loaded (same filter as the other predict_* functions);
    # sorted() makes the load order deterministic
    for mf in sorted(os.listdir(config.MODEL_PATH)):
        if not mf.endswith('.bin'):
            continue

        m = init_model(config)
        # map_location lets checkpoints saved on a GPU load on whichever device is in use
        m.load_state_dict(torch.load(config.MODEL_PATH / mf, map_location=device))
        print(config.MODEL_PATH / mf)
        m.eval()
        # ensure we get output probabilities for all combinations of start and end position
        m.start_n_top = config.MAX_LEN
        m.end_n_top = config.MAX_LEN
        m.to(device)

        models.append(m)

    if not models:
        raise FileNotFoundError(f"No '.bin' model checkpoints found in {config.MODEL_PATH}")

    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    final_output = []

    with torch.no_grad():
        tk0 = tqdm(test_data_loader, total=len(test_data_loader))
        for d in tk0:
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            offsets = d["offsets"].numpy()

            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)

            # accumulator over models, one slot per (end, start) position pair
            summed_start_end_probs_sorted = torch.zeros(ids.shape[0], config.MAX_LEN*config.MAX_LEN).to(device)

            for model in models:
                # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )

                # start_top_probs contains the probabilities of the model.start_n_top most probable start positions
                # the documentation claims that the values are log probabilities, which seems to be incorrect given that the values are in [0-1]
                start_top_probs = outputs[0]

                # start_top_index contains the model.start_n_top highest probability starting sequence positions, in decreasing order of probability
                start_top_index = outputs[1]

                # the i-th element of start_top_index, start_top_probs are associated with elements j*model.start_n_top+i (j=1...model.end_n_top) of end_top_index, end_top_probs, where j represents the j-th highest probability end position
                # and NOT with the i*END_N_TOP+j elements as used here https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
                # this can be verified by checking summation to unity
                end_top_probs = outputs[2]
                end_top_index = outputs[3]

                # calculate joint probability of start, end position tuples
                start_end_probs = (start_top_probs.repeat(1, model.end_n_top)*end_top_probs)

                # reshape so that probabilities are ordered by sequence position rather than probability so that we can combine with output of other models
                mapping_to_flat_sequence_position = (end_top_index*torch.tensor(model.start_n_top)).add(start_top_index.repeat(1, model.end_n_top))
                _, indices = torch.sort(mapping_to_flat_sequence_position, dim=1)

                start_end_probs_sorted = start_end_probs[torch.repeat_interleave(torch.arange(start_end_probs.shape[0]), start_end_probs.shape[1]).view(start_end_probs.shape),
                          indices]

                summed_start_end_probs_sorted += start_end_probs_sorted

            avg_start_end_probs_sorted = summed_start_end_probs_sorted/torch.tensor(len(models))

            # get (flat) position in sequence of highest probability tuple
            top_avg_start_end_probs_sorted = avg_start_end_probs_sorted.argmax(dim=1)

            # convert flat position to separate start and end positions
            start_top_positions = (top_avg_start_end_probs_sorted % torch.tensor(config.MAX_LEN).to(device)).cpu().detach().numpy()
            end_top_positions = (top_avg_start_end_probs_sorted // torch.tensor(config.MAX_LEN).to(device)).cpu().detach().numpy()

            for px, tweet in enumerate(orig_tweet):
                _, output_sentence = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=orig_selected[px],
                    sentiment_val=sentiment[px],
                    idx_start=start_top_positions[px],
                    idx_end=end_top_positions[px],
                    offsets=offsets[px],
                    verbose=True
                )
                final_output.append(output_sentence)

    sample = pd.read_csv(config.SAMPLE_SUBMISSION_FILE)
    sample.loc[:, 'selected_text'] = final_output
    sample.to_csv("predictions_voting.csv", index=False)


In [21]:
def predict_test_for_voting():
    """
    Predict the selected text for the test set separately with every saved fold model.

    Each '.bin' checkpoint in config.MODEL_PATH is loaded in turn and its predictions
    are stored in a column named after the checkpoint file. The wide frame is then
    melted to long format (textID, model, selected_text), written to
    'predictions_voting.csv' and returned.

    Returns:
        pandas.DataFrame with columns 'textID', 'model' and 'selected_text'.
    """
    df_test = pd.read_csv(config.TESTING_FILE)
    # the dataset needs a selected_text column; the full text serves as a placeholder
    df_test.loc[:, "selected_text"] = df_test.text.values

    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    # explicit copy so that adding prediction columns never touches df_test
    preds_df = df_test.loc[:, ['textID']].copy()

    # sorted() makes the order of the model columns deterministic
    for mf in sorted(os.listdir(config.MODEL_PATH)):
        if not mf.endswith('.bin'):
            continue

        model = init_model(config)
        # map_location lets checkpoints saved on a GPU load on whichever device is in use
        model.load_state_dict(torch.load(config.MODEL_PATH / mf, map_location=device))
        print(config.MODEL_PATH / mf)

        model.eval()
        # ensure we get output probabilities for all combinations of start and end position
        model.start_n_top = config.MAX_LEN
        model.end_n_top = config.MAX_LEN
        model.to(device)

        final_output = []

        with torch.no_grad():

            tk0 = tqdm(test_data_loader, total=len(test_data_loader))

            for d in tk0:
                sentiment = d["sentiment"]
                orig_selected = d["orig_selected"]
                orig_tweet = d["orig_tweet"]
                offsets = d["offsets"].numpy()

                ids = d["ids"].to(device, dtype=torch.long)
                token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
                mask = d["mask"].to(device, dtype=torch.long)

                # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )

                # sorted_start_probs contains the probabilities of the model.start_n_top most probable start positions
                # the documentation claims that the values are log probabilities, which seems to be incorrect given that the values are in [0-1]
                sorted_start_probs = outputs[0]

                # sorted_start_index contains the model.start_n_top highest probability starting sequence positions, in decreasing order of probability
                sorted_start_index = outputs[1]

                # the i-th element of sorted_start_index, sorted_start_probs are associated with elements j*model.start_n_top+i (j=1...model.end_n_top) of sorted_end_index, sorted_end_probs, where j represents the j-th highest probability end position
                # and NOT with the i*END_N_TOP+j elements as used here https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
                # this can be verified by checking summation to unity
                sorted_end_probs = outputs[2]
                sorted_end_index = outputs[3]

                # calculate joint probability of start, end position tuples
                sorted_joint_probs = (sorted_start_probs.repeat(1, model.end_n_top)*sorted_end_probs)
                top_joint_index = sorted_joint_probs.argmax(dim=1)

                # convert flat position to separate start and end positions; the results are
                # moved to NumPy once per batch so the decoding loop works on plain integers
                batch_rows = torch.arange(top_joint_index.shape[0], device=top_joint_index.device)
                top_end_index = sorted_end_index[batch_rows, top_joint_index].cpu().numpy()
                top_start_index = sorted_start_index[batch_rows, top_joint_index % config.MAX_LEN].cpu().numpy()

                for px, tweet in enumerate(orig_tweet):
                    _, output_sentence = calculate_jaccard_score(
                        original_tweet=tweet,
                        target_string=orig_selected[px],
                        sentiment_val=sentiment[px],
                        idx_start=top_start_index[px],
                        idx_end=top_end_index[px],
                        offsets=offsets[px],
                        verbose=False
                    )

                    final_output.append(output_sentence)

        preds_df.loc[:, mf] = final_output

    # reshape output
    preds_df = preds_df.melt(id_vars = 'textID', var_name='model', value_name='selected_text')

    preds_df.to_csv('predictions_voting.csv', index=False)

    return preds_df


In [22]:
def predict_train(n_sample=None):
    """Predict the selected text for training tweets, fold by fold.

    Every ``model_<fold>.bin`` checkpoint found in ``config.MODEL_PATH`` is
    applied to the rows of ``config.FOLDED_TRAINING_FILE`` whose ``kfold``
    value equals ``<fold>`` (presumably the fold that checkpoint was validated
    on -- confirm against the training code). For each tweet the (start, end)
    token pair with the highest joint probability is turned into text by
    ``calculate_jaccard_score``.

    Args:
        n_sample: optional number of training rows to draw at random before
            predicting; a falsy value (the default) keeps every row.

    Returns:
        The training frame inner-merged on ``text`` with the collected
        predictions, i.e. the training columns plus ``prediction``. Because
        the join key is the tweet text, rows without a prediction are dropped
        and duplicated tweets produce duplicated rows.
    """
    df_train = pd.read_csv(config.FOLDED_TRAINING_FILE)

    if n_sample:
        df_train = df_train.sample(n_sample)

    final_output = []

    for mf in os.listdir(config.MODEL_PATH):
        if not mf.endswith('.bin'):
            continue

        # raw string avoids the invalid-escape warning; '\d+' and '\.' accept
        # multi-digit folds and only a literal dot before the extension
        fold = int(re.findall(r'model_(\d+)\.bin', mf)[0])

        # filter once, and skip empty folds *before* paying for a checkpoint load
        fold_rows = df_train[df_train.kfold == fold]
        if fold_rows.shape[0] == 0:
            continue

        model = init_model(config)
        model.load_state_dict(torch.load(config.MODEL_PATH / mf, map_location=device))
        print(config.MODEL_PATH / mf)
        model.eval()
        # ensure we get output probabilities for all combinations of start and end position
        model.start_n_top = config.MAX_LEN
        model.end_n_top = config.MAX_LEN
        model.to(device)

        train_dataset = TweetDataset(
            tweet=fold_rows.text.values,
            sentiment=fold_rows.sentiment.values,
            selected_text=fold_rows.selected_text.values
        )

        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=False,
            batch_size=8,  # config.VALID_BATCH_SIZE,
            num_workers=0
        )

        # pure inference: do not record autograd graphs (same as gen_probs_test)
        with torch.no_grad():
            tk0 = tqdm(train_data_loader, total=len(train_data_loader))
            for d in tk0:
                sentiment = d["sentiment"]
                orig_selected = d["orig_selected"]
                orig_tweet = d["orig_tweet"]
                offsets = d["offsets"]

                ids = d["ids"].to(device, dtype=torch.long)
                token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
                mask = d["mask"].to(device, dtype=torch.long)

                # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )

                # outputs[0]: probabilities of the model.start_n_top most likely start
                # positions, in decreasing order of probability. The documentation calls
                # them log probabilities, but the observed values lie in [0, 1].
                start_top_probs = outputs[0]
                # outputs[1]: the sequence positions those probabilities belong to
                start_top_index = outputs[1]

                # the i-th element of start_top_index / start_top_probs is associated with
                # elements j*model.start_n_top+i (j indexing the j-th most likely end
                # position) of end_top_index / end_top_probs, and NOT with i*END_N_TOP+j as in
                # https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
                # (this can be verified by checking summation to unity)
                end_top_probs = outputs[2]
                end_top_index = outputs[3]

                # joint probability of every (start, end) candidate pair
                start_end_probs = start_top_probs.repeat(1, model.end_n_top) * end_top_probs

                # flat sequence position (end * start_n_top + start) of every candidate pair;
                # sorting by it orders the probabilities by position rather than by rank
                flat_positions = (end_top_index * model.start_n_top).add(
                    start_top_index.repeat(1, model.end_n_top)
                )
                _, indices = torch.sort(flat_positions, dim=1)
                # row-wise reorder: result[i, j] = start_end_probs[i, indices[i, j]]
                start_end_probs_sorted = torch.gather(start_end_probs, 1, indices)

                # flat position of the most likely pair, split back into start and end
                top_flat_positions = start_end_probs_sorted.argmax(dim=1).cpu().numpy()
                start_top_positions = top_flat_positions % config.MAX_LEN
                end_top_positions = top_flat_positions // config.MAX_LEN

                for px, tweet in enumerate(orig_tweet):
                    _, output_sentence = calculate_jaccard_score(
                        original_tweet=tweet,
                        target_string=orig_selected[px],
                        sentiment_val=sentiment[px],
                        idx_start=start_top_positions[px],
                        idx_end=end_top_positions[px],
                        offsets=offsets[px]
                    )
                    final_output.append({'text': tweet, 'prediction': output_sentence})

        # release the checkpoint before the next one is loaded
        del model, train_dataset, train_data_loader
        gc.collect()

    # explicit columns keep the merge valid even when nothing was predicted
    predictions = pd.DataFrame(final_output, columns=['text', 'prediction'])
    df_train = df_train.merge(predictions, on='text', how='inner')

    return df_train


In [23]:
def gen_probs_test():
    """Write ensemble-averaged per-character start/end probabilities for the test set.

    All ``.bin`` checkpoints in ``config.MODEL_PATH`` are loaded and run on
    every tweet of ``config.TESTING_FILE``. Per model, the start probabilities
    are reordered by token position, and the end probabilities are reordered
    by token position and averaged over the start candidates. Both are then
    averaged over the models and mapped from token positions to character
    positions through the tokenizer offsets (a token's start probability goes
    to its first character, its end probability to its last character).

    Returns:
        The test frame with the added columns ``start_position_probs``,
        ``end_position_probs`` (one list per tweet, one entry per character)
        and ``orig_tweet``. The same frame is written to
        ``start_end_predictions.csv``.

    Raises:
        FileNotFoundError: if ``config.MODEL_PATH`` contains no ``.bin`` file.
        IndexError: if a token offset lies outside its tweet; the offending
            offsets and tweet are printed before the error is re-raised.
    """
    df_test = pd.read_csv(config.TESTING_FILE)

    models = []

    for mf in os.listdir(config.MODEL_PATH):
        if not mf.endswith('.bin'):
            continue
        m = init_model(config)

        m.load_state_dict(torch.load(config.MODEL_PATH / mf, map_location=device))
        print(config.MODEL_PATH / mf)
        m.eval()
        # ensure we get output probabilities for all combinations of start and end position
        m.start_n_top = config.MAX_LEN
        m.end_n_top = config.MAX_LEN
        m.to(device)

        models.append(m)

    if not models:
        # without this guard the averages below would silently become 0/0 = NaN
        raise FileNotFoundError('no .bin checkpoints found in ' + str(config.MODEL_PATH))

    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        # no labels at test time: the full tweet is passed as the selected text
        selected_text=df_test.text.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    final_output_start = []
    final_output_end = []
    final_tweets = []

    with torch.no_grad():
        tk0 = tqdm(test_data_loader, total=len(test_data_loader))
        for d in tk0:
            orig_tweet = d["orig_tweet"]
            offsets = d["offsets"].numpy().tolist()

            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)

            # running sums over the ensemble, one row per tweet, one column per token position
            summed_start_probs_sorted = torch.zeros(ids.shape[0], config.MAX_LEN).to(device)
            summed_end_probs_sorted = torch.zeros(ids.shape[0], config.MAX_LEN).to(device)

            for model in models:
                # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )

                # outputs[0]: probabilities of the model.start_n_top most likely start
                # tokens, in decreasing order of probability (for each sample). The
                # documentation calls them log probabilities, but the observed values
                # lie in [0, 1].
                start_top_probs = outputs[0]
                # outputs[1]: the token positions those probabilities belong to
                start_top_index = outputs[1]

                # reorder so that element (i, j) is the probability that token j is the
                # start token of tweet i: result[i, j] = probs[i, indices[i, j]]
                _, indices = torch.sort(start_top_index, dim=1)
                start_top_probs_sorted = torch.gather(start_top_probs, 1, indices)

                # the i-th element of start_top_index / start_top_probs is associated with
                # elements j*model.start_n_top+i (j indexing the j-th most likely end
                # token) of end_top_index / end_top_probs, and NOT with i*END_N_TOP+j as in
                # https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
                # (this can be verified by checking summation to unity)
                end_top_probs = outputs[2]
                end_top_index = outputs[3]

                # order the end probabilities by token position rather than by probability
                _, indices = torch.sort(end_top_index, dim=1)
                end_top_probs_sorted = torch.gather(end_top_probs, 1, indices)

                # view as (n_sample, end_n_top, start_n_top) and average the end
                # position probabilities across start positions
                end_top_probs_sorted = end_top_probs_sorted.view(
                    [end_top_probs_sorted.shape[0], model.end_n_top, model.start_n_top]
                ).mean(dim=2)

                summed_start_probs_sorted += start_top_probs_sorted
                summed_end_probs_sorted += end_top_probs_sorted

            avg_start_probs_sorted = (summed_start_probs_sorted / len(models)).cpu().numpy().tolist()
            avg_end_probs_sorted = (summed_end_probs_sorted / len(models)).cpu().numpy().tolist()

            # convert starting and ending token probabilities to starting and ending character probabilities
            for i, t in enumerate(orig_tweet):
                start_char_probs = [0]*len(t)
                end_char_probs = [0]*len(t)
                for j, o in enumerate(offsets[i]):
                    # [0, 0] offsets carry no tweet characters and are skipped
                    if o == [0, 0]:
                        continue
                    try:
                        start_char_probs[o[0]] = avg_start_probs_sorted[i][j]
                        end_char_probs[o[1]-1] = avg_end_probs_sorted[i][j]
                    except IndexError:
                        # dump the offending token, then re-raise the *original* error
                        print('offsets: '+str(o))
                        print('len(tweet):'+str(len(t)))
                        print('len(start_char_probs): '+str(len(start_char_probs)))
                        print('tweet: '+str(t))
                        print('segment: '+str(t[o[0]:o[1]]))
                        raise

                final_output_start.append(start_char_probs)
                final_output_end.append(end_char_probs)
            final_tweets.extend(orig_tweet)

    df_test.loc[:, 'start_position_probs'] = final_output_start
    df_test.loc[:, 'end_position_probs'] = final_output_end
    df_test.loc[:, 'orig_tweet'] = final_tweets

    df_test.to_csv('start_end_predictions.csv', index=False)

    return df_test

In [24]:
# df = gen_probs_test()

In [25]:
# for i in range(df.tail(20).shape[0]):
#     start_max = pd.Series(df.iloc[i]['start_position_probs']).idxmax()
#     end_max = pd.Series(df.iloc[i]['end_position_probs']).idxmax()
#     print(df.iloc[i]['orig_tweet'], df.iloc[i]['orig_tweet'][start_max:end_max+1])

In [26]:
# Flag a Kaggle commit run: any non-Colab session whose kernel connection
# file name does not contain 'runtime'. The kernel is only inspected when
# we are not on Colab.
IN_KAGGLE_COMMIT = (not IN_COLAB) and (
    'runtime' not in get_ipython().config.IPKernelApp.connection_file
)

print(IN_KAGGLE_COMMIT)

True


In [27]:
 %%time
 
# Entry point of the notebook: which pipeline runs depends on the environment
# flags computed earlier in the notebook.
if IN_COLAB:
    # on Colab: run the training pipeline
    run_training()

if IN_KAGGLE_COMMIT:
    # on a Kaggle commit run: inference only. The two commented-out calls are
    # the alternative inference entry points; exactly one is meant to be active.
    #predict_test()
    #gen_probs_test()
    predict_test_for_voting()

  0%|          | 0/221 [00:00<?, ?it/s]

/kaggle/input/xlnetmodel06022/model_0.bin


100%|██████████| 221/221 [00:46<00:00,  4.70it/s]
  0%|          | 0/221 [00:00<?, ?it/s]

/kaggle/input/xlnetmodel06022/model_2.bin


100%|██████████| 221/221 [00:46<00:00,  4.78it/s]
  0%|          | 0/221 [00:00<?, ?it/s]

/kaggle/input/xlnetmodel06022/model_4.bin


100%|██████████| 221/221 [00:46<00:00,  4.78it/s]
  0%|          | 0/221 [00:00<?, ?it/s]

/kaggle/input/xlnetmodel06022/model_1.bin


100%|██████████| 221/221 [00:46<00:00,  4.78it/s]
  0%|          | 0/221 [00:00<?, ?it/s]

/kaggle/input/xlnetmodel06022/model_3.bin


100%|██████████| 221/221 [00:46<00:00,  4.77it/s]

CPU times: user 2min 49s, sys: 1min 23s, total: 4min 12s
Wall time: 4min 19s



