Code taken/adapted from:
* https://www.youtube.com/watch?v=U51ranzJBpY [ TOKENISER ]
* https://www.kaggle.com/abhishek/bert-base-uncased-using-pytorch [ TRAINING ]
* https://www.kaggle.com/abhishek/roberta-inference-5-folds [ INFERENCE ] 
* https://www.kaggle.com/masterscrat/detect-if-notebook-is-running-interactively [ CHECK WHERE NOTEBOOK IS RUNNING ]
* https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/141502 [ SUBMISSION ]

Notes 
This model is based on XLNet 0605_5, with one change:
* Adds back the leading whitespace to the tweets that I had previously removed.


In [0]:
!pip install transformers
!pip install tokenizers
!pip install protobuf

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 2.8MB/s 
[?25hCollecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 13.4MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 42.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K   

In [0]:
try:
    from google.colab import drive
    IN_COLAB = True
    drive.mount('/content/drive')
    !wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py
    !wget https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model
except:
    IN_COLAB = False
    
    import sys
    sys.path.append('/kaggle/input/sentencepiece-pb2/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
--2020-06-15 13:12:23--  https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7382 (7.2K) [text/plain]
Saving to: ‘sentencepiece_pb2.py’


2020-06-15 13

## Import library

In [0]:
import os

from pathlib import Path
import numpy as np
import pandas as pd
import os
import tokenizers
import string
import torch
import transformers
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import re
import sentencepiece as spm
import sentencepiece_pb2
import gc
import html
import random
from sklearn import model_selection

In [0]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Report how many GPUs are visible and which one is device 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [0]:
class SentencePieceTokenizer:
    """Wraps a SentencePiece processor and exposes ids together with piece offsets."""

    def __init__(self, model_name):
        """Create the processor and load the model file at ``model_name``."""
        processor = spm.SentencePieceProcessor()
        processor.load(model_name)
        self.sp = processor

    def encode(self, sentence):
        """
        Encode ``sentence`` via the serialized-proto API.

        Returns a dict with 'ids' (one id per piece) and 'offsets'
        (one ``(begin, end)`` pair per piece, as reported by the proto).
        """
        proto = sentencepiece_pb2.SentencePieceText()
        proto.ParseFromString(self.sp.encode_as_serialized_proto(sentence))
        pieces = list(proto.pieces)
        return {
            'ids': [piece.id for piece in pieces],
            'offsets': [(piece.begin, piece.end) for piece in pieces],
        }

In [0]:
class config:
    """
    Notebook-wide settings: hyper-parameters, model classes, file locations,
    the shared tokenizer and the slang / emoji lookup tables.

    Every attribute is evaluated when the class body runs, so defining this
    class loads the SentencePiece model and reads both lookup CSV files.
    IN_COLAB must already be defined when this cell executes.
    """
    # Inputs are padded up to this many token ids (see process_data); init_model
    # also uses it for start_n_top / end_n_top.
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 16 # 32 #64
    VALID_BATCH_SIZE =  16
    EPOCHS = 10
    
    # Classes (not instances); init_model calls from_pretrained on both.
    MODEL_CONFIG = transformers.XLNetConfig
    MODEL = transformers.XLNetForQuestionAnswering
    if IN_COLAB:
        # Colab: everything lives under the mounted Google Drive folder.
        BASE_PATH = Path.cwd() / "drive" / "My Drive" / "kaggle" / "tweet_sentiment_extraction"     
        MODEL_PATH = BASE_PATH  / "model_save" / "model_0613_6"
        FOLDED_TRAINING_FILE = BASE_PATH / "input" / "train-5fold" / "train_folds.csv"
        TRAINING_FILE = BASE_PATH / "input" / "train.csv"
        TESTING_FILE = BASE_PATH  / "input" / "test.csv"
        SAMPLE_SUBMISSION_FILE = BASE_PATH / "input" / "sample_submission.csv"
        SUBMISSION_FILE = BASE_PATH / "input" / "submission.csv"
        SLANG_FILE = BASE_PATH / "input" / "slang_abbreviations.csv"
        EMOJIS_FILE = BASE_PATH / "input" / "emojis.csv"
    else:
        # Otherwise: Kaggle layout, inputs under /kaggle/input and outputs under /kaggle/working.
        BASE_PATH = Path('/kaggle')
        MODEL_PATH = BASE_PATH  / "input" / "xlnetmodel06136"
        FOLDED_TRAINING_FILE = BASE_PATH / "working" / "train_folds.csv"
        TRAINING_FILE = BASE_PATH  / "input" / "tweet-sentiment-extraction" / "train.csv"
        TESTING_FILE = BASE_PATH  / "input" / "tweet-sentiment-extraction" / "test.csv"
        SAMPLE_SUBMISSION_FILE = BASE_PATH / "input" / "tweet-sentiment-extraction" / "sample_submission.csv"
        SUBMISSION_FILE = BASE_PATH / "working" / "submission.csv"
        SLANG_FILE = BASE_PATH / "input" / "slang-abbreviations" / "slang_abbreviations.csv"
        EMOJIS_FILE = BASE_PATH / "input" / "slang-abbreviations" / "emojis.csv"
      
    
    # Directory holding the pretrained XLNet files, including the SentencePiece model.
    PRETRAINED_MODEL_DIR = BASE_PATH / "input" / "xlnetbasecased"
    TOKENIZER = SentencePieceTokenizer(str(PRETRAINED_MODEL_DIR / 'xlnet-base-cased-spiece.model'))
    # Both CSVs are read without a header row; each becomes a dict mapping the
    # first column to the second. normalise_tweet looks entries up by upper-cased text.
    SLANG_DICT = pd.read_csv(SLANG_FILE, header=None, names=['slang', 'normalised']).set_index('slang').to_dict()['normalised']
    EMOJI_DICT = pd.read_csv(EMOJIS_FILE, header=None, names=['emoji', 'normalised']).set_index('emoji').to_dict()['normalised']

In [0]:
# Show the pieces behind vocabulary ids 0-9; process_data hard-codes
# 3 (<cls>), 4 (<sep>) and 5 (<pad>) from this list.
[config.TOKENIZER.sp.id_to_piece(x) for x in range(0,10)]

['<unk>',
 '<s>',
 '</s>',
 '<cls>',
 '<sep>',
 '<pad>',
 '<mask>',
 '<eod>',
 '<eop>',
 '.']

In [0]:
# Look up the ids of the three sentiment words; process_data hard-codes these values.
[config.TOKENIZER.sp.piece_to_id(x) for x in ['positive', 'negative', 'neutral']]

[19036, 25976, 24734]

## Utils

In [0]:
def seed_everything(seed_value):
    """
    Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and,
    when available, all CUDA devices), and configures cuDNN for repeatable
    results.

    Args:
        seed_value (int): seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # benchmark mode lets cuDNN auto-tune and pick different algorithms from
    # run to run, which defeats `deterministic = True`; it must be off.
    # Both flags are harmless when no GPU is present.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Single seed for the whole notebook; create_train_folds() reads this global too.
seed = 24
seed_everything(seed) 

In [0]:
def create_train_folds():
    """
    Assign every training row to one of five stratified folds and save the result.

    Reads config.TRAINING_FILE, drops rows with missing values, shuffles the
    rows, adds a ``kfold`` column (0-4) stratified on ``sentiment`` and writes
    the frame to config.FOLDED_TRAINING_FILE. Prints the train/validation
    sizes of each split.
    """
    df = pd.read_csv(config.TRAINING_FILE)
    df = df.dropna().reset_index(drop=True)
    df["kfold"] = -1

    # Shuffle with an explicit seed so the fold assignment does not depend on
    # whatever state the global NumPy generator happens to be in.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # The rows are already shuffled above, so the splitter itself does not
    # shuffle. `random_state` is therefore not passed: it has no effect when
    # shuffle is False, and newer scikit-learn versions reject the combination.
    kf = model_selection.StratifiedKFold(n_splits=5)

    for fold, (trn_, val_) in enumerate(kf.split(X=df, y=df.sentiment.values)):
        print(len(trn_), len(val_))
        df.loc[val_, 'kfold'] = fold

    df.to_csv(config.FOLDED_TRAINING_FILE, index=False)

# Build the fold file now; run_fold reads it back from config.FOLDED_TRAINING_FILE.
create_train_folds()



21984 5496
21984 5496
21984 5496
21984 5496
21984 5496


In [0]:
class AverageMeter:
    """Keeps the most recent value plus a running, count-weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, mean, total and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def jaccard(str1, str2):
    """
    Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1 (str): first string.
        str2 (str): second string.

    Returns:
        float: similarity in [0, 1]. When neither string contains a word the
        two (empty) sets are identical and 1.0 is returned instead of
        dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size


class EarlyStopping:
    # https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta

    def __call__(self, val_loss, model, name):
        """
        Record one validation result.

        Saves a checkpoint on the first call and whenever the loss improves by
        at least ``delta``; otherwise increments the patience counter and sets
        ``early_stop`` once it reaches ``patience``.

        Args:
            val_loss (float): validation loss of the current epoch (lower is better).
            model: object whose ``state_dict()`` is saved on improvement.
            name: destination handed to ``torch.save``.
        """
        # Work with the negated loss so that "higher is better".
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, name)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model, name)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, name):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), name)
        self.val_loss_min = val_loss

## Data processing

In [0]:
def update_char_map(char_map, char_pos_patterns, len_repl):
    """
    Adjust a position map for matches that are replaced by ``len_repl`` characters.

    ``char_map`` is indexed by position in the original text. For every match,
    the entries after the match start are rewritten: positions still inside
    the match are pinned to the last character of the replacement, and all
    later positions are shifted by the change in length. The list is modified
    in place and also returned.
    """
    for match in char_pos_patterns:
        start, stop = match.span()
        width = stop - start
        new_tail = []
        for pos in range(start + 1, len(char_map)):
            if pos < stop:
                # still inside the matched span
                new_tail.append(char_map[start] + len_repl - 1)
            else:
                # after the span: shift by the length difference
                new_tail.append(char_map[pos] - width + len_repl)
        char_map[start + 1:] = new_tail

    return char_map


def normalise_tweet(tweet, slang_dict):#, max_len):
  """
  Clean a tweet and track how character positions move while doing so.

  Steps, in order: decode html character references, replace URLs with
  "URL", replace the mis-encoded replacement character with "'", collapse
  letters / exclamation marks repeated more than twice, then replace slang
  abbreviations and emoticons.

  Args:
    tweet: raw tweet text.
    slang_dict: not read by this function; the lookups below use
      config.SLANG_DICT and config.EMOJI_DICT directly.

  Returns:
    (clean_tweet, char_map, char_map_inverse):
      clean_tweet      - the normalised text.
      char_map         - one entry per character of `tweet`, giving that
                         character's position in clean_tweet.
      char_map_inverse - one entry per character of clean_tweet, giving a
                         position in `tweet`.
  """
  # start from the identity mapping: nothing has moved yet
  char_map = list(range(len(tweet)))
  
  # bugfix for one sample in train set. (a8048c2ff5)
  # (same length before and after, so char_map is not touched)
  clean_tweet = re.sub('&not', ' not', tweet)

  # replace html character references with the corresponding unicode character (e.g. &amp; to &)
  clean_tweet = html.unescape(clean_tweet)
  # update character position mapping to reflect named character references 
  # ignores html character references that don't end with ';' to avoid double matches
  # (matches are searched in the raw `tweet`; each is treated as collapsing to one character)
  char_pos_html_entities = []  
  [char_pos_html_entities.extend(list(re.finditer('&'+k, tweet))) for k in html.entities.html5.keys() if ((k[-1]==';') or (k=='')) and (re.search('&'+k, tweet))]
  char_map = update_char_map(char_map, char_pos_html_entities, 1)

  # update character position mapping to reflect _numerical_ character references 
  char_pos_html_numerical_entities = []  
  [char_pos_html_numerical_entities.extend(list(re.finditer('&#'+str(k), tweet))) for k in html.entities.codepoint2name.keys() if re.search('&#'+str(k), tweet)]
  char_map = update_char_map(char_map, char_pos_html_numerical_entities, 1)
  
  # bugfix for one sample in test set.
  # (same length before and after, so char_map is not touched)
  clean_tweet = re.sub('Â¡', 'Ai', clean_tweet)

  # replace URLs with "URL"
  clean_tweet = re.sub(r'(http://[^\s-]*)', 'URL', clean_tweet)
  # update character position mapping to reflect this
  # (3 = len("URL"); positions are again taken from the raw tweet)
  if re.search(r'(http://[^\s-]*)', tweet):
    char_pos_urls = re.finditer(r'(http://[^\s-]*)', tweet)
    char_map = update_char_map(char_map, char_pos_urls, 3)

  # replace unicode replacement character with "'"
  clean_tweet = re.sub(r'(ï¿½)', "'", clean_tweet)
  # update character position mapping to reflect this
  if re.search(r'(ï¿½)', tweet):
    char_pos_urls = re.finditer(r'(ï¿½)', tweet)
    char_map = update_char_map(char_map, char_pos_urls, 1)
 

  # replace letters or exclamation marks that are repeated >2 times consecutively (except "www.")
  # with a single character (e.g. sorryyyyyy -> sorry)
  # doesn't work perfectly, e.g. sleeeeeeep. -> slep.
  clean_tweet = re.sub(r'(?!www.)([a-zA-Z\!])\1{2,}', '\\1', clean_tweet, flags=re.I)
  # update character position mapping to reflect this
  # note: urls are replaced with #s so that any repetitions within URL are ignored
  char_pos_repeats = re.finditer(r'(?!www.)([a-zA-Z\!])\1{2,}', re.sub(r'(http://[^\s-]*)|(ï¿½)', lambda x: '#'*len(x.group()), tweet), flags=re.I)
  char_map = update_char_map(char_map, char_pos_repeats, 1)

  # # replace slang abbreviations with real words
  # # https://www.webopedia.com/quick_ref/textmessageabbreviations.asp
  # clean_tweet is split into alternating alphanumeric / non-alphanumeric chunks and
  # rebuilt chunk by chunk; `change` accumulates the length difference of the
  # replacements made so far in this loop.
  new_clean_tweet = ''
  change = 0
  for i,e in enumerate(re.finditer('([0-9a-zA-Z]+|[^0-9a-zA-Z]+)', clean_tweet)):
    
    # lookup is done on the upper-cased chunk
    if e.group().upper() in config.SLANG_DICT.keys():
      new_clean_tweet = new_clean_tweet + config.SLANG_DICT[e.group().upper()]

      # 1. Find position of chunk to be replaced in 'clean_tweet'
      # 2. Use that position to locate the corresponding chunk in the  char_map
      # 3. Update that chunk and everything after it in char_map using the position that that chunk will have (not its position in clean_tweet)
      len_repl = len(config.SLANG_DICT[e.group().upper()])
      ent_pos = e.span()
      
      # shift the span by the length changes of earlier replacements
      ent_pos = tuple(x+change for x in ent_pos)

      # first / last raw-tweet index whose mapped position falls inside the shifted span
      min_ix = min([j for j,x in enumerate(char_map) if x>=ent_pos[0]])
      max_ix = max([j for j,x in enumerate(char_map) if x<ent_pos[1]])

      char_map[min_ix+1:] = [ent_pos[0]+len_repl-1 if k<max_ix else char_map[k]-(ent_pos[1]-ent_pos[0])+len_repl for k in range(min_ix+1, len(char_map))]
      change += len_repl - (ent_pos[1]-ent_pos[0])
    else:
      new_clean_tweet = new_clean_tweet + e.group()

  clean_tweet = new_clean_tweet

  # # replace emoticons with real words
  # # https://en.wikipedia.org/wiki/List_of_emoticons
  # same procedure as for slang, but chunks are whitespace / non-whitespace runs
  # and the lookup table is config.EMOJI_DICT
  new_clean_tweet = ''
  change = 0
  for i,e in enumerate(re.finditer('(\s+|\S+)', clean_tweet)):
    
    if e.group().upper() in config.EMOJI_DICT.keys():
      new_clean_tweet = new_clean_tweet + config.EMOJI_DICT[e.group().upper()]

      # 1. Find position of chunk to be replaced in 'clean_tweet'
      # 2. Use that position to locate the corresponding chunk in the  char_map
      # 3. Update that chunk and everything after it in char_map using the position that that chunk will have (not its position in clean_tweet)
      len_repl = len(config.EMOJI_DICT[e.group().upper()])
      ent_pos = e.span()
      
      ent_pos = tuple(x+change for x in ent_pos)

      min_ix = min([j for j,x in enumerate(char_map) if x>=ent_pos[0]])
      max_ix = max([j for j,x in enumerate(char_map) if x<ent_pos[1]])

      char_map[min_ix+1:] = [ent_pos[0]+len_repl-1 if k<max_ix else char_map[k]-(ent_pos[1]-ent_pos[0])+len_repl for k in range(min_ix+1, len(char_map))]
      change += len_repl - (ent_pos[1]-ent_pos[0])
    else:
      new_clean_tweet = new_clean_tweet + e.group()

  clean_tweet = new_clean_tweet
  
  #char_map = [min(x, max_len-1) for x in char_map]

  # Invert the mapping: for each position i of clean_tweet take the largest raw
  # index j with char_map[j] == i. Positions no raw character maps to are filled
  # from the next mapped position (backfill), and any still missing at the end
  # fall back to the last raw index.
  char_map_inverse = (pd.Series([max([j for j,k in enumerate(char_map) if k==i], default=None) for i in range(len(clean_tweet))])
  .fillna(method='backfill')
  .fillna(len(tweet)-1)
  .astype(int)
  .values
  .tolist())

  return clean_tweet, char_map, char_map_inverse
  
  
def process_data(tweet, selected_text, sentiment, tokenizer, max_len, slang_dict):
    """
    Turn one (tweet, selected_text, sentiment) example into XLNet inputs.

    The tweet and the selected text are whitespace-normalised and given a
    leading space, the selected span is located in the raw tweet, the tweet
    is cleaned with normalise_tweet, and the span is carried over to the
    cleaned text and then to token indices.

    Args:
        tweet: raw tweet text.
        selected_text: the annotated span; must occur in the tweet.
        sentiment: 'positive', 'negative' or 'neutral'.
        tokenizer: object whose ``encode(text)`` returns a dict with 'ids'
            and 'offsets'.
        max_len: inputs shorter than this are padded up to it. Longer
            inputs are NOT truncated.
        slang_dict: forwarded to normalise_tweet.

    Returns:
        dict with the model inputs ('ids', 'mask', 'token_type_ids'), the
        target token indices ('targets_start', 'targets_end'), the cleaned
        and raw texts, the per-token 'offsets' into the cleaned tweet, and
        the two character maps serialised as strings.

    Raises:
        Whatever the failing step raises (e.g. TypeError when the selected
        text is not found in the tweet), after printing diagnostics.
    """
    raw_tweet = " " + " ".join(str(tweet).split())
    raw_selected_text = " " + " ".join(str(selected_text).split())

    # find start and end indices of the first occurrence of selected_text in tweet
    len_st = len(raw_selected_text) - 1
    raw_idx0 = None
    raw_idx1 = None

    for ind in (i for i, e in enumerate(raw_tweet) if e == raw_selected_text[1]):
        if " " + raw_tweet[ind: ind+len_st] == raw_selected_text:
            raw_idx0 = ind
            raw_idx1 = ind + len_st - 1
            break

    tweet, char_map, char_map_inverse = normalise_tweet(raw_tweet, slang_dict)

    # Bound up front so the diagnostics below can always print them; otherwise
    # a failed lookup would be masked by an UnboundLocalError in the handler.
    idx0 = None
    idx1 = None
    try:
        # translate the raw character span into cleaned-tweet positions
        idx0 = char_map[raw_idx0]
        idx1 = char_map[raw_idx1]
    except Exception:
        print('raw tweet: '+str(raw_tweet))
        print('cleaned tweet: '+str(tweet))
        print('rawidx0: '+str(raw_idx0))
        print('idx0: '+str(idx0))
        print('rawidx1: '+str(raw_idx1))
        print('char_map: '+str(char_map))
        print('len char_map: '+str(len(char_map)))
        raise

    selected_text = tweet[idx0:(idx1+1)]

    try:
        # create character mask for selected_text in tweet
        char_targets = [0] * len(tweet)
        if idx0 is not None and idx1 is not None:
            for ct in range(idx0, idx1 + 1):
                char_targets[ct] = 1
    except Exception:
        print('raw tweet: '+str(raw_tweet))
        print('cleaned tweet: '+str(tweet))
        print('char_targets: '+str(char_targets))
        print('char map: '+str(char_map))
        print(len(char_map))
        print(len(char_targets))
        print('idx0: '+str(idx0))
        print('idx1: '+str(idx1))
        raise

    tok_tweet = tokenizer.encode(tweet)

    input_ids_orig = tok_tweet['ids']
    tweet_offsets = tok_tweet['offsets']

    # every token that overlaps the selected characters is a target token
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)

    try:
        targets_start = target_idx[0]
        targets_end = target_idx[-1]
    except Exception:
        print(idx0)
        print(idx1)
        print(char_targets)
        print(tweet)
        print(selected_text)
        print(target_idx)
        raise

    # vocabulary ids of the three sentiment words in the XLNet SentencePiece model
    sentiment_id = {
        'positive': 19036,
        'negative': 25976,
        'neutral': 24734
    }

    # https://huggingface.co/transformers/model_doc/xlnet.html#transformers.XLNetTokenizer.build_inputs_with_special_tokens
    # layout: sentiment <sep> tweet tokens <sep> <cls>   (4 = <sep>, 3 = <cls>)
    input_ids = [sentiment_id[sentiment]] + [4] + input_ids_orig + [4] + [3]
    token_type_ids = [0]*2 + [1] * (len(input_ids_orig)+1) + [2]
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 2 + tweet_offsets + [(0, 0)] * 2
    # account for the two tokens placed in front of the tweet
    targets_start += 2
    targets_end += 2

    # right-pad up to max_len (5 = <pad>)
    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([5] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)

    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets,
        'raw_tweet': raw_tweet,
        'raw_selected_text': raw_selected_text,
        'char_map_inverse': "_".join([str(x) for x in char_map_inverse]),
        'char_map': str(char_map)
    }

In [0]:
# tweet = "&not gonna lie. it`s 60 degrees in here. thanks for leavin me your sweater molly. brrrrr"
# selected_text = 'thanks'

# raw_tweet = " " + " ".join(str(tweet).split()) #tweet
# raw_selected_text = " " + " ".join(str(selected_text).split()) #selected_text
# print(raw_tweet)
# print(raw_selected_text)
# print(raw_selected_text[0])

In [0]:
# tweet, char_map, char_map_inverse = normalise_tweet(raw_tweet, config.SLANG_DICT)#, max_len)
# print(tweet)

In [0]:
# tok_tweet = config.TOKENIZER.encode(tweet)

# input_ids_orig = tok_tweet['ids']
# tweet_offsets = tok_tweet['offsets']
# print(tweet)
# print([config.TOKENIZER.sp.id_to_piece(x) for x in input_ids_orig])
# print(tweet_offsets)
# print([tweet[o[0]:o[1]] for o in tweet_offsets])

In [0]:

# # find start and indices of selected_text in tweet
# len_st = len(raw_selected_text) - 1
# raw_idx0 = None
# raw_idx1 = None

# for ind in (i for i, e in enumerate(raw_tweet) if e == raw_selected_text[1]):
#   if " " + raw_tweet[ind: ind+len_st] == raw_selected_text:
#         raw_idx0 = ind
#         raw_idx1 = ind + len_st - 1
#         break
# print(raw_idx0)
# print(raw_idx1)
# print(raw_tweet[ind: ind+len_st])

In [0]:

# list(enumerate(raw_tweet))

In [0]:
# tweet = "&not gonna lie. it`s 60 degrees in here. thanks for leavin me your sweater molly. brrrrr"
# selected_text = 'thanks'
# sentiment = 'positive'
# tokenizer = config.TOKENIZER
# max_len = 128
# slang_dict = config.SLANG_DICT

# print(process_data(tweet, selected_text, sentiment, tokenizer, max_len, slang_dict)['offsets'])
# print([config.TOKENIZER.sp.id_to_piece(x) for x in process_data(tweet, selected_text, sentiment, tokenizer, max_len, slang_dict)['ids']])
# print(process_data(tweet, selected_text, sentiment, tokenizer, max_len, slang_dict)['orig_tweet'])
# print(process_data(tweet, selected_text, sentiment, tokenizer, max_len, slang_dict)['raw_tweet'])



#    char_map_inverse = [int(x) for x in char_map_inverse.split('_')]
# raw_char_idx_start = char_map_inverse[int(offsets[idx_start][0])]


## Data loader

In [0]:
class TweetDataset:
    """Indexable dataset that featurises one tweet at a time with process_data."""

    def __init__(self, tweet, sentiment, selected_text):
        """Store the three parallel sequences and the shared settings from config."""
        self.tweet, self.sentiment, self.selected_text = tweet, sentiment, selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
        self.slang_dict = config.SLANG_DICT

    def __len__(self):
        """Number of examples."""
        return len(self.tweet)

    def __getitem__(self, item):
        """Featurise example ``item``; numeric fields become long tensors, text stays as-is."""
        features = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len,
            self.slang_dict,
        )

        def as_long(key):
            return torch.tensor(features[key], dtype=torch.long)

        # Built in the same key order as the featuriser's consumers expect.
        sample = {key: as_long(key)
                  for key in ('ids', 'mask', 'token_type_ids', 'targets_start', 'targets_end')}
        for key in ('orig_tweet', 'orig_selected', 'sentiment'):
            sample[key] = features[key]
        sample['offsets'] = as_long('offsets')
        for key in ('raw_tweet', 'raw_selected_text', 'char_map_inverse', 'char_map'):
            sample[key] = features[key]
        return sample

## Loss function

In [0]:
# def loss_fn(start_logits, end_logits, start_positions, end_positions):
#     loss_fct = nn.CrossEntropyLoss()
#     start_loss = loss_fct(start_logits, start_positions)
#     end_loss = loss_fct(end_logits, end_positions)
#     total_loss = (start_loss + end_loss)
#     return total_loss

def loss_fn(start_logprobs, end_logprobs, start_positions, end_positions):
    """
    Sum of the negative log-likelihood losses for the start and end positions.

    Args:
        start_logprobs: log-probabilities over positions for the span start.
        end_logprobs: log-probabilities over positions for the span end.
        start_positions: target start index per example.
        end_positions: target end index per example.

    Returns:
        Scalar tensor: NLL(start) + NLL(end), each averaged over the batch.
    """
    loss_fct = nn.NLLLoss()
    # Use the function's own arguments; the previous body referenced
    # undefined names (`start_logits` / `end_logits`).
    start_loss = loss_fct(start_logprobs, start_positions)
    end_loss = loss_fct(end_logprobs, end_positions)
    return start_loss + end_loss
    

## Training function

In [0]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """
    Train ``model`` for one pass over ``data_loader``.

    Each batch is moved to ``device`` and fed to the model together with the
    target start/end positions; the first element of the model output is used
    as the loss. The running average loss is shown on the progress bar.
    Jaccard is not computed during training.

    Args:
        data_loader: yields dict batches with 'ids', 'token_type_ids', 'mask',
            'targets_start' and 'targets_end' tensors.
        model: the network to train.
        optimizer: stepped once per batch.
        device: device the tensors are moved to.
        scheduler: optional learning-rate scheduler, stepped once per batch
            when given.
    """
    model.train()
    losses = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))

    for d in tk0:
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets_start = d["targets_start"].to(device, dtype=torch.long)
        targets_end = d["targets_end"].to(device, dtype=torch.long)

        model.zero_grad()

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            start_positions=targets_start,
            end_positions=targets_end
        )

        loss = outputs[0]
        loss.backward()

        optimizer.step()
        # The scheduler is optional (default None), so only step it when present.
        if scheduler is not None:
            scheduler.step()

        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg)

## Evaluation function

In [0]:
def calculate_jaccard_score(
    raw_tweet, 
    target_string, 
    sentiment_val, 
    idx_start, 
    idx_end, 
    offsets,
    char_map_inverse,
    cleaned_tweet=None,
    verbose=False):
    """
    Convert predicted token indices to a span of the raw tweet and score it.

    Args:
        raw_tweet: the un-normalised tweet the answer is cut from.
        target_string: the reference selected text.
        sentiment_val: sentiment label; for "neutral" the whole tweet is used.
        idx_start: predicted start token index.
        idx_end: predicted end token index (raised to idx_start if smaller).
        offsets: per-token (start, end) character offsets into the cleaned tweet.
        char_map_inverse: "_"-joined integers mapping cleaned-tweet character
            positions to raw-tweet positions.
        cleaned_tweet: only used in diagnostics.
        verbose: when truthy, print non-neutral predictions that differ from
            the target.

    Returns:
        (jaccard score, predicted text).
    """
    char_map_inverse = [int(x) for x in char_map_inverse.split('_')]

    if idx_end < idx_start:
        idx_end = idx_start

    # token offsets refer to the cleaned tweet; map them back to raw positions
    raw_char_idx_start = char_map_inverse[int(offsets[idx_start][0])]
    try:
        raw_char_idx_end = char_map_inverse[int(offsets[idx_end][1])-1]
    except Exception:
        print('\nraw tweet: '+str(raw_tweet))
        print('cleaned tweet: '+str(cleaned_tweet))
        print('char map: '+str(char_map_inverse))
        print('index start:'+str(idx_start))
        print('index end:'+str(idx_end))
        print('offsets: '+str(offsets))
        print(len(offsets))
        print(len(char_map_inverse))
        print(offsets[idx_end])
        print(offsets[idx_end][1])
        print(int(offsets[idx_end][1]))
        # Re-raise the original error. (`raise()` would raise a TypeError
        # instead, and re-indexing char_map_inverse here would fail again.)
        raise

    filtered_output = raw_tweet[raw_char_idx_start:raw_char_idx_end+1]

    # neutral tweets and one-word tweets: predict the whole tweet
    if sentiment_val == "neutral" or len(raw_tweet.split()) < 2:
        filtered_output = raw_tweet

    if sentiment_val != "neutral" and verbose:
        if filtered_output.strip().lower() != target_string.strip().lower():
            print("********************************")
            print(f"Output= {filtered_output.strip()}")
            print(f"Target= {target_string.strip()}")
            print(f"Tweet= {raw_tweet.strip()}")
            print("********************************")

    jac = jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output


def eval_fn(data_loader, model, device):
    """
    Evaluate ``model`` on ``data_loader`` without gradient tracking.

    For every batch the model is called twice: once with the target positions
    (first output used as the loss) and once without them to obtain the top
    start/end candidates, from which the most probable (start, end) pair is
    picked and scored against the raw selected text.

    Decoding the flat position with config.MAX_LEN relies on
    model.start_n_top being equal to config.MAX_LEN, as init_model sets it.

    Returns:
        (average Jaccard, average loss) over the data loader.
    """
    model.eval()
    losses = AverageMeter()
    jaccards = AverageMeter()
    
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"]
            raw_tweet = d["raw_tweet"]
            raw_selected = d["raw_selected_text"]
            char_map_inverse = d["char_map_inverse"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)
            #char_map_inverse = char_map_inverse.to(device, dtype=torch.long)

            # first pass: with targets, the first output is used as the loss
            outputs = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids,
                start_positions=targets_start,
                end_positions=targets_end
            )
            loss = outputs[0]
            
            # run it again to get the probabilities
            # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
            outputs = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids
            )
            # start_top_probs contains the probabilities of the model.start_n_top highest probability starting positions, in decreasing order of probability
            # the documentation claims that the values are log probabilities, which seems to be incorrect given that the values are in [0-1]
            start_top_probs = outputs[0]  
          
            # start_top_index contains the sequence positions those probabilities belong to
            start_top_index = outputs[1] 

            # the i-th element of start_top_index, start_top_probs are associated with elements j*model.start_n_top+i (j=1...model.end_n_top) of end_top_index, end_top_probs, where j represents the j-th highest probability end position  
            # and NOT with the i*END_N_TOP+j elements as used here https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
            # this can be verified by checking summation to unity
            end_top_probs = outputs[2] 
            end_top_index = outputs[3] 
            
            # calculate joint probability of start, end position tuples
            start_end_probs = (start_top_probs.repeat(1, model.end_n_top)*end_top_probs)

            # reshape so that probabilities are ordered by sequence position rather than probability so that we can combine with output of other models
            # flat position = end_position * start_n_top + start_position
            mapping_to_flat_sequence_position = (end_top_index*torch.tensor(model.start_n_top)).add(start_top_index.repeat(1, model.end_n_top))
            _, indices = torch.sort(mapping_to_flat_sequence_position, dim=1)

            # gather each row's probabilities in flat-position order (row index paired with the sort order)
            start_end_probs_sorted = start_end_probs[torch.repeat_interleave(torch.arange(start_end_probs.shape[0]), start_end_probs.shape[1]).view(start_end_probs.shape),
                      indices]

            # get (flat) position in sequence of highest probability tuple
            top_start_end_probs_sorted = start_end_probs_sorted.argmax(dim=1)

            # convert flat position to separate start and end positions
            start_top_positions = (top_start_end_probs_sorted % torch.tensor(config.MAX_LEN)).cpu().detach().numpy()
            end_top_positions = (top_start_end_probs_sorted // torch.tensor(config.MAX_LEN)).cpu().detach().numpy()
            
            # score every example of the batch against its raw selected text
            jaccard_scores = []
            for px, tweet in enumerate(raw_tweet):
                tweet_raw_selected_text = raw_selected[px]
                tweet_sentiment = sentiment[px]
                tweet_offsets = offsets[px]
                tweet_char_map_inverse = char_map_inverse[px]

                start_top_position = start_top_positions[px]
                end_top_position = end_top_positions[px]
                
                cleaned_tweet = orig_tweet[px]
                
                jaccard_score, _ = calculate_jaccard_score(
                    raw_tweet=tweet,
                    target_string=tweet_raw_selected_text,
                    sentiment_val=tweet_sentiment,
                    idx_start=start_top_position,
                    idx_end=end_top_position,
                    offsets=tweet_offsets,
                    char_map_inverse=tweet_char_map_inverse,
                    cleaned_tweet=cleaned_tweet
                )
                jaccard_scores.append(jaccard_score)

            # both meters are weighted by the batch size
            jaccards.update(np.mean(jaccard_scores), ids.size(0))
            losses.update(loss.item(), ids.size(0))
            tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
    
    print(f"Jaccard = {jaccards.avg}")
    print(f"Loss = {losses.avg}")
    return jaccards.avg, losses.avg

## Training 

In [0]:
def init_model(config):
    """Instantiate the pretrained model described by ``config``.

    The model configuration is loaded from ``config.PRETRAINED_MODEL_DIR`` via
    ``config.MODEL_CONFIG.from_pretrained``. Hidden-state output is switched on
    and both ``start_n_top`` and ``end_n_top`` are set to ``config.MAX_LEN``
    before the weights are loaded with ``config.MODEL.from_pretrained``.

    Args:
        config: object exposing ``MODEL_CONFIG``, ``MODEL``,
            ``PRETRAINED_MODEL_DIR`` and ``MAX_LEN``.

    Returns:
        Whatever ``config.MODEL.from_pretrained`` returns.
    """
    config_cls = config.MODEL_CONFIG
    pretrained_dir = config.PRETRAINED_MODEL_DIR

    model_config = config_cls.from_pretrained(pretrained_dir)
    model_config.output_hidden_states = True
    # request as many start/end candidates as there are sequence positions
    model_config.start_n_top = model_config.end_n_top = config.MAX_LEN

    return config.MODEL.from_pretrained(pretrained_dir, config=model_config)

In [0]:
def run_fold(fold):
    """Train a fresh model with ``fold`` held out for validation.

    Rows of ``config.FOLDED_TRAINING_FILE`` whose ``kfold`` differs from
    ``fold`` are used for training, rows whose ``kfold`` equals ``fold`` for
    validation. After every epoch the validation loss is passed to an
    ``EarlyStopping`` instance together with the path
    ``config.MODEL_PATH / f"model_{fold}.bin"``.

    Args:
        fold: value of the ``kfold`` column to hold out.

    Returns:
        ``val_loss_min`` of the ``EarlyStopping`` instance.
    """
    folds_df = pd.read_csv(config.FOLDED_TRAINING_FILE)
    df_train = folds_df[folds_df.kfold != fold].reset_index(drop=True)
    df_valid = folds_df[folds_df.kfold == fold].reset_index(drop=True)

    # NOTE(review): no `shuffle` argument is given, so batches are drawn in
    # DataLoader's default order every epoch - confirm this is intended.
    train_data_loader = torch.utils.data.DataLoader(
        TweetDataset(
            tweet=df_train.text.values,
            sentiment=df_train.sentiment.values,
            selected_text=df_train.selected_text.values,
        ),
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        TweetDataset(
            tweet=df_valid.text.values,
            sentiment=df_valid.sentiment.values,
            selected_text=df_valid.selected_text.values,
        ),
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2,
    )

    device = torch.device("cuda")

    model = init_model(config)
    model.to(device)

    # total scheduler steps; presumably train_fn steps the scheduler once per
    # batch - TODO confirm against train_fn
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    # parameters whose name contains one of these tags get no weight decay
    no_decay_tags = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    decayed, undecayed = [], []
    for param_name, param in model.named_parameters():
        if any(tag in param_name for tag in no_decay_tags):
            undecayed.append(param)
        else:
            decayed.append(param)

    optimizer = AdamW(
        [
            {'params': decayed, 'weight_decay': 0.001},
            {'params': undecayed, 'weight_decay': 0.0},
        ],
        lr=3e-5,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    es = EarlyStopping(patience=2, verbose=True)
    print(f"Training is Starting for fold={fold}")

    for _ in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard, loss = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        print(f"Loss score = {loss}")

        # early stopping is driven by the validation loss, not the Jaccard score
        es(loss, model, name=config.MODEL_PATH / f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break

    return es.val_loss_min

In [0]:
def run_val_fold(fold):
    """Evaluate the saved model of ``fold`` on that fold's held-out rows.

    Loads ``config.MODEL_PATH / "model_<fold>.bin"`` into a model built by
    ``init_model`` and runs ``eval_fn`` over the rows of
    ``config.FOLDED_TRAINING_FILE`` whose ``kfold`` equals ``fold``.

    Args:
        fold: value of the ``kfold`` column identifying both the validation
            rows and the checkpoint file.

    Returns:
        The Jaccard value reported by ``eval_fn`` (its first return value).
    """
    dfx = pd.read_csv(config.FOLDED_TRAINING_FILE)

    # only the held-out rows are needed for evaluation
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda")

    print(f"Evaluating fold={fold}")

    # initialise model and restore the weights saved for this fold
    model = init_model(config)
    model_filename = 'model_'+str(fold)+'.bin'
    model.load_state_dict(torch.load(config.MODEL_PATH / model_filename, map_location=device))
    model.to(device)

    # eval_fn returns (jaccard, loss); only the Jaccard value is reported here
    jaccard, _ = eval_fn(valid_data_loader, model, device)

    return jaccard

## Run training

In [0]:
def run_training(folds=(0,)):
    """Train one model per requested fold and print the mean of their losses.

    Makes sure ``config.MODEL_PATH`` exists, calls ``run_fold`` for every entry
    of ``folds`` and prints the mean of the values it returns.

    Args:
        folds: iterable of fold indices handed to ``run_fold``. Defaults to
            ``(0,)``, i.e. only fold 0 is trained.
    """
    # makedirs also creates missing parent directories and tolerates an
    # already existing target, unlike a guarded os.mkdir
    os.makedirs(config.MODEL_PATH, exist_ok=True)

    val_loss = [run_fold(ifold) for ifold in folds]
    print(f'Mean val loss: {np.mean(val_loss)}')

In [0]:
def get_cv_loss():
    """Evaluate the saved models of folds 0-4 and report their mean Jaccard score.

    Despite the function's name, the per-fold values come from
    ``run_val_fold``, which returns a Jaccard score rather than a loss; the
    printed label says so.

    Returns:
        The mean of the five values returned by ``run_val_fold``.
    """
    val_jaccard = [run_val_fold(ifold) for ifold in range(5)]
    mean_jaccard = np.mean(val_jaccard)
    print(f'Mean val jaccard: {mean_jaccard}')
    return mean_jaccard

## Predict test set

In [0]:
def predict_test():
    """Write ``submission.csv`` from the averaged predictions of all saved models.

    Every ``.bin`` file found in ``config.MODEL_PATH`` is loaded into a model
    built by ``init_model``. For each test batch the joint (start, end)
    probabilities of every model are re-ordered by flat sequence position,
    averaged over the models, and the arg-max pair is turned into text by
    ``calculate_jaccard_score``. The resulting strings replace the
    ``selected_text`` column of ``config.SAMPLE_SUBMISSION_FILE``.

    Uses the module-level ``device``.

    Raises:
        FileNotFoundError: if ``config.MODEL_PATH`` contains no ``.bin`` file.

    NOTE(review): a later cell defines ``predict_test`` a second time (writing
    ``predictions_voting.csv``); when the cells run in order, that definition
    replaces this one.
    """
    df_test = pd.read_csv(config.TESTING_FILE)
    # the tweet text is used as selected_text so TweetDataset receives a value for it
    df_test.loc[:, "selected_text"] = df_test.text.values

    models = []
    for mf in os.listdir(config.MODEL_PATH):
        # skip anything in the directory that is not a saved checkpoint
        if not mf.endswith('.bin'):
            continue
        m = init_model(config)

        m.load_state_dict(torch.load(config.MODEL_PATH / mf, map_location=device))
        print(config.MODEL_PATH / mf)
        m.eval()
        # ensure we get output probabilities for all combinations of start and end position
        m.start_n_top = config.MAX_LEN
        m.end_n_top = config.MAX_LEN
        m.to(device)

        models.append(m)

    if not models:
        # averaging over zero models would divide by zero and yield meaningless predictions
        raise FileNotFoundError(f"no .bin model files found in {config.MODEL_PATH}")

    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    final_output = []

    with torch.no_grad():
        tk0 = tqdm(test_data_loader, total=len(test_data_loader))
        for d in tk0:
            sentiment = d["sentiment"]
            orig_tweet = d["orig_tweet"]
            offsets = d["offsets"].numpy()
            raw_tweet = d["raw_tweet"]
            raw_selected = d["raw_selected_text"]
            char_map_inverse = d["char_map_inverse"]

            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)

            # accumulator over models, one column per flat (end, start) position
            summed_start_end_probs_sorted = torch.zeros(ids.shape[0], config.MAX_LEN*config.MAX_LEN).to(device)

            for model in models:
                # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )

                # start_top_probs holds the probabilities of the model.start_n_top
                # highest-probability start positions, in decreasing order of probability.
                # The documentation calls these log probabilities, which seems to be
                # incorrect given that the observed values are in [0, 1].
                start_top_probs = outputs[0]

                # start_top_index holds the sequence positions those probabilities belong to
                start_top_index = outputs[1]

                # the i-th element of start_top_index, start_top_probs is associated with elements
                # j*model.start_n_top+i (j=1...model.end_n_top) of end_top_index, end_top_probs,
                # where j represents the j-th highest probability end position,
                # and NOT with the i*END_N_TOP+j elements as used here
                # https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
                # this can be verified by checking summation to unity
                end_top_probs = outputs[2]
                end_top_index = outputs[3]

                # calculate joint probability of start, end position tuples
                start_end_probs = (start_top_probs.repeat(1, model.end_n_top)*end_top_probs)

                # re-order so that probabilities are ordered by sequence position rather than
                # by probability, so that they can be combined with the output of other models
                mapping_to_flat_sequence_position = (end_top_index*torch.tensor(model.start_n_top)).add(start_top_index.repeat(1, model.end_n_top))
                _, indices = torch.sort(mapping_to_flat_sequence_position, dim=1)

                # row index tensor shaped like start_end_probs, paired with the per-row sort order
                start_end_probs_sorted = start_end_probs[torch.repeat_interleave(torch.arange(start_end_probs.shape[0]), start_end_probs.shape[1]).view(start_end_probs.shape),
                          indices]

                summed_start_end_probs_sorted += start_end_probs_sorted

            avg_start_end_probs_sorted = summed_start_end_probs_sorted/torch.tensor(len(models))

            # get (flat) position in sequence of highest probability tuple
            top_avg_start_end_probs_sorted = avg_start_end_probs_sorted.argmax(dim=1)

            # convert flat position (end * MAX_LEN + start) to separate start and end positions
            start_top_positions = (top_avg_start_end_probs_sorted % torch.tensor(config.MAX_LEN).to(device)).cpu().detach().numpy()
            end_top_positions = (top_avg_start_end_probs_sorted // torch.tensor(config.MAX_LEN).to(device)).cpu().detach().numpy()

            for px, tweet in enumerate(raw_tweet):
                # only the decoded text is needed; the score is discarded
                _, output_sentence = calculate_jaccard_score(
                    raw_tweet=tweet,
                    target_string=raw_selected[px],
                    sentiment_val=sentiment[px],
                    idx_start=start_top_positions[px],
                    idx_end=end_top_positions[px],
                    offsets=offsets[px],
                    char_map_inverse=char_map_inverse[px],
                    cleaned_tweet=orig_tweet[px],
                    verbose=True
                )
                final_output.append(output_sentence)

    sample = pd.read_csv(config.SAMPLE_SUBMISSION_FILE)
    sample.loc[:, 'selected_text'] = final_output
    sample.to_csv("submission.csv", index=False)


In [0]:
def predict_test():
    """Write ``predictions_voting.csv`` from the averaged predictions of all saved models.

    Every ``.bin`` file found in ``config.MODEL_PATH`` is loaded into a model
    built by ``init_model``. For each test batch the joint (start, end)
    probabilities of every model are re-ordered by flat sequence position,
    averaged over the models, and the arg-max pair is turned into text by
    ``calculate_jaccard_score``. The resulting strings replace the
    ``selected_text`` column of ``config.SAMPLE_SUBMISSION_FILE``.

    Uses the module-level ``device``.

    Raises:
        FileNotFoundError: if ``config.MODEL_PATH`` contains no ``.bin`` file.

    NOTE(review): this is the second definition of ``predict_test`` in the
    notebook and replaces the earlier one (which writes ``submission.csv``)
    when the cells run in order. It also writes to the same file name as
    ``predict_test_for_voting`` - confirm which output is intended.
    """
    df_test = pd.read_csv(config.TESTING_FILE)
    # the tweet text is used as selected_text so TweetDataset receives a value for it
    df_test.loc[:, "selected_text"] = df_test.text.values

    models = []
    for mf in os.listdir(config.MODEL_PATH):
        # skip anything in the directory that is not a saved checkpoint
        if not mf.endswith('.bin'):
            continue
        m = init_model(config)

        m.load_state_dict(torch.load(config.MODEL_PATH / mf, map_location=device))
        print(config.MODEL_PATH / mf)
        m.eval()
        # ensure we get output probabilities for all combinations of start and end position
        m.start_n_top = config.MAX_LEN
        m.end_n_top = config.MAX_LEN
        m.to(device)

        models.append(m)

    if not models:
        # averaging over zero models would divide by zero and yield meaningless predictions
        raise FileNotFoundError(f"no .bin model files found in {config.MODEL_PATH}")

    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    final_output = []

    with torch.no_grad():
        tk0 = tqdm(test_data_loader, total=len(test_data_loader))
        for d in tk0:
            sentiment = d["sentiment"]
            orig_tweet = d["orig_tweet"]
            offsets = d["offsets"].numpy()
            raw_tweet = d["raw_tweet"]
            raw_selected = d["raw_selected_text"]
            char_map_inverse = d["char_map_inverse"]

            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)

            # accumulator over models, one column per flat (end, start) position
            summed_start_end_probs_sorted = torch.zeros(ids.shape[0], config.MAX_LEN*config.MAX_LEN).to(device)

            for model in models:
                # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )

                # start_top_probs holds the probabilities of the model.start_n_top
                # highest-probability start positions, in decreasing order of probability.
                # The documentation calls these log probabilities, which seems to be
                # incorrect given that the observed values are in [0, 1].
                start_top_probs = outputs[0]

                # start_top_index holds the sequence positions those probabilities belong to
                start_top_index = outputs[1]

                # the i-th element of start_top_index, start_top_probs is associated with elements
                # j*model.start_n_top+i (j=1...model.end_n_top) of end_top_index, end_top_probs,
                # where j represents the j-th highest probability end position,
                # and NOT with the i*END_N_TOP+j elements as used here
                # https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
                # this can be verified by checking summation to unity
                end_top_probs = outputs[2]
                end_top_index = outputs[3]

                # calculate joint probability of start, end position tuples
                start_end_probs = (start_top_probs.repeat(1, model.end_n_top)*end_top_probs)

                # re-order so that probabilities are ordered by sequence position rather than
                # by probability, so that they can be combined with the output of other models
                mapping_to_flat_sequence_position = (end_top_index*torch.tensor(model.start_n_top)).add(start_top_index.repeat(1, model.end_n_top))
                _, indices = torch.sort(mapping_to_flat_sequence_position, dim=1)

                # row index tensor shaped like start_end_probs, paired with the per-row sort order
                start_end_probs_sorted = start_end_probs[torch.repeat_interleave(torch.arange(start_end_probs.shape[0]), start_end_probs.shape[1]).view(start_end_probs.shape),
                          indices]

                summed_start_end_probs_sorted += start_end_probs_sorted

            avg_start_end_probs_sorted = summed_start_end_probs_sorted/torch.tensor(len(models))

            # get (flat) position in sequence of highest probability tuple
            top_avg_start_end_probs_sorted = avg_start_end_probs_sorted.argmax(dim=1)

            # convert flat position (end * MAX_LEN + start) to separate start and end positions
            start_top_positions = (top_avg_start_end_probs_sorted % torch.tensor(config.MAX_LEN).to(device)).cpu().detach().numpy()
            end_top_positions = (top_avg_start_end_probs_sorted // torch.tensor(config.MAX_LEN).to(device)).cpu().detach().numpy()

            for px, tweet in enumerate(raw_tweet):
                # only the decoded text is needed; the score is discarded
                _, output_sentence = calculate_jaccard_score(
                    raw_tweet=tweet,
                    target_string=raw_selected[px],
                    sentiment_val=sentiment[px],
                    idx_start=start_top_positions[px],
                    idx_end=end_top_positions[px],
                    offsets=offsets[px],
                    char_map_inverse=char_map_inverse[px],
                    cleaned_tweet=orig_tweet[px],
                    verbose=True
                )
                final_output.append(output_sentence)

    sample = pd.read_csv(config.SAMPLE_SUBMISSION_FILE)
    sample.loc[:, 'selected_text'] = final_output
    sample.to_csv("predictions_voting.csv", index=False)


In [0]:
def gen_probs_test():
    """Write per-character start/end probabilities for every test tweet.

    Every ``.bin`` file in ``config.MODEL_PATH`` is loaded into a model built
    by ``init_model``. For each batch the start and end probabilities of all
    models are re-ordered by sequence position and averaged over the models.
    The token-level values are then spread onto character positions of the
    raw tweet using the token offsets and ``char_map_inverse``.

    The result is saved to ``start_end_predictions.csv`` and returned.
    Uses the module-level ``device``.

    Returns:
        ``df_test`` with the added columns ``start_position_probs``,
        ``end_position_probs`` and ``orig_tweet`` (the latter is filled from
        the batch field ``raw_tweet``).

    Raises:
        FileNotFoundError: if ``config.MODEL_PATH`` contains no ``.bin`` file.
    """
    df_test = pd.read_csv(config.TESTING_FILE)

    models = []
    for mf in os.listdir(config.MODEL_PATH):
        if not mf.endswith('.bin'):
            continue
        m = init_model(config)

        m.load_state_dict(torch.load(config.MODEL_PATH / mf, map_location=device))
        print(config.MODEL_PATH / mf)
        m.eval()
        # ensure we get output probabilities for all combinations of start and end position
        m.start_n_top = config.MAX_LEN
        m.end_n_top = config.MAX_LEN
        m.to(device)

        models.append(m)

    if not models:
        # averaging over zero models would divide by zero and yield NaN probabilities
        raise FileNotFoundError(f"no .bin model files found in {config.MODEL_PATH}")

    # the tweet text doubles as selected_text so TweetDataset receives a value for it
    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.text.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    final_output_start = []
    final_output_end = []
    final_tweets = []

    with torch.no_grad():
        tk0 = tqdm(test_data_loader, total=len(test_data_loader))
        for d in tk0:
            orig_tweet = d["orig_tweet"]
            char_map_inverse = d["char_map_inverse"]
            # plain nested lists so that an offset pair can be compared with [0, 0]
            offsets = d["offsets"].numpy().tolist()
            raw_tweet = d["raw_tweet"]

            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)

            summed_start_probs_sorted = torch.zeros(ids.shape[0], config.MAX_LEN).to(device)
            summed_end_probs_sorted = torch.zeros(ids.shape[0], config.MAX_LEN).to(device)

            for model in models:
                # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )

                # start_top_probs holds the probabilities of the model.start_n_top
                # highest-probability start positions, in decreasing order of probability
                # (for each sample). The documentation calls these log probabilities, which
                # seems to be incorrect given that the observed values are in [0, 1].
                start_top_probs = outputs[0]

                # start_top_index holds the sequence positions those probabilities belong to
                start_top_index = outputs[1]

                # sort start_top_probs so that element (i,j) represents the probability
                # for tweet i of position j being the start position
                _, indices = torch.sort(start_top_index, dim=1)
                start_top_probs_sorted = start_top_probs[torch.repeat_interleave(torch.arange(start_top_probs.shape[0]), start_top_probs.shape[1]).view(start_top_probs.shape),
                          indices]

                # the i-th element of start_top_index, start_top_probs is associated with elements
                # j*model.start_n_top+i (j=1...model.end_n_top) of end_top_index, end_top_probs,
                # where j represents the j-th highest probability end position,
                # and NOT with the i*END_N_TOP+j elements as used here
                # https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
                # this can be verified by checking summation to unity
                end_top_probs = outputs[2]
                end_top_index = outputs[3]

                # sort end_top_probs by position of element (rather than by its probability)
                _, indices = torch.sort(end_top_index, dim=1)
                end_top_probs_sorted = end_top_probs[torch.repeat_interleave(torch.arange(end_top_probs.shape[0]), end_top_probs.shape[1]).view(end_top_probs.shape),
                          indices]

                # reshape to (n_sample, end_n_top, start_n_top) and average the
                # end position probabilities across start positions
                end_top_probs_sorted = end_top_probs_sorted.view([end_top_probs_sorted.shape[0], model.end_n_top, model.start_n_top]).mean(dim=2)

                summed_start_probs_sorted += start_top_probs_sorted
                summed_end_probs_sorted += end_top_probs_sorted

            avg_start_probs_sorted = (summed_start_probs_sorted/torch.tensor(len(models))).cpu().detach().numpy()
            avg_end_probs_sorted = (summed_end_probs_sorted/torch.tensor(len(models))).cpu().detach().numpy()

            # convert starting and ending token probabilities to starting and ending character probabilities
            for i, t in enumerate(raw_tweet):
                start_char_probs = [0]*len(t)
                end_char_probs = [0]*len(t)
                # char_map_inverse arrives as an underscore-separated string of integers
                inverse_map = [int(x) for x in char_map_inverse[i].split('_')]
                for j, o in enumerate(offsets[i]):
                    # an offset pair of [0, 0] carries no character span and is skipped
                    if o == [0, 0]:
                        continue
                    try:
                        # token start goes to its first character, token end to its last
                        start_char_probs[inverse_map[o[0]]] = avg_start_probs_sorted[i][j]
                        end_char_probs[inverse_map[o[1]-1]] = avg_end_probs_sorted[i][j]
                    except Exception:
                        # dump the state that led to the failure, then re-raise the original error
                        print('offsets: '+str(o))
                        print('len(tweet):'+str(len(t)))
                        print('len(start_char_probs): '+str(len(start_char_probs)))
                        print('tweet: '+str(t))
                        print('len(inverse_map): '+str(len(inverse_map)))
                        print('segment: '+str(orig_tweet[i][o[0]:o[1]]))
                        # guarded so the diagnostics cannot raise a second IndexError
                        if o[1]-1 < len(inverse_map):
                            print(inverse_map[o[1]-1])
                        print(avg_end_probs_sorted[i][j])
                        raise

                final_output_start.append(start_char_probs)
                final_output_end.append(end_char_probs)
            final_tweets.extend(raw_tweet)

    df_test.loc[:, 'start_position_probs'] = final_output_start
    df_test.loc[:, 'end_position_probs'] = final_output_end
    df_test.loc[:, 'orig_tweet'] = final_tweets
    df_test.to_csv("start_end_predictions.csv", index=False)

    return df_test

In [0]:
def predict_test_for_voting():
    """Predict the test set separately with every saved model, for later voting.

    Each ``.bin`` file in ``config.MODEL_PATH`` is loaded in turn. For every
    test tweet the (start, end) candidate pair with the highest joint
    probability is decoded to text by ``calculate_jaccard_score``. The
    per-model predictions are collected as one column per checkpoint file and
    then melted to long format.

    The result is saved to ``predictions_voting.csv`` and returned.
    Uses the module-level ``device``.

    Returns:
        DataFrame with the columns ``textID``, ``model`` (checkpoint file
        name) and ``selected_text``.
    """
    df_test = pd.read_csv(config.TESTING_FILE)
    # the tweet text is used as selected_text so TweetDataset receives a value for it
    df_test.loc[:, "selected_text"] = df_test.text.values

    test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    # explicit copy so that adding prediction columns never touches df_test
    preds_df = df_test.loc[:, ['textID']].copy()

    for mf in os.listdir(config.MODEL_PATH):
        if not mf.endswith('.bin'):
            continue

        model = init_model(config)
        # map_location lets a checkpoint saved on another device be loaded onto `device`
        model.load_state_dict(torch.load(config.MODEL_PATH / mf, map_location=device))
        print(config.MODEL_PATH / mf)

        model.eval()
        # ensure we get output probabilities for all combinations of start and end position
        model.start_n_top = config.MAX_LEN
        model.end_n_top = config.MAX_LEN
        model.to(device)

        final_output = []

        with torch.no_grad():

            tk0 = tqdm(test_data_loader, total=len(test_data_loader))

            for d in tk0:
                sentiment = d["sentiment"]
                orig_tweet = d["orig_tweet"]
                offsets = d["offsets"].numpy()
                raw_tweet = d["raw_tweet"]
                raw_selected = d["raw_selected_text"]
                char_map_inverse = d["char_map_inverse"]

                ids = d["ids"].to(device, dtype=torch.long)
                token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
                mask = d["mask"].to(device, dtype=torch.long)

                # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )

                # sorted_start_probs holds the probabilities of the model.start_n_top
                # highest-probability start positions, in decreasing order of probability.
                # The documentation calls these log probabilities, which seems to be
                # incorrect given that the observed values are in [0, 1].
                sorted_start_probs = outputs[0]

                # sorted_start_index holds the sequence positions those probabilities belong to
                sorted_start_index = outputs[1]

                # the i-th element of the start outputs is associated with elements
                # j*model.start_n_top+i (j=1...model.end_n_top) of the end outputs,
                # where j represents the j-th highest probability end position,
                # and NOT with the i*END_N_TOP+j elements as used here
                # https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
                # this can be verified by checking summation to unity
                sorted_end_probs = outputs[2]
                sorted_end_index = outputs[3]

                # calculate joint probability of start, end position tuples
                sorted_joint_probs = (sorted_start_probs.repeat(1, model.end_n_top)*sorted_end_probs)
                top_joint_index = sorted_joint_probs.argmax(dim=1)

                # look up the sequence positions of the winning candidate pair; the start
                # candidate is the flat index modulo MAX_LEN (start_n_top was set to MAX_LEN).
                # Converted to numpy once per batch so calculate_jaccard_score receives
                # numpy integers rather than 0-dim device tensors.
                top_end_index = sorted_end_index[torch.arange(sorted_end_index.shape[0]), top_joint_index].cpu().detach().numpy()
                top_start_index = sorted_start_index[torch.arange(sorted_start_index.shape[0]), top_joint_index % torch.tensor(config.MAX_LEN).to(device)].cpu().detach().numpy()

                for px, tweet in enumerate(raw_tweet):
                    # only the decoded text is needed; the score is discarded
                    _, output_sentence = calculate_jaccard_score(
                        raw_tweet=tweet,
                        target_string=raw_selected[px],
                        sentiment_val=sentiment[px],
                        idx_start=top_start_index[px],
                        idx_end=top_end_index[px],
                        offsets=offsets[px],
                        char_map_inverse=char_map_inverse[px],
                        cleaned_tweet=orig_tweet[px]
                    )

                    final_output.append(output_sentence)

        # one column of predictions per checkpoint file
        preds_df.loc[:, mf] = final_output

    # reshape output: one row per (textID, model) pair
    preds_df = preds_df.melt(id_vars = 'textID', var_name='model', value_name='selected_text')

    preds_df.to_csv('predictions_voting.csv', index=False)

    return preds_df


In [0]:
def predict_train(n_sample=None):
    """Generate out-of-fold predictions for the training tweets.

    For every checkpoint named ``model_<fold>.bin`` in ``config.MODEL_PATH`` the
    rows of the training file whose ``kfold`` equals ``<fold>`` are run through
    that checkpoint, and the highest joint-probability (start, end) span is
    converted to text with ``calculate_jaccard_score``.

    Args:
        n_sample: optional number of training rows to sample before predicting
            (``None``/0 uses the whole training file).

    Returns:
        The (possibly sampled) training DataFrame left-merged on ``text`` with a
        ``prediction`` column. Rows whose fold has no checkpoint get NaN.
    """
    df_train = pd.read_csv(config.TRAINING_FILE)

    if n_sample:
        df_train = df_train.sample(n_sample)

    final_output = []

    # sorted() only makes the processing/log order reproducible
    for mf in sorted(os.listdir(config.MODEL_PATH)):
        # Only "model_<fold>.bin" files are checkpoints; other files living in the
        # same directory (e.g. a saved predictions CSV) must be skipped.
        fold_match = re.search(r'model_(\d+)\.bin', mf)
        if fold_match is None:
            continue
        fold = int(fold_match.group(1))

        # Check for data *before* building the model so an empty fold costs nothing.
        df_fold = df_train[df_train.kfold == fold]
        if df_fold.shape[0] == 0:
            continue

        model = init_model(config)
        model.load_state_dict(torch.load(config.MODEL_PATH / mf, map_location=device))
        print(config.MODEL_PATH / mf)
        model.eval()
        # ensure we get output probabilities for all combinations of start and end position
        model.start_n_top = config.MAX_LEN
        model.end_n_top = config.MAX_LEN
        model.to(device)

        train_dataset = TweetDataset(
            tweet=df_fold.text.values,
            sentiment=df_fold.sentiment.values,
            selected_text=df_fold.selected_text.values
        )

        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            shuffle=False,
            batch_size=16,  # config.VALID_BATCH_SIZE,
            num_workers=1
        )

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            for d in tqdm(train_data_loader, total=len(train_data_loader)):
                sentiment = d["sentiment"]
                orig_tweet = d["orig_tweet"]
                offsets = d["offsets"]
                raw_tweet = d["raw_tweet"]
                raw_selected = d["raw_selected_text"]
                char_map_inverse = d["char_map_inverse"]

                ids = d["ids"].to(device, dtype=torch.long)
                token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
                mask = d["mask"].to(device, dtype=torch.long)

                # Called without start/end positions so the model returns the top-k
                # probabilities and indices instead of the loss.
                # https://huggingface.co/transformers/model_doc/xlnet.html#xlnetforquestionanswering
                outputs = model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=token_type_ids
                )

                # start_top_probs contains the probabilities of the model.start_n_top highest
                # probability start positions, in decreasing order of probability.
                # The documentation claims that the values are log probabilities, which seems
                # to be incorrect given that the values are in [0-1].
                start_top_probs = outputs[0]
                # start_top_index contains those positions' indices in the sequence
                start_top_index = outputs[1]

                # the i-th element of start_top_index, start_top_probs is associated with elements
                # j*model.start_n_top+i (j=1...model.end_n_top) of end_top_index, end_top_probs,
                # where j represents the j-th highest probability end position
                # and NOT with the i*END_N_TOP+j elements as used here
                # https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py#L639
                # this can be verified by checking summation to unity
                end_top_probs = outputs[2]
                end_top_index = outputs[3]

                # joint probability of every (start, end) tuple
                tiled_start_index = start_top_index.repeat(1, model.end_n_top)
                start_end_probs = start_top_probs.repeat(1, model.end_n_top) * end_top_probs

                # Reorder so that probabilities are laid out by flat sequence position
                # (end * MAX_LEN + start) rather than by probability rank, so they can be
                # combined with the output of other models.
                flat_positions = end_top_index * config.MAX_LEN + tiled_start_index
                _, order = torch.sort(flat_positions, dim=1)
                start_end_probs_sorted = start_end_probs.gather(1, order)

                # (flat) position in the sequence of the highest probability tuple
                best_flat = start_end_probs_sorted.argmax(dim=1).cpu().numpy()

                # convert flat position to separate start and end positions
                start_top_positions = best_flat % config.MAX_LEN
                end_top_positions = best_flat // config.MAX_LEN

                for px, tweet in enumerate(raw_tweet):
                    _, output_sentence = calculate_jaccard_score(
                        raw_tweet=tweet,
                        target_string=raw_selected[px],
                        sentiment_val=sentiment[px],
                        idx_start=start_top_positions[px],
                        idx_end=end_top_positions[px],
                        offsets=offsets[px],
                        char_map_inverse=char_map_inverse[px],
                        cleaned_tweet=orig_tweet[px]
                    )
                    final_output.append({'text': tweet, 'prediction': output_sentence})

        # free the model before loading the next checkpoint
        del model, train_dataset, train_data_loader
        gc.collect()

    # Explicit columns keep the merge valid even when no predictions were produced.
    # NOTE(review): merging on 'text' duplicates rows if tweets are not unique — confirm.
    predictions = pd.DataFrame(final_output, columns=['text', 'prediction'])
    df_train = df_train.merge(predictions, on='text', how='left')

    return df_train


In [0]:
# True only outside Colab when the kernel's connection-file path does not
# contain 'runtime'. The short-circuit `and` means get_ipython() is only
# queried when not running in Colab.
IN_KAGGLE_COMMIT = (not IN_COLAB) and (
    'runtime' not in get_ipython().config.IPKernelApp.connection_file
)

print(IN_KAGGLE_COMMIT)

False


# Run

In [0]:
 %%time
 
# Entry point of the notebook: which pipeline runs depends on the environment flags.
# In Colab the models are trained; in a Kaggle commit run only the per-model
# test predictions for voting are produced.
if IN_COLAB:
    run_training()

if IN_KAGGLE_COMMIT:
    #predict_test()
    #gen_probs_test()
    predict_test_for_voting()

  0%|          | 0/1374 [00:00<?, ?it/s]

Training is Starting for fold=0


100%|██████████| 1374/1374 [25:13<00:00,  1.10s/it, loss=1.09]
100%|██████████| 344/344 [06:25<00:00,  1.12s/it, jaccard=0.699, loss=0.832]


Jaccard = 0.6991815998470465
Loss = 0.832125565934563
Jaccard Score = 0.6991815998470465
Loss score = 0.832125565934563
Validation loss decreased (inf --> 0.832126).  Saving model ...


100%|██████████| 1374/1374 [24:48<00:00,  1.08s/it, loss=0.771]
100%|██████████| 344/344 [06:23<00:00,  1.12s/it, jaccard=0.7, loss=0.806]


Jaccard = 0.7000042308098022
Loss = 0.8063622435020741
Jaccard Score = 0.7000042308098022
Loss score = 0.8063622435020741
Validation loss decreased (0.832126 --> 0.806362).  Saving model ...


100%|██████████| 1374/1374 [24:48<00:00,  1.08s/it, loss=0.654]
100%|██████████| 344/344 [06:24<00:00,  1.12s/it, jaccard=0.703, loss=0.877]
  0%|          | 0/1374 [00:00<?, ?it/s]

Jaccard = 0.7033174365021408
Loss = 0.8773689320340774
Jaccard Score = 0.7033174365021408
Loss score = 0.8773689320340774
EarlyStopping counter: 1 out of 2


100%|██████████| 1374/1374 [24:47<00:00,  1.08s/it, loss=0.552]
100%|██████████| 344/344 [06:22<00:00,  1.11s/it, jaccard=0.702, loss=0.96]

Jaccard = 0.7022890720447954
Loss = 0.9597559294716239
Jaccard Score = 0.7022890720447954
Loss score = 0.9597559294716239
EarlyStopping counter: 2 out of 2
Early stopping
Mean val loss: 0.8063622435020741
CPU times: user 16min 59s, sys: 3min 52s, total: 20min 51s
Wall time: 2h 5min 48s





In [0]:
# Per the recorded output below, this evaluates each fold and prints its Jaccard and loss.
# NOTE(review): the final "Mean val loss" figure in the output (0.70097...) equals the mean
# of the five per-fold Jaccard values, not of the losses — verify the label/value in get_cv_loss.
get_cv_loss()

Evaluating fold=0


100%|██████████| 344/344 [10:26<00:00,  1.82s/it, jaccard=0.696, loss=0.817]


Jaccard = 0.6963852003172368
Loss = 0.8167823214284544
Evaluating fold=1


100%|██████████| 344/344 [10:29<00:00,  1.83s/it, jaccard=0.704, loss=0.823]


Jaccard = 0.7036242974222892
Loss = 0.8227394022795831
Evaluating fold=2


100%|██████████| 344/344 [10:29<00:00,  1.83s/it, jaccard=0.703, loss=0.814]


Jaccard = 0.7031660023647034
Loss = 0.8143632163387199
Evaluating fold=3


100%|██████████| 344/344 [10:30<00:00,  1.83s/it, jaccard=0.704, loss=0.811]


Jaccard = 0.7039190932883281
Loss = 0.8110352138591472
Evaluating fold=4


100%|██████████| 344/344 [10:29<00:00,  1.83s/it, jaccard=0.698, loss=0.839]

Jaccard = 0.6977646500076556
Loss = 0.8390276559575677
Mean val loss: 0.7009718486800426





In [0]:
# test_df = pd.read_csv(config.TESTING_FILE).set_index("textID")

# sub_df = pd.read_csv(config.SUBMISSION_FILE).set_index("textID")

# # Everything not presented in the public set 
# # will take a value of the original text
# test_df["selected_text"] = test_df.text

# # Get the public ids and assign them
# public_idxs = sub_df.index.values
# test_df.loc[public_idxs, "selected_text"] = sub_df.selected_text.values
# test_df[["selected_text"]].to_csv("submission.csv")

In [0]:
# train_df = pd.read_csv(config.TRAINING_FILE)

In [0]:
# output = predict_train()#1000)
# output.to_csv(config.MODEL_PATH / 'train_predictions.csv', index=False)

In [0]:
# output = pd.read_csv(config.MODEL_PATH / 'train_predictions.csv')

# Scratch

In [0]:
# dfx = pd.read_csv(config.TRAINING_FILE)

# train_dataset = TweetDataset(
#     tweet=dfx.text.values,
#     sentiment=dfx.sentiment.values,
#     selected_text=dfx.selected_text.values
# )

# train_data_loader = torch.utils.data.DataLoader(
#     train_dataset,
#     batch_size=config.TRAIN_BATCH_SIZE,
#     num_workers=4
# )

# words = []

# for t in train_dataset:
#   t_words = t['orig_tweet'].split()
#   words += t_words

# from collections import Counter
# words_counter = Counter(words)
# words_counter_df = pd.DataFrame.from_dict(words_counter, orient='index', columns=['count']).reset_index()

In [0]:
# train_dataset[241]['orig_tweet']

In [0]:
# fold = 0

# dfx = pd.read_csv(config.TRAINING_FILE)

# df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
# df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

# train_dataset = TweetDataset(
#     tweet=df_train.text.values,
#     sentiment=df_train.sentiment.values,
#     selected_text=df_train.selected_text.values
# )

# train_data_loader = torch.utils.data.DataLoader(
#     train_dataset,
#     batch_size=config.TRAIN_BATCH_SIZE,
#     num_workers=4
# )

In [0]:
# train_data_loader.dataset.shape

In [0]:
# len(df_train.iloc[1,:].text)

In [0]:
# tweet = " We've just 16ï¿½C today&amp;cold wind..  Want it 2b like 25ï¿½ to 30ï¿½! I love hot weather! But I reaped the 1st strawberry yday!"
# clean_tweet = " We've just 16ï¿½C today&cold wind..  Want it To be like 25ï¿½ to 30ï¿½! I love hot weather! But I reaped the 1st strawberry yday!"
# selected_text = "We've just 16ï¿½C today&amp;cold wind..  Want it 2b like 25ï¿½ to 30ï¿½! I love hot weather! But I reaped the 1st strawberry yd"
# #char_map = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128]
# len(clean_tweet)
# #char_map_inverse = pd.Series([max([j for j,k in enumerate(char_map) if k==i], default=None) for i in range(len(clean_tweet))]).fillna(method='backfill').fillna(len(tweet)-1).astype(int).values.tolist()
# #print([tweet[e] for e in char_map_inverse])
# #print(len(tweet))
# print(len(tweet))

In [0]:
# char_targets = list(range(len(tweet)))
# char_targets = [0, 1, 2]

In [0]:
# char_targets[200:210]

In [0]:
# char_map_inverse = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 51, 51, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130]
# len(char_map_inverse)

In [0]:
# print(df_valid.pipe(lambda x:x[x.text==" We've just 16ï¿½C today&amp;cold wind..  Want it 2b like 25ï¿½ to 30ï¿½! I love hot weather! But I reaped the 1st strawberry yday!"]).iloc[0].selected_text[-1])

In [0]:
# tweet, selected_text

In [0]:
# #selected_text = 
# tweet = data['raw_tweet']
# selected_text = data['raw_selected_text']
# sentiment = 'neutral'
# tokenizer = config.TOKENIZER
# max_len = config.MAX_LEN
# slang_dict = config.SLANG_DICT
# data = process_data(tweet, selected_text, sentiment, tokenizer, max_len, slang_dict)
# print(data)