In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); force_remount=True is
# passed so the mount is redone even if the drive is already attached.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive. Input files and fold checkpoints used by later
# cells are addressed relative to this folder (note the trailing slash).
google_drive_path = '/content/drive/MyDrive/XAI/'

# TensorFlow roBERTa Starter - LB 0.705
This notebook is a TensorFlow template for solving Kaggle's Tweet Sentiment Extraction competition as a question and answer roBERTa formulation. In this notebook, we show how to tokenize the data, create question answer targets, and how to build a custom question answer head for roBERTa in TensorFlow. Note that HuggingFace transformers don't have a `TFRobertaForQuestionAnswering` so we must make our own from `TFRobertaModel`. This notebook can achieve LB 0.715 with some modifications. Have fun experimenting!

You can also run this code offline, where it will save the best model weights during each of the 5 folds of training. Upload those weights to a private Kaggle dataset and attach the dataset to this notebook. You can then run this notebook with the line `model.fit()` commented out, and it will load your offline models instead of training. It will use those models to produce the out-of-fold (OOF) predictions and the test predictions, so this notebook can easily be converted into an inference notebook. An inference notebook is advantageous because it takes only about 10 minutes to commit and submit instead of 2 hours; it is better to do the 2 hours of training offline, separately.

# Load Libraries, Data, Tokenizer
We will use HuggingFace transformers [here][1]

[1]: https://huggingface.co/transformers/

In [None]:
import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers

# Record the TensorFlow version in the cell output for reproducibility.
print('TF version',tf.__version__)



TF version 2.15.0


In [None]:
import os

# NOTE(review): MAX_LEN is not referenced by any other visible cell - confirm it is still needed.
MAX_LEN = 96

# Byte-level BPE tokenizer built from the roBERTa vocabulary/merges files stored on Drive.
# Paths are assembled with os.path.join so that the trailing slash of
# `google_drive_path` never produces a doubled "//" separator.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab=os.path.join(google_drive_path, 'input', 'tf-roberta', 'vocab-roberta-base.json'),
    merges=os.path.join(google_drive_path, 'input', 'tf-roberta', 'merges-roberta-base.txt'),
    lowercase=True,
    add_prefix_space=True
)

# presumably the roBERTa vocabulary ids of the three sentiment words - TODO confirm
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

# Missing cells are replaced by empty strings so later string operations never see NaN.
train = pd.read_csv(
    os.path.join(google_drive_path, 'input', 'tweet-sentiment-extraction', 'train.csv')
).fillna('')
test = pd.read_csv(
    os.path.join(google_drive_path, 'input', 'tweet-sentiment-extraction', 'test.csv')
).fillna('')
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


# Training Data
We will now convert the training data into arrays that roBERTa understands. Here are example inputs and targets:
![ids.jpg](attachment:ids.jpg)
The tokenization logic below is inspired by Abhishek's PyTorch notebook [here][1].

[1]: https://www.kaggle.com/abhishek/roberta-inference-5-folds

### Prepare dataset for implementing Question Answering Transformers Model

In [None]:
# Work on independent copies so the question-answering columns added to these
# frames never touch the original `train` / `test` data.
train_qa, test_qa = train.copy(), test.copy()

In [None]:
# Add column question and answer_start to the dataset
train_qa['question'] = 'Why is this sentiment '+train_qa['sentiment']+'?'
test_qa['question'] = 'Why is this sentiment '+test_qa['sentiment']+'?'


def find_answer_start(text, selected_text):
    """Return the character offset of `selected_text` inside `text`.

    Both strings are whitespace-normalised (runs of whitespace collapsed to one
    space, ends trimmed) before searching, so the offset refers to the
    normalised text.

    Returns:
        The 0-based offset when the answer is found, -1 when `selected_text` is
        empty after normalisation, and -2 when it does not occur in the text.
    """
    # The leading space plus the final "- 1" make an empty answer come out as -1
    # instead of 0, which lets downstream code recognise "no answer" rows.
    text1 = " " + " ".join(text.split())
    text2 = " ".join(selected_text.split())
    return text1.find(text2) - 1


# One vectorised assignment instead of a per-row `.loc` write: much faster and
# independent of the frame's index labels.
# NOTE(review): offsets are relative to the whitespace-normalised text, while the
# tokenizer later receives the raw `text` column - confirm this is intended.
train_qa['answer_start'] = [
    find_answer_start(text, selected_text)
    for text, selected_text in zip(train_qa['text'], train_qa['selected_text'])
]

In [None]:
# Store the answer offsets as integers, then preview the first rows.
train_qa = train_qa.astype({'answer_start': int})
train_qa.head()

Unnamed: 0,textID,text,selected_text,sentiment,question,answer_start
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,Why is this sentiment neutral?,0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,Why is this sentiment negative?,0
2,088c60f138,my boss is bullying me...,bullying me,negative,Why is this sentiment negative?,11
3,9642c003ef,what interview! leave me alone,leave me alone,negative,Why is this sentiment negative?,16
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,Why is this sentiment negative?,0


In [None]:
# Preview the test frame with its generated `question` column.
test_qa.head()

Unnamed: 0,textID,text,sentiment,question
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,Why is this sentiment neutral?
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,Why is this sentiment positive?
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,Why is this sentiment negative?
3,01082688c6,happy bday!,positive,Why is this sentiment positive?
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,Why is this sentiment positive?


### Implement Transformer question answering
https://huggingface.co/docs/transformers/tasks/question_answering


https://medium.com/mlearning-ai/question-answering-in-association-with-roberta-a11518e70507

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the "distilbert-base-uncased" checkpoint; it is the one called by
# preprocess_function and prepare_validation_features.
tokenizer_qa = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "s

In [None]:
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer token positions.

    Args:
        examples: batch mapping with the columns "question", "text" and
            "answers". Every entry of "answers" is a dict holding one-element
            lists under "answer_start" (character offset into the text) and
            "text" (the answer string).

    Returns:
        The tokenizer output (with "offset_mapping" removed) extended by
        "start_positions" and "end_positions", one token index per example.
        Examples without a usable answer - negative offset, no context tokens,
        or an answer lying outside the tokenized context - are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer_qa(
        questions,
        examples["text"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Token indices that belong to the context (sequence id 1).
        sequence_ids = inputs.sequence_ids(i)
        context_tokens = [k for k, seq_id in enumerate(sequence_ids) if seq_id == 1]

        # Rows with empty text / selected_text carry a negative offset. They are
        # handled up front so the context bounds are never read before being set
        # (previously an UnboundLocalError when such a row opened a batch, and a
        # reuse of the preceding row's bounds otherwise).
        if start_char < 0 or not context_tokens:
            start_positions.append(0)
            end_positions.append(0)
            continue

        context_start, context_end = context_tokens[0], context_tokens[-1]

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions:
            # walk forward to the last token starting at or before start_char ...
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # ... and backward to the first token ending at or after end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# pip install datasets

In [None]:
def convert_answers(r):
    """Pack one (answer_start, answer_text) row into the SQuAD-style answers dict.

    Args:
        r: a two-element row; either a pandas Series (as passed by
            `DataFrame.apply(axis=1)`) or any positionally indexable sequence.
            Element 0 is the answer's start offset, element 1 the answer text.

    Returns:
        {'answer_start': [start], 'text': [text]} - both values wrapped in
        one-element lists.
    """
    # A Series with string labels must be read by position through `.iloc`;
    # plain `r[0]` on such a Series is deprecated label/position guessing.
    positional = r.iloc if hasattr(r, "iloc") else r
    start = positional[0]
    text = positional[1]
    return {
        'answer_start': [start],
        'text': [text]
    }

In [None]:
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset

# Stratify the 5 folds on the sentiment label; the fixed seed keeps the split reproducible.
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=777)
for fold, (train_idx, val_idx) in enumerate(skf.split(train_qa, train_qa['sentiment'])):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)

    # Only the second fold (index 1) is materialised: the first one is skipped
    # and the loop stops as soon as the third one is announced.
    if fold == 0:
        continue
    if fold == 2:
        break

    # Attach the SQuAD-style `answers` dict that preprocess_function reads.
    train_df = train_qa.iloc[train_idx].copy().assign(
        answers=lambda frame: frame[['answer_start', 'selected_text']].apply(convert_answers, axis=1)
    )
    validation_df = train_qa.iloc[val_idx].copy().assign(
        answers=lambda frame: frame[['answer_start', 'selected_text']].apply(convert_answers, axis=1)
    )

    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(validation_df)

    # Tokenize in batches; every original column is dropped so only model inputs remain.
    tokenized_train_ds = train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=train_dataset.column_names,
    )
    tokenized_valid_ds = valid_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=valid_dataset.column_names,
    )

#########################
### FOLD 1
#########################
#########################
### FOLD 2
#########################


Map:   0%|          | 0/21985 [00:00<?, ? examples/s]

Map:   0%|          | 0/5496 [00:00<?, ? examples/s]

#########################
### FOLD 3
#########################


In [None]:
from transformers import DefaultDataCollator

# Collator instance handed to the Trainer as `data_collator`.
data_collator = DefaultDataCollator()

In [None]:
import os

from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# SECURITY: access tokens must never be written into a notebook. The token is read
# from the HF_TOKEN environment variable (e.g. populated from Colab secrets); when
# it is unset, None is passed and the public checkpoint is fetched anonymously.
# The token that used to be hard-coded here has been exposed and should be revoked.
access_token = os.environ.get("HF_TOKEN")

# Base DistilBERT checkpoint with a question-answering head.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased", token=access_token)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.35.2",
  "vocab_size": 30522
}



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/model.safetensors
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized

In [None]:
# pip install transformers[torch]

In [None]:
import accelerate

# Show the installed accelerate version in the cell output.
accelerate.__version__

'0.26.1'

In [None]:
# !pip install huggingface_hub
import os

from huggingface_hub.hf_api import HfFolder

# SECURITY: the token is taken from the HF_TOKEN environment variable instead of
# being hard-coded; nothing is stored when the variable is not set.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    HfFolder.save_token(hf_token)

In [None]:
# Training setup for fold 2: evaluation and checkpointing both happen once per epoch,
# and checkpoints are written below <google_drive_path>/fold2/.
training_args = TrainingArguments(
    output_dir=google_drive_path+"fold2/",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Start from the weights saved in checkpoint-4000 of this fold.
fold2_model = AutoModelForQuestionAnswering.from_pretrained(google_drive_path+"fold2/checkpoint-4000/")
trainer = Trainer(
    args=training_args,
    model=fold2_model,
    tokenizer=tokenizer_qa,
    data_collator=data_collator,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_valid_ds,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file /content/drive/MyDrive/XAI/fold2/checkpoint-4000/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/XAI/fold2/checkpoint-4000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "vocab_size": 30522
}

In [None]:
# Switch on TensorFlow's experimental NumPy behaviour.
# NOTE(review): the training cells use the PyTorch-based Trainer; confirm this
# TensorFlow setting is still required.
tf.experimental.numpy.experimental_enable_numpy_behavior()

In [None]:
# Run fine-tuning with the Trainer configured above (checkpoints go to output_dir).
trainer.train()


***** Running training *****
  Num examples = 21,985
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4,125
  Number of trainable parameters = 66,364,418


Epoch,Training Loss,Validation Loss
1,0.9202,1.33694


***** Running Evaluation *****
  Num examples = 5496
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/XAI/fold2/checkpoint-1375
Configuration saved in /content/drive/MyDrive/XAI/fold2/checkpoint-1375/config.json
Model weights saved in /content/drive/MyDrive/XAI/fold2/checkpoint-1375/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/XAI/fold2/checkpoint-1375/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/XAI/fold2/checkpoint-1375/special_tokens_map.json


KeyboardInterrupt: 

In [None]:
# Save the trainer's current model under fold2/my-fold2-model/ on Drive.
trainer.save_model(google_drive_path+"fold2/my-fold2-model/")

### Metrics

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the size
    of the intersection of the two word sets divided by the size of their union.
    Two strings without any words score 0.5.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    if not words_a and not words_b:
        return 0.5
    shared = len(words_a & words_b)
    return float(shared) / (len(words_a) + len(words_b) - shared)

### Evaluate model with Jaccard

In [None]:
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForQuestionAnswering

# Tokenizer and model are both restored from the fold-2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained(google_drive_path+"fold2/checkpoint-4000")
fold2_model = AutoModelForQuestionAnswering.from_pretrained(google_drive_path+"fold2/checkpoint-4000")

jaccard_scores = []  # per-example scores (formerly a list named `all`, which shadowed the builtin)
all_st = []          # predicted selected_text for every validation example
jac = []
for example in valid_dataset:
  question = example['question']
  context = example['text']
  inputs = tokenizer(question, context, return_tensors="pt", return_offsets_mapping=True)
  # The model does not accept offsets; keep them aside to map tokens back to characters.
  offsets = inputs.pop("offset_mapping")[0].tolist()
  # Token indices that belong to the tweet (sequence id 1), not to the question.
  context_tokens = [k for k, seq_id in enumerate(inputs.sequence_ids(0)) if seq_id == 1]

  with torch.no_grad():
    outputs = fold2_model(**inputs)

  answer_start_index = int(outputs.start_logits.argmax())
  answer_end_index = int(outputs.end_logits.argmax())

  span_is_unusable = (
      not context_tokens
      or answer_start_index > answer_end_index
      or answer_end_index < context_tokens[0]
      or answer_start_index > context_tokens[-1]
  )
  if span_is_unusable:
    st = example['text'] # IMPROVE CV/LB with better choice here
  else:
    # Clamp the span to the tweet and slice the ORIGINAL text through the character
    # offsets. Decoding token ids instead yields lower-cased, re-spaced text and can
    # leak special tokens such as "[SEP]" into the prediction, which hurts Jaccard.
    first_token = max(answer_start_index, context_tokens[0])
    last_token = min(answer_end_index, context_tokens[-1])
    st = context[offsets[first_token][0]:offsets[last_token][1]]
  all_st.append(st)
  jaccard_scores.append(jaccard(st,example['selected_text']))
jac.append(np.mean(jaccard_scores))
# The previous format string had no placeholder, so the score was never shown.
print('>>>> Jaccard = %.4f' % jac[-1])
print()

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file /content/drive/MyDrive/XAI/fold2/checkpoint-4000/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/XAI/fold2/checkpoint-4000",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "vocab_size": 30522
}

loading weights file /content/drive/MyDrive/XAI/fold2/checkpoint-4000/model.safetensors
All model checkpoint weights were used when initializ

>>>> Jaccard = 



In [None]:
# Show the list of mean Jaccard scores collected by the evaluation cell.
print(jac)

[0.5593737215564758]


### Prepare dataset for testing

In [None]:
def prepare_validation_features(examples):
    """Tokenize question/context pairs for inference.

    Args:
        examples: batch mapping with the columns "question", "text" and "textID".

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed and two
        adjustments: "example_id" holds the textID each feature came from, and
        in "offset_mapping" every entry that does not belong to the context
        (sequence id 1) is replaced by None.
    """
    stripped_questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer_qa(
        stripped_questions,
        examples["text"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        padding="max_length",
    )

    # Maps every produced feature back to the index of its source example.
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    context_index = 1
    example_ids = []

    for feature_idx in range(len(inputs["input_ids"])):
        seq_ids = inputs.sequence_ids(feature_idx)
        example_ids.append(examples["textID"][sample_mapping[feature_idx]])

        # Keep character offsets only for context tokens.
        masked_offsets = []
        for token_idx, span in enumerate(inputs["offset_mapping"][feature_idx]):
            masked_offsets.append(span if seq_ids[token_idx] == context_index else None)
        inputs["offset_mapping"][feature_idx] = masked_offsets

    inputs["example_id"] = example_ids
    return inputs

In [None]:
# Wrap the test frame in a `Dataset` object (the `Dataset` name is imported in the fold-preparation cell).
test_dataset = Dataset.from_pandas(test_qa)

In [None]:
# Tokenize the test set; the original columns are dropped so only the features remain.
test_features = test_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=test_dataset.column_names
)
# Model-ready view without the bookkeeping columns. `remove_columns` drops them
# directly instead of pushing every row through an identity `map`.
test_feats_small = test_features.remove_columns(['example_id', 'offset_mapping'])

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

### Show selected text for test dataset

In [None]:
all_st_test = []  # predicted selected_text for every test example
for example in test_dataset:
  question = example['question']
  context = example['text']
  inputs = tokenizer(question, context, return_tensors="pt", return_offsets_mapping=True)
  # The model does not accept offsets; keep them aside to map tokens back to characters.
  offsets = inputs.pop("offset_mapping")[0].tolist()
  # Token indices that belong to the tweet (sequence id 1), not to the question.
  context_tokens = [k for k, seq_id in enumerate(inputs.sequence_ids(0)) if seq_id == 1]

  with torch.no_grad():
    outputs = fold2_model(**inputs)

  answer_start_index = int(outputs.start_logits.argmax())
  answer_end_index = int(outputs.end_logits.argmax())

  span_is_unusable = (
      not context_tokens
      or answer_start_index > answer_end_index
      or answer_end_index < context_tokens[0]
      or answer_start_index > context_tokens[-1]
  )
  if span_is_unusable:
    st = example['text'] # IMPROVE CV/LB with better choice here
  else:
    # Clamp the span to the tweet and slice the ORIGINAL text through the character
    # offsets; decoding token ids produced lower-cased, re-spaced text and leaked
    # special tokens (e.g. "[SEP] thank you so much") into the submission.
    first_token = max(answer_start_index, context_tokens[0])
    last_token = min(answer_end_index, context_tokens[-1])
    st = context[offsets[first_token][0]:offsets[last_token][1]]
  all_st_test.append(st)


In [None]:
# Attach the predicted spans to the test frame and write the two-column submission file.
test['selected_text'] = all_st_test
test[['textID','selected_text']].to_csv('submission.csv',index=False)
# Set the column display width to 60, then show 25 random rows as a spot check.
pd.set_option('max_colwidth', 60)
test.sample(25)

Unnamed: 0,textID,text,sentiment,selected_text
325,0142108215,Thank you so much phaoloo !!!!,positive,[SEP] thank you so much
2226,fb08563a7b,Midnight ice-cream weather! So **** bored,negative,so * * * * bored
2480,9a2c6ae21c,Ohh i forgot to tell you last night that when i was a al...,positive,was amazing
1862,5de3c34293,i am the only arabic girl who`s online every one is a ...,neutral,i am the only arabic girl who ` s online every one is a ...
299,cf0d831059,starting the video editing of the first spanking movie w...,neutral,starting the video editing of the first spanking movie w...
1637,2d361cdd4d,I have it. Hehehehe u want the torrent?,neutral,[SEP] i have it. hehehehe u want the torrent
2374,cb4adb2254,I wanna do something tonight after work.... But I dunno ...,neutral,i wanna do something tonight after work.... but i dunno ...
3355,32cde6dbc5,in school w. linda doing nothing ;i miss you,negative,i miss you
139,3f0f5891ab,love your books,positive,love your books
1456,02073f1c62,I wanna crawl under my desk and take a nap nvrmind its ...,negative,its dirty


### SHAP QA + Hugging Face

In [None]:
# pip install shap

Collecting shap
  Downloading shap-0.44.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (535 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m535.7/535.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.44.1 slicer-0.0.7


In [None]:
# Question-answering pipeline backed by the fold-2 checkpoint; it is the callable used by make_answer_scorer.
# NOTE(review): `pipeline` is not imported explicitly in this cell - presumably it comes from
# the `from transformers import *` in the imports cell; confirm.
question_answerer = pipeline("question-answering", model=google_drive_path+'fold2/checkpoint-4000/')

loading configuration file /content/drive/MyDrive/XAI/fold2/checkpoint-4000/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/XAI/fold2/checkpoint-4000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "vocab_size": 30522
}

loading configuration file /content/drive/MyDrive/XAI/fold2/checkpoint-4000/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/drive/MyDrive/XAI/fold2/checkpoint-4000/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dr

In [None]:
def make_answer_scorer(answers):
    """Build a scoring function that rates candidate answers for SHAP.

    Args:
        answers: list of candidate answer strings; one output column is
            produced per candidate.

    Returns:
        A function `f(questions)` taking an iterable of "question[SEP]context"
        strings and returning, for each of them, a list with the pipeline score
        of every candidate (0 when the candidate is not among the top answers).
        The function carries `output_names = answers`.
    """
    def f(questions):
        out = []
        for q in questions:
            # Split on the FIRST separator only, so a context that itself contains
            # "[SEP]" no longer breaks the unpacking.
            question, separator, context = q.partition("[SEP]")
            if not separator or not question.strip() or not context.strip():
                # Without both a question and a context the pipeline cannot be
                # called; score every candidate as 0.
                out.append([0] * len(answers))
                continue

            # `top_k` replaces the deprecated `topk` argument.
            results = question_answerer(question=question, context=context, top_k=20)
            # A single answer may be returned as a bare dict instead of a list.
            if isinstance(results, dict):
                results = [results]

            # Keep the score of the first (highest-ranked) occurrence of each answer.
            best_score = {}
            for result in results:
                best_score.setdefault(result["answer"], result["score"])
            out.append([best_score.get(answer, 0) for answer in answers])
        return out

    f.output_names = answers
    return f

In [None]:
# Build the "question[SEP]context" string for training example 19 in this cell, so
# the prints also work on a fresh kernel (it used to rely on a variable that is
# only assigned in a later cell).
new_string = train_dataset[19]['question'] + "[SEP]" + train_dataset[19]['text']
print(new_string)
print(train_dataset[19]['selected_text'].split())

Why is this sentiment positive?[SEP]the free fillin` app on my ipod is fun, im addicted
['the', 'free', 'fillin`', 'app', 'on', 'my', 'ipod', 'is', 'fun,', 'im', 'addicted']


In [None]:
import shap

# Explain training example 19: the model input is "question[SEP]context" and the
# candidate answers are the individual words of its selected_text.
new_string = "[SEP]".join([train_dataset[19]['question'], train_dataset[19]['text']])
our_train_data = [new_string]

f_answers = make_answer_scorer(train_dataset[19]['selected_text'].split())
explainer_answers = shap.Explainer(f_answers, tokenizer)
shap_values_answers = explainer_answers(our_train_data)

# Render the per-token attributions as highlighted text.
shap.plots.text(shap_values_answers)

<class 'list'>


topk parameter is deprecated, use top_k instead


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:43, 43.28s/it]               


# Kaggle Submission

In [None]:
# NOTE(review): `input_ids_t`, `preds_start` and `preds_end` are not defined in any
# visible cell, and `tokenizer` was rebound to an AutoTokenizer in the evaluation
# cell (this loop reads `enc.ids`) - this looks like a leftover from the original
# starter notebook; confirm whether the cell should still be run.
# NOTE(review): the list name `all` shadows the Python builtin of the same name.
all = []
for k in range(input_ids_t.shape[0]):
    # Most likely start / end token index for test row k.
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    if a>b:
        # Start after end: fall back to the whole tweet.
        st = test.loc[k,'text']
    else:
        # Re-encode the whitespace-normalised tweet and decode the selected id slice.
        text1 = " "+" ".join(test.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        st = tokenizer.decode(enc.ids[a-1:b])
    all.append(st)

In [None]:
# NOTE(review): this writes 'submission.csv' again, replacing the file produced from
# `all_st_test` earlier in the notebook - confirm which of the two is intended.
test['selected_text'] = all
test[['textID','selected_text']].to_csv('submission.csv',index=False)
# Set the column display width to 60, then show 25 random rows as a spot check.
pd.set_option('max_colwidth', 60)
test.sample(25)