# Dataset generation

## NER model

In [48]:
from transformers import BartTokenizerFast, BartForConditionalGeneration

In [49]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

Downloading:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [50]:
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')

In [51]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)

In [None]:
!wget https://raw.githubusercontent.com/jamescalam/transformers/main/data/text/meditations/clean.txt

In [52]:
with open('clean.txt', 'r') as fp:
    text = fp.read().split('\n')

In [53]:
text = text[:50]

In [54]:
inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [55]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [56]:
tokenizer.convert_ids_to_tokens(101)

'Ġlike'

In [59]:
from tqdm import tqdm

def get_mask(inputs):
  mask = []
  for sentense in tqdm(inputs['input_ids']):    
    text_tok = tokenizer.convert_ids_to_tokens(sentense)
    #print(text_tok)
    text_tok = [tok.replace("Ġ", "") for tok in text_tok if tok not in ['<s>','</s>','<pad>']] #<s> </s>?
    #print(text_tok)
    ner_results = nlp(" ".join(text_tok))
    res = [False] * 512
    for entity in ner_results:
      try:
        res[text_tok.index(entity['word'])] = True
      except ValueError:
        pass
    #print(ner_results)
    mask.append(res)
  return mask

In [60]:
mask_arr = get_mask(inputs)
mask_arr = torch.Tensor(mask_arr)

100%|██████████| 50/50 [00:31<00:00,  1.61it/s]


In [61]:
mask_arr.size()

torch.Size([50, 512])

In [62]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [63]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [64]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [65]:
dataset = MeditationsDataset(inputs)

In [66]:

loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)

In [67]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
# activate training mode
model.train()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        

In [68]:
from transformers import AdamW
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-5)

In [69]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  """
Epoch 0: 100%|██████████| 25/25 [07:35<00:00, 18.22s/it, loss=3.43]
Epoch 1: 100%|██████████| 25/25 [07:33<00:00, 18.13s/it, loss=1.5]
