In [184]:
from transformers import BertForQuestionAnswering, AutoTokenizer, DefaultDataCollator, TrainingArguments, Trainer, BertTokenizer
import torch

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [185]:
context = "Engage surface-to-air missiles on green and silver missile at heading two niner five. Target locked, prepare for launch."
question = "Target"


inputs = tokenizer(question, context, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)

# Find the tokens with the highest `start` and `end` scores
answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1

# Convert tokens to answer string
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs.input_ids[0, answer_start:answer_end]))

print("Answer:", answer)



Answer: 


In [8]:
# import json
# with open("/home/jupyter/advanced/nlp.jsonl", "r") as f:
#     instances = [json.loads(line.strip()) for line in f if line.strip() != ""]
# instances[0]
# from datasets import load_dataset
# dataset = load_dataset('json', data_files='/home/jupyter/advanced/nlp.jsonl',split='train')
# dataset.rename_column("transcript", "context")

# dataset = dataset.train_test_split(test_size=0.2)
from datasets import Dataset, DatasetDict
import json
import pandas as pd 

df = pd.read_json("/home/jupyter/advanced/nlp.jsonl", lines=True)
df = df.rename(columns={"transcript": "context"})

df1 = df[['context', 'tool']]
df1['question'] = 'What is the tool used?'
df1 = df1.rename(columns={"tool": "answer"})

df2 = df[['context', 'target']]
df2['question'] = 'What is the target?'
df2 = df2.rename(columns={"target": "answer"})

df_merged = pd.concat([df1, df2], ignore_index=True, sort=False)
dataset = Dataset.from_pandas(df_merged)
def preprocess(instance):
    instance['answers'] = {"text": [instance["answer"]], "answer_start" : [instance['context'].find(instance["answer"])]}
    return instance

dataset = dataset.map(preprocess).remove_columns("answer")
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['question'] = 'What is the tool used?'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['question'] = 'What is the target?'


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

In [12]:
print(dataset['train'][1]['context'])
print(dataset['train'][1]['question'])
print(dataset['train'][1]['answers'])

Control here, requesting deployment of surface-to-air missiles. Target is an orange commercial aircraft at heading two six zero. Take aim and fire at will. Over.
What is the target?
{'answer_start': [77], 'text': ['orange commercial aircraft']}


In [11]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

In [190]:
training_args = TrainingArguments(
    output_dir="albert_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.0,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.027826


TrainOutput(global_step=350, training_loss=0.2482257298060826, metrics={'train_runtime': 128.8169, 'train_samples_per_second': 43.473, 'train_steps_per_second': 2.717, 'total_flos': 365815459430400.0, 'train_loss': 0.2482257298060826, 'epoch': 1.0})

In [191]:
trainer.save_model("./albert_model")

In [192]:
context = 'Target the silver, purple, and orange drone at heading one seven zero. Deploy anti-air artillery.'
question = "What is the target?"

# Tokenize the context to find the exact start and end position of the answer
encoded = tokenizer.encode_plus(question, context, return_tensors="pt").to('cuda')
input_ids = encoded["input_ids"].tolist()[0]

model.eval()
with torch.no_grad():
    outputs = model(**encoded)
answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1

# Convert tokens to answer string
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
print("Improved Answer:", answer)

Improved Answer: silver, purple, and orange drone


In [55]:
import re
 
def convert_to_numbers(s):
    s = s.lower()
    words_to_numbers = {
        'one': '1',
        'two': '2',
        'three': '3',
        'four': '4',
        'five': '5',
        'six': '6',
        'seven': '7',
        'eight': '8',
        'nine': '9',
        'niner': '9',
        'zero': '0'
    }
 
    pattern = re.compile(r'\b(' + '|'.join(words_to_numbers.keys()) + r')\b')
    text = re.sub(pattern, lambda x: words_to_numbers[x.group()], s)
    pattern2 = re.compile(r'\d \d \d')
    text = re.findall(pattern2, text)
    try:
        return text[0].replace(' ', '')
    except Exception as e:
        print(s)
        return '000'
        

extracted = list(map(lambda x: convert_to_numbers(x), dataset['transcript']))


In [59]:
def compare(extracted, original):
    count = 0
    for i in range(len(extracted)):
        if extracted[i] != original[i]:
            count += 1
    return count

print(compare(extracted, dataset['heading']))

0


In [1]:
"""https://huggingface.co/deepset/roberta-base-squad2"""
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = nlp(QA_input)

# b) Load model & tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 5600
    })
    test: Dataset({
        features: ['context', 'question', 'answers', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1400
    })
})

In [14]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [15]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_name) 



In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss
1,No log,0.009283
2,0.046800,0.014221
3,0.009100,0.004192


TrainOutput(global_step=1050, training_loss=0.026902887792814346, metrics={'train_runtime': 421.4291, 'train_samples_per_second': 39.864, 'train_steps_per_second': 2.492, 'total_flos': 1097446378291200.0, 'train_loss': 0.026902887792814346, 'epoch': 3.0})

In [18]:
trainer.save_model("nlp_roberta")

In [19]:
model

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay