In [25]:
import json

with open('contracts.json', 'r') as file:
    data = json.load(file)

formatted_data = []

for i, entry in enumerate(data):
    context = entry['Context']
    qas = []
    for q, a in entry.items():
        if q != "Context" and q != "Date":
            answer_start = context.find(a)
            if answer_start != -1:
                qas.append({
                    "id": str(i+1) + str(len(qas) + 1).zfill(5),
                    "is_impossible": False,
                    "question": q,
                    "answers": [
                        {
                            "text": a,
                            "answer_start": answer_start,
                        }
                    ],
                })
    formatted_data.append({
        "context": context,
        "qas": qas,
    })

In [26]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(formatted_data, test_size=0.05)

with open('contract_data_train.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

with open('contract_data_test.json', 'w', encoding='utf-8') as f:
    json.dump(test_data, f, ensure_ascii=False, indent=4)

In [27]:
pip install simpletransformers



In [28]:
import logging
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

In [29]:
with open(r"contract_data_train.json", "r") as read_file:
    train = json.load(read_file)

with open(r"contract_data_test.json", "r") as read_file:
    test = json.load(read_file)

In [30]:
#train_args are the parameters the QuestionAnswerringModel will use
train_args = {
    'overwrite_output_dir': True,
    "evaluate_during_training": True,
    "max_seq_length": 128,
    "num_train_epochs": 3, #25, after experimentations
    "evaluate_during_training_steps": 500,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "n_best_size":16, #batch_size is another important argument
    "train_batch_size": 4,
    "eval_batch_size": 4
}

In [31]:
# model = QuestionAnsweringModel("bert",
#                                "bert-large-uncased-whole-word-masking-finetuned-squad",
#                                args = train_args,
#                                use_cuda=True)

model = QuestionAnsweringModel("roberta",
                               "deepset/roberta-base-squad2",
                               args = train_args,
                               use_cuda=True)

In [32]:
model.train_model(train, eval_data=test)

convert squad examples to features: 100%|██████████| 363/363 [00:05<00:00, 70.08it/s]
add example index and unique id: 100%|██████████| 363/363 [00:00<00:00, 226668.51it/s]


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/91 [00:00<?, ?it/s]


convert squad examples to features:   0%|          | 0/22 [00:00<?, ?it/s][A
convert squad examples to features: 100%|██████████| 22/22 [00:00<00:00, 124.97it/s]

add example index and unique id: 100%|██████████| 22/22 [00:00<00:00, 154047.89it/s]


Running Evaluation:   0%|          | 0/6 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/91 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 22/22 [00:00<00:00, 276.26it/s]

add example index and unique id: 100%|██████████| 22/22 [00:00<00:00, 142840.07it/s]


Running Evaluation:   0%|          | 0/6 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/91 [00:00<?, ?it/s]


convert squad examples to features: 100%|██████████| 22/22 [00:00<00:00, 275.06it/s]

add example index and unique id: 100%|██████████| 22/22 [00:00<00:00, 163029.48it/s]


Running Evaluation:   0%|          | 0/6 [00:00<?, ?it/s]

(273,
 {'global_step': [91, 182, 273],
  'correct': [6, 6, 6],
  'similar': [15, 16, 16],
  'incorrect': [1, 0, 0],
  'train_loss': [1.0523817539215088,
   0.5087957382202148,
   0.0014573931694030762],
  'eval_loss': [-7.046223958333333, -8.192708333333334, -8.219401041666666]})

In [33]:
# Evaluate the model
result, texts = model.eval_model(test)

convert squad examples to features: 100%|██████████| 22/22 [00:00<00:00, 298.46it/s]
add example index and unique id: 100%|██████████| 22/22 [00:00<00:00, 139810.13it/s]


Running Evaluation:   0%|          | 0/6 [00:00<?, ?it/s]

In [34]:
print(result)

{'correct': 6, 'similar': 16, 'incorrect': 0, 'eval_loss': -8.219401041666666}


In [35]:
print(texts)

{'correct_text': {'100003': '$459,802,408', '100010': 'order-dependent', '100011': '', '1200003': '$8,631,282', '1200010': 'firm-fixed-price indefinite-delivery/indefinite-quantity', '1200011': ''}, 'similar_text': {'100001': {'truth': 'BAE Systems Information and Electronic Systems', 'predicted': 'BAE Systems Information and Electronic Systems,', 'question': 'What is the company that was awarded the contract?'}, '100002': {'truth': 'Fort Wayne, Indiana', 'predicted': 'Fort Wayne, Indiana,', 'question': 'Where is the company that was awarded the contract located?'}, '100004': {'truth': 'AN/ARC-231/A Multi-mode Aviation Radio Suite hardware components, repair services and technical/engineering/logistic support', 'predicted': 'AN/ARC-231/A Multi-mode Aviation Radio Suite hardware components, repair services and technical/engineering/logistic support.', 'question': 'What is the contract awarded for?'}, '100005': {'truth': 'determined with each order', 'predicted': 'determined with each or

In [36]:
# Make predictions with the model
to_predict = [
    {
        "context": "BAE Systems Information and Electronic Systems Integration, Merrimack, New Hampshire, has been awarded a $12,008,850 cost-plus-fixed-fee contract to support the Strategic Chaos Engine for Planning, Tactics, Experimentation and Resiliency program. Work will be performed in Arlington, Virginia (45%); Durham, North Carolina (15%); Merrimack, New Hampshire (15%); Burlington, Massachusetts (10%); and Santa Barbara, California (15%), with an expected completion date of November 2025. Fiscal 2024 research, development, test, and engineering funds in the amount of $2,029,998 are being obligated at time of award. The Defense Advanced Research Projects Agency is the contracting activity (HR001124C0422).",
        "qas": [
            {
                "question": "Where will the contract work be performed?",
                "id": "0",
            }
        ],
    }
]

answers, probabilities = model.predict(to_predict, n_best_size=2)
print(answers)

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 126.23it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 10727.12it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': '0', 'answer': ['Arlington, Virginia (45%); Durham, North Carolina (15%); Merrimack, New Hampshire (15%); Burlington, Massachusetts (10%); and Santa Barbara, California (15%),']}]
