In [1]:
!pip install -U spacy
!pip install -U spacy spacy-lookups-data  # Ensure lookups are installed
!python -m spacy download en_core_web_sm   # Download language model


Collecting spacy
  Downloading spacy-3.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.4
    Uninstalling spacy-3.7.4:
      Successfully uninstalled spacy-3.7.4
Successfully installed spacy-3.7.5


In [16]:
import spacy
from spacy.training import Example
import json

# Load your labeled data
TRAIN_DATA = [
    {"text": "Fetch Referrals From Primero using the primero adaptor", "entities": [(26, 33, "ADAPTER")]},
    {"text": "Send a text message to case officer with telerivet adaptor", "entities": [(41, 50, "ADAPTER")]},
    {"text": "Add patient to DHIS2 with the dhis2 adaptor", "entities": [(17, 22, "ADAPTER")]},
    {"text": "Get Data from DHIS2", "entities": [(14, 19, "ADAPTER")]},
    {"text": "Fetch submissions from KoboCollect with language-kobotoolbox@latest", "entities": [(29, 57, "ADAPTER")]},
    {"text": "Push the data to the a postgresSQL database with language-postgresql@latest", "entities": [(44, 71, "ADAPTER")]},
    {"text": "Send text message to an admin using language-twilio@0.3.4 with status of sent message", "entities": [(34, 54, "ADAPTER")]},
    {"text": "FHIR standard Data with change", "entities": [(0, 4, "ADAPTER")]},
    {"text": "Send to OpenHIM to route to SHR", "entities": [(8, 15, "ADAPTER")]},
    {"text": "Notify CHW on upload successful", "entities": []},
    {"text": "Fetch immunization data from OpenMRS using the openmrs adaptor.", "entities": [(39,46, "ADAPTER")]},
    {"text": "Send an appointment reminder SMS via Twilio.", "entities": [(32,38, "ADAPTER")]},
    {"text": "Update stock levels in the warehouse management system using the dynamics adaptor.", "entities": [(63,71, "ADAPTER")]},
    {"text": "Get a list of active cases from CommCare.", "entities": [(31,39, "ADAPTER")]},
    {"text": "Aggregate survey responses and create a report in Google Sheets.", "entities": [(55,67, "ADAPTER")]},
    {"text": "The workflow triggers when a new patient is registered in OpenMRS.", "entities": [(63,69, "ADAPTER")]},
    {"text": "Export the data to a CSV file on the SFTP server.", "entities": [(36,40, "ADAPTER")]},
    {"text": "Send an email alert to the supervisor using the mailgun adaptor.", "entities": [(46,53, "ADAPTER")]},
    {"text": "The job polls the inbox for new messages using the IMAP protocol.", "entities": []},
    {"text": "Retrieve patient demographics from the FHIR server.", "entities": [(35,40, "ADAPTER")]},
    {"text": "Push the updated records to the Salesforce CRM.", "entities": [(33,42, "ADAPTER")]},
    {"text": "Create a new task in Asana when a case is closed.", "entities": [(21,26, "ADAPTER")]},
    {"text": "The job runs every hour to synchronize data between OpenMRS and DHIS2.", "entities": [(43,49, "ADAPTER"), (63,68, "ADAPTER")]},
    {"text": "Store the processed data in Azure Blob Storage.", "entities": [(33,46, "ADAPTER")]},
    {"text": "Fetch Referrals From Primero using the primero adaptor.", "entities": [(29,36, "ADAPTER")]},
    {"text": "Send a text message to case officer with telerivet adaptor.", "entities": [(40,48, "ADAPTER")]},
    {"text": "Add patient to DHIS2 with the dhis2 adaptor.", "entities": [(25,30, "ADAPTER")]},
    {"text": "Get Data from DHIS2.", "entities": [(12,17, "ADAPTER")]},
    {"text": "Filter out children under 2.", "entities": []},
    {"text": "Aggregate the data.", "entities": []},
    {"text": "Make a comment on Asana.", "entities": [(18,23, "ADAPTER")]},
    {"text": "Fetch submissions from KoboCollect with language-kobotoolbox@latest.", "entities": [(27,37, "ADAPTER")]},
    {"text": "Push the data to the a postgresSQL database with language-postgresql@latest.", "entities": [(43,54, "ADAPTER")]},
    {"text": "Send text message to an admin using language-twilio@0.3.4 with status of sent message.", "entities": [(28,34, "ADAPTER")]},
    {"text": "FHIR standard Data with change.", "entities": [(0,4, "ADAPTER")]},
    {"text": "Send to OpenHIM to route to SHR.", "entities": [(6,13, "ADAPTER")]},
    {"text": "Notify CHW on upload successful.", "entities": []},

]

# Load the English model
nlp = spacy.load("en_core_web_sm")

# Get the existing NER component
ner = nlp.get_pipe("ner")

# Add your custom "ADAPTER" label
ner.add_label("ADAPTER")

# Get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

# Convert TRAIN_DATA into spaCy's Example format
examples = []
for entry in TRAIN_DATA:
    doc = nlp.make_doc(entry["text"])
    example = Example.from_dict(doc, {"entities": entry["entities"]})
    examples.append(example)

# Training loop
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(30):
        losses = {}
        nlp.update(examples, sgd=optimizer, drop=0.5, losses=losses)
        print(losses)

# Save the trained model
nlp.to_disk("openfn_adaptor_ner")




{'ner': 205.6281982064247}
{'ner': 283.8307536840439}
{'ner': 282.25145041942596}
{'ner': 280.29306530952454}
{'ner': 277.8500675559044}
{'ner': 274.01862996816635}
{'ner': 267.0271247625351}
{'ner': 256.2230362892151}
{'ner': 242.96135061979294}
{'ner': 229.35395222902298}
{'ner': 205.17218679189682}
{'ner': 165.34327974915504}
{'ner': 125.06622034311295}
{'ner': 82.66953518986702}
{'ner': 48.96430659946054}
{'ner': 26.00176438409835}
{'ner': 17.869213992076766}
{'ner': 15.286856625580754}
{'ner': 13.833499010508831}
{'ner': 14.204669126926117}
{'ner': 13.587861774294367}
{'ner': 12.913761688272542}
{'ner': 13.941888034094402}
{'ner': 13.920258282306698}
{'ner': 13.429053491580554}
{'ner': 13.482031210337368}
{'ner': 13.244044155996221}
{'ner': 13.124295731281109}
{'ner': 12.231331663773744}
{'ner': 12.844756205449812}


In [21]:
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

def evaluate(ner_model, examples):
    scorer = Scorer()
    scored_examples = [] # Empty list to store scored examples
    for example in examples:
        pred_doc = ner_model(example.reference.text)
        example.predicted = pred_doc
        scored_examples.append(example) # Append to the list
    return scorer.score(scored_examples)  # Pass the list of scored examples


# Load the evaluation data
eval_data = [
    {"text": "Fetch Referrals From Primero using the primero adaptor", "entities": [(26, 33, "ADAPTER")]},
    {"text": "Send a text message to case officer with telerivet adaptor", "entities": [(41, 50, "ADAPTER")]},
    {"text": "Add patient to DHIS2 with the dhis2 adaptor", "entities": [(17, 22, "ADAPTER")]},
    {"text": "Get Data from DHIS2", "entities": [(14, 19, "ADAPTER")]},
    {"text": "Fetch submissions from KoboCollect with language-kobotoolbox@latest", "entities": [(29, 57, "ADAPTER")]},
    {"text": "Push the data to the a postgresSQL database with language-postgresql@latest", "entities": [(44, 71, "ADAPTER")]},
    {"text": "Send text message to an admin using language-twilio@0.3.4 with status of sent message", "entities": [(34, 54, "ADAPTER")]},
    {"text": "FHIR standard Data with change", "entities": [(0, 4, "ADAPTER")]},
    {"text": "Send to OpenHIM to route to SHR", "entities": [(8, 15, "ADAPTER")]},
    {"text": "Notify CHW on upload successful", "entities": []}
]

# Load the trained model
nlp = spacy.load("openfn_adaptor_ner")

# Prepare evaluation examples
examples = []
for entry in eval_data:
    doc = nlp.make_doc(entry["text"])
    example = Example.from_dict(doc, {"entities": entry["entities"]})
    examples.append(example)

# Evaluate the model
scores = evaluate(nlp, examples)
print(scores)





{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'ents_per_type': {'ADAPTER': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}
