In [None]:
%pip install datasets argilla gliner -qqq

## Imports

In [1]:
import os

import argilla as rg
from argilla.client.feedback.schemas import SpanValueSchema
from datasets import load_dataset
from gliner import GLiNER

In [2]:
!pip install argilla

Collecting argilla
  Downloading argilla-1.26.0-py3-none-any.whl (420 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m420.4/420.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx<=0.26,>=0.15 (from argilla)
  Downloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecated~=1.2.0 (from argilla)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting numpy<1.24.0 (from argilla)
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
Collecting backoff (from argilla)
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting monotonic (from argilla)
  Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)
Collecting httpcore==1.* (from h

## Extract entities

In [None]:
# Fixed seed so that "Restart & Run All" always draws the same documents;
# without it every run annotates a different random sample.
SAMPLE_SEED = 42
SAMPLE_SIZE = 500

# Load the "stanford" configuration of Cosmopedia (train split) ...
dataset = load_dataset("HuggingFaceTB/cosmopedia", "stanford", split="train")
# ... and keep a reproducible random subset of SAMPLE_SIZE rows.
sample = dataset.shuffle(seed=SAMPLE_SEED).select(range(SAMPLE_SIZE))

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

In [None]:
# Entity types to look for; the same list is reused later as the label set
# of the span question in the Argilla dataset.
labels = ["people", "location", "topic", "event"]

# Pretrained GLiNER checkpoint used to pre-annotate the sample.
model = GLiNER.from_pretrained("urchade/gliner_base")

# Predict entities for every sampled text in one batched call.
# NOTE(review): presumably returns one list of span dicts per input text, in
# input order -- the record-building cell relies on that when zipping.
entities = model.batch_predict_entities(sample["text"], labels)

# Show the predictions for the first document as a quick sanity check.
entities[0]

## Create Argilla dataset


Setup

In [None]:
# Connect the Argilla client to the server.
# The URL and API key are taken from the ARGILLA_API_URL / ARGILLA_API_KEY
# environment variables so real credentials never have to be saved in the
# notebook. The original placeholder values are kept as fallbacks, so the
# cell behaves exactly as before when the variables are not set.
rg.init(
    api_url=os.environ.get("ARGILLA_API_URL", "<your_argilla_url>"),
    api_key=os.environ.get("ARGILLA_API_KEY", "admin.apikey"),
)

# Uncomment to delete a previously pushed "cosmopedia ner" dataset from the
# "admin" workspace before pushing a fresh one:
# rg.FeedbackDataset.from_argilla("cosmopedia ner", workspace="admin").delete()

# Local dataset definition: two text fields shown to the annotator, a span
# question over the "text" field plus a thumbs up/down quality question, and
# two metadata properties.
rg_dataset = rg.FeedbackDataset(
    fields=[
        rg.TextField(name="text", title="Synthetic text"),
        rg.TextField(name="prompt", title="Prompt to generate text"),
    ],
    questions=[
        rg.SpanQuestion(
            name="entities",
            title="Highlight the entities in the text",
            field="text",
            labels=labels,
        ),
        rg.LabelQuestion(name="quality", labels=["👍", "👎"]),
    ],
    metadata_properties=[
        rg.TermsMetadataProperty(name="audience"),
        rg.IntegerMetadataProperty(name="text_length"),
    ],
)



Build records and create dataset

In [None]:
def build_record(row, spans):
    """Build one Argilla FeedbackRecord from a sampled row and its spans.

    Args:
        row: A row of `sample`; its "text", "prompt" and "audience" entries
            are read.
        spans: The predicted entities for that row; each item is indexed by
            "start", "end" and "label".

    Returns:
        An `rg.FeedbackRecord` whose "entities" question carries the
        predicted spans as a suggestion attributed to "urchade/gliner_base".
    """
    return rg.FeedbackRecord(
        fields={"text": row["text"], "prompt": row["prompt"]},
        suggestions=[
            {
                "question_name": "entities",
                "value": [
                    SpanValueSchema(
                        start=span["start"],
                        end=span["end"],
                        label=span["label"],
                    )
                    for span in spans
                ],
                "agent": "urchade/gliner_base",
            }
        ],
        metadata={
            "text_length": len(row["text"]),
            "audience": row["audience"],
        },
    )


# Pair each sampled row with its predictions (same position in both sequences).
records = [build_record(row, spans) for row, spans in zip(sample, entities)]

In [None]:
# Attach the prepared records (with their entity suggestions) to the local
# dataset definition.
rg_dataset.add_records(records)

# Upload the dataset to the Argilla server under the "admin" workspace.
rg_dataset.push_to_argilla(
    name="cosmopedia ner",
    workspace="admin",
)