In [None]:
# Fill in base_config.cfg (read from the working directory, so it must already
# exist there) and write the result to config.cfg, which the training cell uses.
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
from sklearn.model_selection import train_test_split
import spacy
from spacy.tokens import DocBin
import json

# Blank English pipeline (no trained components); convert_to_docbin below uses it
# only to tokenize raw text into Doc objects via nlp.make_doc.
nlp = spacy.blank("en")

# Each non-empty line of ner_data.jsonl is one JSON record. Whitespace-only lines
# (e.g. a trailing newline at the end of the file) are skipped rather than
# passed to json.loads, which would raise JSONDecodeError on them.
with open("ner_data.jsonl", "r", encoding="utf8") as f:
    examples = [json.loads(line) for line in f if line.strip()]

# 80/20 train/dev split; the fixed random_state makes the split reproducible.
train_data, dev_data = train_test_split(examples, test_size=0.2, random_state=42)

def convert_to_docbin(data, nlp):
    """Convert annotated examples into a spaCy ``DocBin``.

    Args:
        data: Iterable of dicts, each with a ``"text"`` string and an
            ``"entities"`` sequence of ``(start, end, label)`` triples, where
            ``start`` and ``end`` are character offsets into ``"text"``.
        nlp: Pipeline whose ``make_doc`` is used to tokenize each text.

    Returns:
        A ``DocBin`` containing one ``Doc`` per example with ``doc.ents`` set.

    Warns:
        UserWarning: Emitted once per call when at least one annotation was
            left out, either because its offsets do not line up with token
            boundaries or because it overlapped/duplicated another entity.
    """
    import warnings  # local import so the function does not depend on another cell

    db = DocBin()
    n_docs = 0
    n_misaligned = 0
    n_overlapping = 0
    for example in data:
        doc = nlp.make_doc(example["text"])
        spans = []
        for start, end, label in example["entities"]:
            span = doc.char_span(start, end, label=label)
            # char_span returns None when the offsets do not fall on token
            # boundaries; such annotations cannot be represented as entities.
            if not span:
                n_misaligned += 1
                continue
            spans.append(span)
        # doc.ents rejects overlapping spans with an error. filter_spans drops
        # duplicates/overlaps, preferring the (first) longest span, so a single
        # conflicting annotation no longer aborts the whole conversion.
        ents = spacy.util.filter_spans(spans)
        n_overlapping += len(spans) - len(ents)
        doc.ents = ents
        db.add(doc)
        n_docs += 1

    if n_misaligned or n_overlapping:
        warnings.warn(
            f"convert_to_docbin: dropped {n_misaligned} misaligned and "
            f"{n_overlapping} overlapping entity annotation(s) across "
            f"{n_docs} document(s).",
            stacklevel=2,
        )
    return db

# Serialize each split to the .spacy file that the training command reads.
for split_examples, out_path in ((train_data, "train.spacy"), (dev_data, "dev.spacy")):
    convert_to_docbin(split_examples, nlp).to_disk(out_path)

print("✅ train.spacy and dev.spacy created.")

In [None]:
# Train with config.cfg on the train.spacy / dev.spacy corpora written by the
# previous cell, using GPU 0. Results are saved under ./output; the next cell
# loads ./output/model-best from there.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     50.79    0.00    0.00    0.00    0.00
  0     200        350.77   3015.12   56.66   58.81   54.66    0.57
  0     400        203.65   2115.60   68.99   71.95   66.26    0.69
  0     600        220.82   1969.16   74.00   75.15   72.89    0.74
  0     800        294.96   2209.38   75.17   78.26   72.31    0.75
  0    1000        350.64   2566.09   76.39   77.50   75.30    0.76
  0    1200        387.80   3005.93   78.17   79.23   77.14    0.78
  0    1400        521.67   3489.79   79.38   80.27   78.51    0.79
  0    1600        565.86   4052.01   79.98   80.27   79.69    0.80
  0    1800        698.55   4787.50   80.18

In [None]:
import spacy
# Load the `model-best` pipeline from ./output, the directory passed as --output
# to the `spacy train` command in the training cell.
nlp_trained = spacy.load("./output/model-best")

In [None]:
# Smoke test: run the trained pipeline on one sample sentence.
text = "Steve Jobs founded Apple in California."
doc = nlp_trained(text)

# Print each predicted entity as "<entity text> <label>", one per line.
for entity in doc.ents:
    print(entity.text, entity.label_)


Steve Jobs per
Apple org
California geo


To learn more about accelerating pandas on Colab, see the [10-minute guide](https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/getting_started_tutorials/cudf_pandas_colab_demo.ipynb) or the [US stock market data analysis demo](https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/getting_started_tutorials/cudf_pandas_stocks_demo.ipynb).