In [None]:
!python3 -m pip install spacy==3.7.4

In [None]:
# Download the small English pipeline; it is loaded further down with
# spacy.load("en_core_web_sm") for tokenization and as a baseline NER model.
!python -m spacy download en_core_web_sm

In [None]:
!python -m pip install nertk==0.0.4

In [4]:
import spacy

# Pipeline used to split the raw comments into tokens for annotation.
# NOTE(review): "tokenizer" is not one of the pipeline component names, so
# enable="tokenizer" presumably leaves every component disabled and only the
# tokenizer runs -- confirm against the spacy.load documentation.
base_model_name = "en_core_web_sm"
nlp = spacy.load(base_model_name, enable="tokenizer")

In [None]:
!mkdir data
!wget -P data https://raw.githubusercontent.com/explosion/projects/v3/tutorials/ner_fashion_brands/assets/fashion_brands_training.jsonl
!wget -P data https://raw.githubusercontent.com/explosion/projects/v3/tutorials/ner_fashion_brands/assets/fashion_brands_eval.jsonl

In [None]:
!mkdir scripts
!wget -P scripts https://raw.githubusercontent.com/explosion/projects/v3/tutorials/ner_fashion_brands/scripts/preprocess.py

In [7]:
import srsly
from pprint import pprint

training_path = "data/fashion_brands_training.jsonl"

# Show only the first record of the training file to illustrate its structure.
# A private sentinel distinguishes "file has no rows" from any real row value,
# in which case nothing is printed.
_no_row = object()
first_row = next(iter(srsly.read_jsonl(training_path)), _no_row)
if first_row is not _no_row:
    pprint(first_row)

{'_input_hash': 1653937787,
 '_session_id': None,
 '_task_hash': -1474793941,
 '_view_id': 'ner_manual',
 'answer': 'accept',
 'meta': {'section': 'malefashionadvice'},
 'spans': [],
 'text': "It's all preference for which looks better, personally I feel that "
         'the more natural the hair looks the better the style, which for me '
         'means going with a matte finish which leaves the hair looking as '
         'natural as possible while still holding it in place',
 'tokens': [{'end': 2, 'id': 0, 'start': 0, 'text': 'It'},
            {'end': 4, 'id': 1, 'start': 2, 'text': "'s"},
            {'end': 8, 'id': 2, 'start': 5, 'text': 'all'},
            {'end': 19, 'id': 3, 'start': 9, 'text': 'preference'},
            {'end': 23, 'id': 4, 'start': 20, 'text': 'for'},
            {'end': 29, 'id': 5, 'start': 24, 'text': 'which'},
            {'end': 35, 'id': 6, 'start': 30, 'text': 'looks'},
            {'end': 42, 'id': 7, 'start': 36, 'text': 'better'},
            {'end

In [8]:
# One list of token strings per training comment, in file order. This is the
# input handed to the nertk annotator in the next cell.
nertk_input_text = [
    [token.text for token in nlp(row["text"])]
    for row in srsly.read_jsonl(training_path)
]

In [9]:
from nertk import Entator

# Labels offered by the annotation widget, passed in this order.
entity_labels = ['None', 'FASHION_BRAND']

# Build the annotator over the tokenized comments and start it; the per-token
# labels are read back from `annotator.targets` in the following cell.
annotator = Entator(labels=entity_labels, inputs=nertk_input_text)

annotator.run()

VBox(children=(Label(value='Current Label: FASHION_BRAND'), HBox(children=(Button(description='None', layout=L…

Output()

In [None]:
from itertools import islice

from spacy.tokens import DocBin, Span
from utils import create_consecutive_token_sequences

# Label assigned to every span created from the manual annotations.
ENTITY_LABEL = "FASHION_BRAND"
# Only the first few comments are converted; raise this to include more of the
# annotated documents.
MAX_ANNOTATED_DOCS = 5

# Convert the nertk token-level annotations into spaCy Docs with entity spans
# and serialize them to disk as a DocBin.
db = DocBin()
annotated_rows = zip(srsly.read_jsonl(training_path), annotator.targets)
for row, nertk_entities in islice(annotated_rows, MAX_ANNOTATED_DOCS):
    doc = nlp(row["text"])
    # Positions of the tokens that were tagged as a fashion brand.
    entity_token_indexes = [
        index for index, tag in enumerate(nertk_entities) if tag == ENTITY_LABEL
    ]
    # The helper groups the tagged positions into (start, end) pairs.
    # NOTE(review): `end` is treated as inclusive here (hence end + 1 for the
    # Span end) -- confirm against utils.create_consecutive_token_sequences.
    doc.ents = [
        Span(doc, start, end + 1, label=ENTITY_LABEL)
        for start, end in create_consecutive_token_sequences(entity_token_indexes)
    ]
    db.add(doc)
db.to_disk("data/nertk_training.spacy")

In [11]:
# Generate cpu_config.cfg: an English pipeline with a single "ner" component,
# optimized for efficiency (the output below reports Hardware: CPU).
# NOTE(review): spaCy may refuse to overwrite an existing cpu_config.cfg on a
# re-run unless a force flag is passed -- confirm against the CLI docs.
!python -m spacy init config cpu_config.cfg --lang "en" --pipeline "ner" --optimize "efficiency"

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
cpu_config.cfg
You can now add your data and train your pipeline:
python -m spacy train cpu_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
# Run the downloaded preprocess.py on the training JSONL; it writes
# ./data/fashion_brands_training.spacy, which is passed to `spacy train` below.
!python ./scripts/preprocess.py ./data/fashion_brands_training.jsonl ./data/fashion_brands_training.spacy

Processed 1235 documents: fashion_brands_training.spacy


In [13]:
# Same conversion for the evaluation JSONL; the resulting
# ./data/fashion_brands_eval.spacy is used as the dev set and for evaluation.
!python ./scripts/preprocess.py ./data/fashion_brands_eval.jsonl ./data/fashion_brands_eval.spacy

Processed 500 documents: fashion_brands_eval.spacy


In [14]:
# Train with the CPU config on the converted tutorial data. Results are saved
# under training_cpu/; the evaluation cell reads training_cpu/model-best.
!python -m spacy train cpu_config.cfg --output training_cpu/ --paths.train ./data/fashion_brands_training.spacy --paths.dev ./data/fashion_brands_eval.spacy

[38;5;2m✔ Created output directory: training_cpu[0m
[38;5;4mℹ Saving to output directory: training_cpu[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     46.17    1.38    0.70   55.88    0.01
  0     200         31.13    824.92   25.08   53.42   16.39    0.25
  0     400         66.06    342.21   47.71   55.93   41.60    0.48
  1     600         71.70    233.36   55.26   66.08   47.48    0.55
  1     800        195.00    219.52   58.55   66.14   52.52    0.59
  2    1000        157.87    162.29   59.58   66.15   54.20    0.60
  3    1200      25495.90    279.75   62.44   74.42   53.78    0.62
  4    1400        622.63    112.01   64.92   75.14   57.14 

In [15]:
# Score the best CPU-trained model on the evaluation set and save the metrics
# to training_cpu/metrics.json.
!python -m spacy evaluate training_cpu/model-best ./data/fashion_brands_eval.spacy --output training_cpu/metrics.json

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     99.98
NER P   75.14
NER R   57.14
NER F   64.92
SPEED   27306

[1m

                    P       R       F
FASHION_BRAND   75.14   57.14   64.92

[38;5;2m✔ Saved results to training_cpu/metrics.json[0m


In [16]:
# Generate gpu_config.cfg: English "ner" pipeline optimized for accuracy, for
# GPU hardware. As the output below warns, spacy-transformers is not installed
# at this point (it is installed in a later cell), so the generated config does
# not use a transformer.
!python -m spacy init config gpu_config.cfg -l "en" -p "ner" --optimize "accuracy" --gpu

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: GPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
gpu_config.cfg
You can now add your data and train your pipeline:
python -m spacy train gpu_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m pip install spacy-transformers==1.3.5

In [None]:
# Download the large English pipeline.
# NOTE(review): presumably required by the accuracy-optimized gpu_config.cfg
# (e.g. for its word vectors) -- confirm against the generated config.
!python -m spacy download en_core_web_lg

In [23]:
# Train with the GPU config on GPU 0, using the same training/dev data as the
# CPU run. Results are saved under training_gpu/.
!python -m spacy train gpu_config.cfg --output training_gpu/ --paths.train ./data/fashion_brands_training.spacy --paths.dev ./data/fashion_brands_eval.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: training_gpu[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     46.17    1.53    0.77   60.08    0.02
  0     200          7.50    592.19   51.11   45.70   57.98    0.51
  0     400        206.80    387.08   17.20   58.54   10.08    0.17
  1     600         34.33    319.89   60.75   68.42   54.62    0.61
  1     800        130.67    315.17   66.52   68.92   64.29    0.67
  2    1000       1692.92    279.95   59.50   54.77   65.13    0.60
  3    1200       3226.35    288.68   67.52   68.70   66.39    0.68
  4    1400      22818.35    286.93   67.49   66.53   68.49    0.67
  6    1600       5428.51  

In [24]:
# Score the best GPU-trained model on the evaluation set (on GPU 0) and save
# the metrics to training_gpu/metrics.json.
!python -m spacy evaluate training_gpu/model-best ./data/fashion_brands_eval.spacy --output training_gpu/metrics.json --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[1m

TOK     99.98
NER P   79.27
NER R   64.29
NER F   71.00
SPEED   11122

[1m

                    P       R       F
FASHION_BRAND   79.27   64.29   71.00

[38;5;2m✔ Saved results to training_gpu/metrics.json[0m


In [25]:
from spacy import displacy

# Rebind `nlp` to the best checkpoint of the GPU training run and visualize the
# entities it predicts for a sample sentence.
gpu_model_dir = 'training_gpu/model-best'
nlp = spacy.load(gpu_model_dir)

sentence = "Givenchy is looking at buying U.K. startup for $1 billion"
doc = nlp(sentence)

render_options = {"style": "ent", "jupyter": True}
displacy.render(doc, **render_options)



In [26]:
# Baseline for comparison: run the same sentence through the stock small
# English pipeline and render the entities it finds.
sentence = "Givenchy is looking at buying U.K. startup for $1 billion"

nlp = spacy.load('en_core_web_sm')
doc = nlp(sentence)
displacy.render(doc, jupyter=True, style="ent")

In [None]:
# Package the best GPU-trained model into the current directory under the name
# "ner_fashion_brands"; the next cell installs the resulting
# en_ner_fashion_brands-0.0.0/ directory.
# NOTE(review): re-running may fail if the package directory already exists --
# confirm whether a force flag is needed.
!python -m spacy package training_gpu/model-best ./ --name "ner_fashion_brands"

In [None]:
!python -m pip install en_ner_fashion_brands-0.0.0/

In [29]:
import en_ner_fashion_brands
from spacy import displacy

# Load the pipeline through the installed package's own load() entry point and
# render its entity predictions for the sample sentence.
sentence = "Givenchy is looking at buying U.K. startup for $1 billion"
nlp = en_ner_fashion_brands.load()

doc = nlp(sentence)
displacy.render(doc, jupyter=True, style="ent")

In [38]:
# Assemble a pipeline from combined_ner.cfg into
# pipelines/fashion_ner_with_base_entities (loaded in the next cell).
# NOTE(review): combined_ner.cfg is not created by any cell visible in this
# notebook; it has to exist in the working directory before this cell runs.
!python -m spacy assemble combined_ner.cfg pipelines/fashion_ner_with_base_entities

[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Initialized pipeline[0m
[1m


In [39]:
import spacy
from spacy import displacy

# Load the assembled pipeline from disk and render the entities it predicts for
# the same sample sentence used in the earlier cells.
combined_pipeline_dir = "pipelines/fashion_ner_with_base_entities"
nlp = spacy.load(combined_pipeline_dir)

sentence = "Givenchy is looking at buying U.K. startup for $1 billion"
doc = nlp(sentence)

render_options = {"style": "ent", "jupyter": True}
displacy.render(doc, **render_options)