<a href="https://colab.research.google.com/github/ShibusawaShunya/NLP.Ginza/blob/main/KoyuTyushutu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ginza==4.0.5

In [3]:
# Named-entity recognition with the 'ja_ginza' model

import spacy
from spacy import displacy

nlp = spacy.load('ja_ginza')
doc = nlp('山田さんと銀座でランチをご一緒しましょう。')

# One line per entity: text , label , start offset , end offset
for entity in doc.ents:
  print(entity.text, entity.label_, entity.start_char, entity.end_char, sep=' , ')

# Render the recognised entities inline in the notebook output.
displacy.render(doc, style='ent', jupyter=True)

山田 , Person , 0 , 2
さん , Title_Other , 2 , 4
銀座 , City , 5 , 7


In [None]:
#固有表現抽出モデルの学習

%%time
import spacy
import random

def train_ner(train_data, epoch):
  """Train an NER component in a blank Japanese spaCy pipeline.

  Uses the spaCy v2-style training calls (`create_pipe`, `begin_training`,
  `nlp.update` with raw texts and annotation dicts).

  Args:
    train_data: Iterable of `(text, annotations)` pairs. `annotations` is a
      dict; its optional `'entities'` value is a list of tuples whose third
      element is the entity label (the data in this notebook uses
      `(start_char, end_char, label)`). The iterable is copied, so the
      caller's object is never reordered.
    epoch: Number of passes over the training examples.

  Returns:
    The trained `nlp` pipeline object.
  """
  # Materialise once: allows generators/tuples and keeps the shuffle below
  # from mutating the caller's list.
  examples = list(train_data)

  nlp = spacy.blank('ja')

  if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
  else:
    # Keep `ner` bound even if the pipeline already provides the component.
    ner = nlp.get_pipe('ner')

  # Register every label that occurs in the annotations.
  for _, annotations in examples:
    for ent in annotations.get('entities', []):
      ner.add_label(ent[2])

  # Only the NER component should be updated during training.
  other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
  with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    for itn in range(epoch):
      random.shuffle(examples)

      losses = {}
      for text, annotations in examples:
        nlp.update([text], [annotations], drop=0.2, sgd=optimizer, losses=losses)
      # .get(): with no examples nothing was updated, so 'ner' may be absent.
      print('iteration{}: {:.8f}'.format(itn, losses.get('ner', 0.0)))
  return nlp

# Hand-labelled training sentences. Each entity is (start_char, end_char, label);
# every span below covers one of the names 'サツキ' or 'メイ', and all of them
# use the same 'Person' label.
train_data = [
  ('入院している母のお見舞いに行ったサツキとメイはオバケ屋敷のことを報告した。',
   {'entities': [(16, 19, 'Person'), (20, 22, 'Person')]}),
  ('サツキとメイが森のバス停で雨の中父の帰りを待っている。',
   {'entities': [(0, 3, 'Person'), (4, 6, 'Person')]}),
  ('一人で遊んでいたメイは庭で不思議な生き物を見つけた。',
   {'entities': [(8, 10, 'Person')]}),
  ('人が住み始めるといつのまにかいなくなるという話を聞いてサツキは拍子抜けした。',
   {'entities': [(27, 30, 'Person')]}),
]

# Train for 50 epochs, then write the resulting pipeline to the 'ner_model' directory.
nlp = train_ner(train_data, epoch=50)
nlp.to_disk('ner_model')

In [3]:
# Named-entity extraction using the model trained above

doc = nlp('サツキと妹のメイは、母の療養のために父と一緒に農村へ引っ越してきた。')

# One line per entity: text , label , start offset , end offset
for ent in doc.ents:
  print(ent.text, ent.label_, ent.start_char, ent.end_char, sep=' , ')

In [None]:
#wikipediaを用いた固有表現抽出モデルの学習

%%time
import spacy
import random

# NOTE(review): train_ner is also defined in an earlier cell; this later
# definition shadows it. Consider keeping a single definition.
def train_ner(train_data, epoch):
  """Train an NER component in a blank Japanese spaCy pipeline.

  Uses the spaCy v2-style training calls (`create_pipe`, `begin_training`,
  `nlp.update` with raw texts and annotation dicts).

  Args:
    train_data: Iterable of `(text, annotations)` pairs. `annotations` is a
      dict; its optional `'entities'` value is a list of tuples whose third
      element is the entity label (the data in this notebook uses
      `(start_char, end_char, label)`). The iterable is copied, so the
      caller's object is never reordered.
    epoch: Number of passes over the training examples.

  Returns:
    The trained `nlp` pipeline object.
  """
  # Materialise once: allows generators/tuples and keeps the shuffle below
  # from mutating the caller's list.
  examples = list(train_data)

  nlp = spacy.blank('ja')

  if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
  else:
    # Keep `ner` bound even if the pipeline already provides the component.
    ner = nlp.get_pipe('ner')

  # Register every label that occurs in the annotations.
  for _, annotations in examples:
    for ent in annotations.get('entities', []):
      ner.add_label(ent[2])

  # Only the NER component should be updated during training.
  other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
  with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    for itn in range(epoch):
      random.shuffle(examples)

      losses = {}
      for text, annotations in examples:
        nlp.update([text], [annotations], drop=0.2, sgd=optimizer, losses=losses)
      # .get(): with no examples nothing was updated, so 'ner' may be absent.
      print('iteration{}: {:.8f}'.format(itn, losses.get('ner', 0.0)))
  return nlp

import json
# Maps the Japanese entity-type names found in ner.json to the English labels
# used for training. An entity type missing from this table raises KeyError.
labels = {
    '人名': 'Person',
    '法人名': 'Juridical_Person',
    '政治的組織名': 'Political_Organization',
    'その他の組織名': 'Organization_Other',
    '地名': 'Location',
    '施設名': 'Facility',
    '製品名': 'Product',
    'イベント名': 'Event',
}

# Read with a context manager so the file handle is closed, and with an explicit
# encoding so the Japanese text decodes the same way on every platform.
# NOTE(review): assumes ner.json is UTF-8 encoded — confirm against the dataset.
with open('ner.json', 'r', encoding='utf-8') as ner_file:
  json_data = json.load(ner_file)

# Convert each record into the (text, {'entities': [(start, end, label), ...]})
# shape consumed by train_ner. Each record is read via its 'text' and 'entities'
# keys, and each entity via 'span' (first two elements) and 'type'.
train_data = []
for data in json_data:
  text = data['text']
  entities = data['entities']
  value = []
  for entity in entities:
    span = entity['span']
    label = labels[entity['type']]
    value.append((span[0], span[1], label))
  train_data.append((text, {'entities': value}))

# Train for 50 epochs, then write the resulting pipeline to the 'ner_model' directory.
nlp = train_ner(train_data, epoch=50)
nlp.to_disk('ner_model')