<a href="https://colab.research.google.com/github/Sharaborina/ChatBot/blob/main/Spacy_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Development of NER model based on pretrained Spacy model for Danish langauage and MultiWoz dataset

Here a model for extraction different entities (date, city, name of hotel, parking etc. ) is presented. For further training of Spacy model `da_core_news_lg `, the MultiWoz dataset was chosen thanks to the fact that all dialogues were marked up.

##1.1 Connection to the Google drive, installation of libraries and importing necessary libraries

In [1]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip uninstall -y  spacy --quiet
!pip install spacy --quiet
!python -m spacy download da_core_news_lg >out 2> log
!pip install dateparser deep_translator --quiet


In [3]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

import json
import random
import numpy as np
from termcolor import colored
from deep_translator import GoogleTranslator
import dateparser

##1.2 Definition of global variables 

In [4]:
# filename of the MultiWOZ dialogue dataset
ENGLISH_DATA_FILE = 'data.json'
DANISH_DATA_FILE = 'danish_data.json'
# data directory
DATA_DIR = '/content/drive/MyDrive/ColabNotebooks/MultiWoz/'
# dictionary where we will load the dialogue dataset
DIALOGUE_ENG_DB = {}
DIALOGUE_DB = {}
# vocabulary filename
VOCAB_FILE = 'en_50k_pruned.subword'
# vocabulary file directory
VOCAB_DIR = '/content/drive/MyDrive/ColabNotebooks/MultiWoz/'
VOCAB_SIZE = 50000 #33000

# load a cleaned translated train data from file DATA_DIR/TRAIN_FILENAME, 
# otherwise  generate a train data from DIALOGUE_ENG_DB
LOAD_TRAIN_DATA = False
# train filename
TRAIN_FILENAME = 'label_and_train_data.json'
# path to pretrained NER_model
NER_MODEL_DIR = '/content/drive/MyDrive/ColabNotebooks/Spacy_NER_Model'



USE_PRETRAINED_MODEL = True #use pretrained model, otherwise start from a blank model
LOAD_MODEL = True #load pretrained model
TRAIN = True
N_LAYERS = 6
TRAIN_STEPS = 100

#1.3 Loading of MultoWoz dataset, which is saved in json format

In [5]:
# help function to load a JSON file
def load_json(directory, file):
    with open(f'{directory}/{file}') as file:
        db = json.load(file)
    return db

def upload_json(directory, file, db):
    with open(f'{directory}/{file}', mode='w') as file: 
        json.dump(db, file).encode('utf8')

# load the dialogue data set into our dictionary
DIALOGUE_ENG_DB = load_json(DATA_DIR, ENGLISH_DATA_FILE)

#2.1 Development utility function for extraction conversation from the dataset and machine translation

In [6]:
def get_conversation(file, data_db):
    '''
    Args:
        file (string): filename of the dialogue file saved as json
        data_db (dict): dialogue database
    
    Returns:
        string: A string containing the 'text' fields of  data[file]['log'][x]
    '''
    
    # initialize empty string
    result = ''
    
    # get length of file's log list
    len_msg_log = len(data_db[file]['log'])
    
    # set the delimiter strings
    delimiter_1 = ' Person 1: '
    delimiter_2 = ' Person 2: '
    
    # loop over the file's log list
    for i in range(len_msg_log):
        
    ### START CODE HERE (REPLACE INSTANCES OF 'None' WITH YOUR CODE) ###
    
        # get i'th element of file log list
        cur_log = data_db[file]['log'][i]['text']
        
        # check if i is even
        if i%2 == 0:                   
            # append the 1st delimiter string
            result += delimiter_1
        else: 
            # append the 2nd delimiter string
            result += delimiter_2
        
        # append the message text from the log
        result += cur_log
    
    ### END CODE HERE ###

    return result

#Translate:
# Translate text
def translate(text, target='da'):
  if type(text) == list:
    out = []
    for t in text:
      out.append(GoogleTranslator(source='auto', target=target).translate(t))
    return out
  return GoogleTranslator(source='auto', target=target).translate(text)

def translate_db(data_db):
  for file, val in data_db.items():
    for i in range(len(data_db[file]['log'])):
      src = data_db[file]['log'][i]['text']
      # print('src=', src)
      target = GoogleTranslator(source='auto', target='da').translate(src)
      data_db[file]['log'][i]['text'] = target
      # print(target)
  return data_db

# DIALOGUE_DB = translate_db(DIALOGUE_ENG_DB)

# upload_json(DATA_DIR, DANISH_DATA_FILE, DIALOGUE_DB)

functions for string processing

In [7]:
import re
# recognize_word returns the start and end indexes of all occurence of a word in a sentence.
def recognize_word(sentence, word):
  output = []
  
  for match in re.finditer(word, sentence, flags=re.IGNORECASE):
    output.append((match.start(), match.end()))
    # print (match.start(), match.end())
  return output

# check overlaps of [start3, end3] and [[start1, end1],[start2, end2],... ] lists
def words_overlap(slice1, slice2):
  """Take two strings representing slices (e.g. 'x:y') and
  return a boolean indicating whether they overlap"""
  check = [None]*len(slice2)
  for i,item in enumerate(slice2):
    if slice1[0] < item[0]:  # slice1 is leftmost
      check[i] = item[0] < slice1[1]  # item ends before slice1 starts
    else:
      check[i] = slice1[0] < item[1]
    if check[i]:
      return True
  return False

In [8]:
# it means [0,3] overlapes with some list(s).
words_overlap([0,3],[[5,6],[7,8],[3,4]])

False

In [9]:
# it finds out the position of a word '5. januar' in a sentence 
recognize_word('Jeg vil reservere et hotel i Moskva fra 1. januar til 5. januar.', '5. januar')

[(54, 63)]

#2.2 Converting DIALOGUE_ENG_DB to TRAIN_DATA

Here the json format of MultiWoz dataset is preprocessed for training in m 

In [10]:
if LOAD_TRAIN_DATA == False:
  i = 0
  k = 0
  LABELS = []
  TRAIN_DATA = []
  for key,val in DIALOGUE_ENG_DB.items(): #Diaologue number
    try:
      for ii in range(len(DIALOGUE_ENG_DB[key]['log'])): #replica number in a dialogue 
        text = translate(DIALOGUE_ENG_DB[key]['log'][ii]['text'], target='da')
        ents_list = []
        
        ents_origin_list = DIALOGUE_ENG_DB[key]['log'][ii]['dialog_act']['Hotel-Inform']
        for ient in range(len(ents_origin_list)): # entities in a replica
          label = DIALOGUE_ENG_DB[key]['log'][ii]['dialog_act']['Hotel-Inform'][ient][0]
          
          val = translate(DIALOGUE_ENG_DB[key]['log'][ii]['dialog_act']['Hotel-Inform'][ient][1], target='da')
          start_end_list = recognize_word(text, val)
          if start_end_list != []:
            for coinc in start_end_list:
              if words_overlap(coinc, list(zip(*list(zip(*ents_list))[0:2]))) == False:
                ents_list.append((coinc[0], coinc[1], label))
          LABELS.append(label)
        TRAIN_DATA.append( (text, {'entities':list(set(ents_list))}) )
        k += 1
    except:
      pass
    i += 1

  LABELS = list(set(LABELS))
  translated_data = {'label': LABELS, 'data':TRAIN_DATA}
  # save translated preprocessed data
  upload_json(DATA_DIR, TRAIN_FILENAME, translated_data)
  print("The number of hotel dialogues:{}, the number of replicas: {}\nLabels:{}".format(i,k, LABELS))
else:
  translated_data = load_json(DATA_DIR, TRAIN_FILENAME)
  LABELS = translated_data['label']
  TRAIN_DATA = translated_data['data']

AttributeError: ignored

In [None]:
TRAIN_DATA[0:7]

#2.2 Loadng a pretrained NER model/Creation of a new NER model

In [None]:
if USE_PRETRAINED_MODEL:
  model = 'da_core_news_lg'
else:
  model = None

output_dir=Path(NER_MODEL_DIR)

#load the model
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('da')  
    print("Created blank 'da' model")

#set up the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

# Add new type of labels
for label in LABELS:
  if label not in ner.labels:
    ner.add_label(label)

In [None]:
print("Pipe line names: {},\nlabels: {}.".format(nlp.pipe_names, ner.labels))

In [None]:
def get_ents(nlp, text):
  docx = nlp(text)
  out = []
  for token in docx.ents:
      out.append((token.text, token.start_char, token.end_char,token.label_))
  return out

def print_ents(nlp, text):
  docx = nlp(text)
  for token in docx.ents:
      print("text:{}, start:{}, end:{}, label:{}".format(token.text,token.start_char, token.end_char,token.label_))

Here, we want to train the recognizer by disabling the unnecessary pipeline except for NER. The nlp_update function can be used to train the recognizer.

In [None]:
from spacy.training.example import Example

# for _, annotations in TRAIN_DATA:
#     for ent in annotations.get('entities'):
#         ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    if USE_PRETRAINED_MODEL:
      optimizer = nlp.create_optimizer()
    else:
      optimizer = nlp.begin_training()

    for itn in range(TRAIN_STEPS):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
          # create Example
          doc = nlp.make_doc(text)
          example = Example.from_dict(doc, annotations)
          nlp.update(
                [example],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)



In [None]:
#test
for text, _ in TRAIN_DATA[:20]:
    doc = nlp(text)
    print(text, 'Entities', [(ent.text, ent.label_) for ent in doc.ents])

Finally, save the model to your path which stored in the output_dir variable.

In [None]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [None]:
# get_ents(nlp, "Jeg elsker Paris, men jeg kan ikke lide Frankfurt. 5. januar skal jeg besøge. december 8skal jeg besøge")
ents = get_ents(nlp1, "Gonville hotel is in the expensive price range. Entities ")

def extract_date(ents):
  dates = []
  for ent in ents:
    parsed = dateparser.parse(ent[0])
    if parsed:
      dates.append(parsed)
  return sorted(dates)

def extract_date(ents):
  dates = []
  for ent in ents:
    parsed = dateparser.parse(ent[0])
    if parsed:
      dates.append(parsed)
  return sorted(dates)

extract_date(ents), ents

In [None]:
ents

In [None]:
# ner = nlp1.get_pipe('ner')

In [None]:
ner = nlp.get_pipe('ner')
ner.labels, nlp.pipe_names

In [None]:
# print_ents(nlp, 'Person 1: I need to book a hotel in the east that has 4 stars.')
print_ents(nlp, 'Person 1: Washington I need to book a hotel in the east that has 4 stars. I am planing to go 11.12.21')

In [None]:
print_ents(nlp1, 'I am going to go in May 2 and May 5')