In [1]:
# install huggingface datasets and pytorch-crf
! pip install datasets
! pip install pytorch-crf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collec

In [2]:
# Drive mount
from google.colab import drive
drive.mount('/content/drive')

# import pytorch, numpy libraries
import torch 
import torch.nn as nn 
import torch.optim as optim 
import numpy as np

# import huggingface's datasets libraries
from datasets import load_dataset

# import Vectors from torchtext libraries 
from torchtext.vocab import Vectors

# import copy libraries for deepcopy
import copy

# import pytorch crf
from torchcrf import CRF

Mounted at /content/drive


In [3]:
##### 개체명 인식 데이터(conll2003) 로드 및 데이터 구조 훑어보기
data = load_dataset("conll2003")

print("Data type: ", type(data))    # 데이터 타입 확인
print("Data structure: ", data)     # 데이터 구조 확인
print("Data keys: ", data.keys())   # 데이터 키 확인

print("Data 0: ", data['train'][0])   # 실제 데이터 확인
print("Data class label: ", data['train'].features['ner_tags'].feature)    # 데이터 레이블 확인 
num_labels = data['train'].features['ner_tags'].feature.num_classes   # num_labels 변수에 처리할 레이블 개수 저장 
print(num_labels)

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Data type:  <class 'datasets.dataset_dict.DatasetDict'>
Data structure:  DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
Data keys:  dict_keys(['train', 'validation', 'test'])
Data 0:  {'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}
Data class label:  ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)
9


In [4]:
##### 데이터 세트 만들기
train_data = data['train']
dev_data = data['validation']
test_data = data['test']

# 데이터에서 가장 긴 길이를 maximum sequence length로 지정
max_seq_len =  max([len(sample['tokens']) for sample in train_data])
print("max_seq_len: ", max_seq_len)

# 레이블 인덱스를 문자열 레이블로 바꾸는 딕셔너리 선언 (Ex: 3 --> B-ORG / 0 --> O / 1 --> B-PER)
label_itos = {}
for i in range(num_labels):
  label_itos[i] = data['train'].features['ner_tags'].feature.int2str(i)
print(label_itos)

max_seq_len:  113
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [9]:
### 사전학습 된 Glove 임베딩 불러오기 
embedding_model = Vectors(cache="/content/drive/MyDrive/실용자연어처리/", name="glove.6B.300d.txt")

# 앞으로 사용할 임베딩 정보 변수에 저장해놓기
emb_words = copy.deepcopy(list(embedding_model.stoi))
emb_vectors_origin = copy.deepcopy(embedding_model.vectors)
emb_dim = embedding_model.dim
emb_stoi = {}

# 임베딩 정보 출력
print("Is <pad> token in words?: ", "<pad>" in emb_words)  # 임베딩에 pad token이 있는지 확인 
print("Is <unk> token in words?: ","<unk>" in emb_words)  # 임베딩에 unk token이 있는지 확인 
print("emb_words(0~10): ", emb_words[0:10])     # 임베딩 단어의 0번부터 10번 단어 출력  
print("emb_dim: ", emb_dim)   # 임베딩 dimension 확인 

100%|█████████▉| 399999/400000 [00:46<00:00, 8574.63it/s]


Is <pad> token in words?:  False
Is <unk> token in words?:  False
emb_words(0~10):  ['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s"]
emb_dim:  300


In [10]:
### 룩업테이블 정의
# pad 토큰, unk 토큰 추가: <pad> -> 0, <unk> -> 1
emb_stoi['<pad>'] = 0
emb_stoi['<unk>'] = 1

print(len(emb_vectors_origin))    # Glove 임베딩에서 불러온 원래 임베딩에 포함된 단어 수 출력
pad_vector = torch.zeros(1, emb_dim)
unk_vector = torch.zeros(1, emb_dim)
emb_vectors = torch.cat((pad_vector, unk_vector, emb_vectors_origin), axis=0)
print(len(emb_vectors))   # Glove 임베딩에서 불러온 원래 임베딩에 포함된 단어 수 + pad 토큰, unk 토큰 더한 개수 출력

# emb_stoi: 단어가 key, 단어의 index가 value인 dictionary 생성
for i, word in enumerate(emb_words):
  emb_stoi[word] = i + 2

400000
400002


In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
### 인덱스 변환 
def text_to_index(input_data, stoi, max_seq_len):
  all_index = []

  for sample in input_data:
    index_list = []
    for word in sample['tokens']:
      word = word.lower()
      if word in stoi.keys(): # 단어가 lookup table에 검색 가능한 경우
        index_list.append(stoi[word])
      else: # 단어가 lookup table에 검색 불가능한 경우
        index_list.append(stoi['<unk>'])

    # Padding: 샘플의 길이가 고정 크기의 최대 길이 (L)보다 작은 경우, pad 인덱스를 추가
    if max_seq_len > len(index_list):
      index_list = index_list + [stoi['<pad>']] * (max_seq_len - len(index_list))
    else: # 샘플의 길이가 고정 크기의 최대 길이 (L)보다 큰 경우, 고정 크기까지만 처리
      index_list = index_list[:max_seq_len]
    all_index.append(index_list)

  return all_index

# train, dev, test data들을 index로 변환
train_index = text_to_index(train_data, emb_stoi, max_seq_len)
dev_index = text_to_index(dev_data, emb_stoi, max_seq_len)
test_index = text_to_index(test_data, emb_stoi, max_seq_len)

print(train_index[0])
print(train_index[-100])

[646, 7580, 516, 582, 6, 5262, 299, 10240, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2512, 962, 616, 11880, 8841, 25, 7031, 4108, 26, 8, 9, 293, 18585, 1735, 552, 15, 187, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [13]:
### label에 padding을 적용
def pad_labels(original_data, max_seq_len, label_pad_index):
  padded_labels = []
  
  for original_sample in original_data:
    original_label = original_sample['ner_tags']
    # Padding: 샘플의 레이블 길이가 고정 크기의 최대 길이 (L)보다 작은 경우, pad 인덱스를 추가
    if max_seq_len > len(original_label):
      new_label = original_label + [label_pad_index] * (max_seq_len - len(original_label))
    else: # 샘플의 레이블 길이가 고정 크기의 최대 길이 (L)보다 큰 경우, 고정 크기까지만 처리
      new_label = original_label[:max_seq_len]
    padded_labels.append(new_label)

  return torch.tensor(padded_labels)

# label 인덱스의 pad index는 전체 레이블 중 마지막 번호
label_pad_idx = num_labels
label_itos[label_pad_idx] = "<PAD>"

# train, dev, test data의 레이블들을 pad index가 들어간 형태로 변환
train_labels = pad_labels(train_data, max_seq_len, label_pad_idx )
dev_labels = pad_labels(dev_data, max_seq_len, label_pad_idx )
test_labels = pad_labels(test_data, max_seq_len, label_pad_idx )

# 레이블에 pad index가 잘 입력되었는지 출력 확인
print(train_labels[0])
print(train_labels[-100])

tensor([3, 0, 7, 0, 0, 0, 7, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])
tensor([5, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])


In [14]:
### NER 평가
def evaluate_iob(predictions, labels, itos):
  true_positives = 0
  all_predictions = 0
  all_answers = 0

  for preds, targets in zip(predictions, labels):
    # Get predicted named entities
    preds_string = [itos[int(index)] for index in preds]
    pred_entities = get_entities(preds_string)

    # Get target(answer) named entities
    targets_string = [itos[int(index)] for index in targets]
    target_entities = get_entities(targets_string)
    
    # Count all predictions and all targetst(answers)
    all_predictions += len(pred_entities)
    all_answers += len(target_entities)
    
    # Calculate true positives
    for entity in pred_entities:
      if entity in target_entities:
        true_positives += 1
        target_entities.remove(entity)

  # Calculate precision, recall, and F1 score
  precision = true_positives / (all_predictions + 1e-10)
  recall = true_positives / (all_answers + 1e-10)
  f1_score = 2 * (precision * recall) / (precision + recall + 1e-10)

  return precision, recall, f1_score

def get_entities(seq):
    """Gets entities from sequence.

    Args:
        seq (list): sequence of labels.

    Returns:
        list: list of (chunk_type, chunk_start, chunk_end).

    Example:
        >>> from seqeval.metrics.sequence_labeling import get_entities
        >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        >>> get_entities(seq)
        [('PER', 0, 1), ('LOC', 3, 3)]
    """
    # for nested list
    if any(isinstance(s, list) for s in seq):
        seq = [item for sublist in seq for item in sublist + ['O']]

    prev_tag = 'O'
    prev_type = ''
    begin_offset = 0
    chunks = []
    for i, chunk in enumerate(seq + ['O']):
        tag = chunk[0]
        type_ = chunk.split('-')[-1]

        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, i-1))
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = i
        prev_tag = tag
        prev_type = type_

    return chunks


def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk ended between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_end: boolean.
    """
    chunk_end = False


    if prev_tag == 'B' and tag == 'B': chunk_end = True
    if prev_tag == 'B' and tag == 'O': chunk_end = True
    if prev_tag == 'I' and tag == 'B': chunk_end = True
    if prev_tag == 'I' and tag == 'O': chunk_end = True

    if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
        chunk_end = True

    return chunk_end


def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk started between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_start: boolean.
    """
    chunk_start = False

    if tag == 'B': chunk_start = True

    if tag != 'O' and tag != '.' and prev_type != type_:
        chunk_start = True

    return chunk_start

In [21]:
### NER 모델 만들기 

class LSTMNER(torch.nn.Module):
  def __init__(self, input_size, hidden_size, output_size, word_vectors, pad_word_id, pad_label_id):
    super(LSTMNER, self).__init__()
    self.pad_word_id = pad_word_id # word embedding에서의 padding index 
    self.pad_label_id = pad_label_id # label index에서의 padding index
    ### [Step 1] 레이어 선언 부분 구현
    # 1. 임베딩 레이어 선언
    self.word_embedding = nn.Embedding.from_pretrained(embeddings=word_vectors)
    # 2. BiLSTM 레이어 선언
    self.bilstm = nn.LSTM(input_size=emb_dim, hidden_size = hidden_size, bidirectional=True, batch_first=True) # 3. 출력 Linear layer 선언
    self.linear = nn.Linear(2*hidden_size, output_size)
    # 4. 출력 CRF layer 선언
    self.crf = CRF(output_size, batch_first = True)

  def get_lstm_outputs(self, x):
    # 1. 입력 인덱스 시퀀스를 단어 임베딩으로 만들기
    embedded_sents = self.word_embedding(x)
    # 2. 임베딩 결과를 BiLSTM에 입력 & 3. BiLSTM 시퀀스에서의 출력값을 사용 
    all_hidden_states, _ = self.bilstm(embedded_sents)
    # 4. hidden state를 풀고자 하는 문제(레이블 차원의 숫자)로 변환 
    outputs = self.linear(all_hidden_states)
    # 5. padded label을 위한 후처리
    pad_mask = (x == self.pad_word_id)
    outputs[:, :, self.pad_label_id] += pad_mask * 10000
    return outputs

  def predict(self, x):
# 1. BiLSTM 결과 값 구하기
    lstm_outputs = self.get_lstm_outputs(x) #2.CRFlayer의 알고리즘에 의한 최종 출력 결과 예측 
    predicted_res = self.crf.decode(lstm_outputs)
    return predicted_res

  def get_crf_loss(self, outputs, labels):
# 1. pytorch-crf를 사용한 crf loss 계산 
    loss = self.crf(outputs, labels) 
    return -loss

In [24]:
# randomness 제거
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# 하이퍼파라미터 셋팅
input_size = 300# To do
hidden_size = 100 # hidden size
output_size =  10
learning_rate = 0.001
batch_size = 128
num_epochs = 30

# BiLSTM NER 모델 초기화
model = LSTMNER(input_size, hidden_size, output_size, emb_vectors, emb_stoi['<pad>'], label_pad_idx)
device = torch.device("cuda") # use GPU
model = model.to(device)

# optimizer 정의
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# dev_index를 dev_tensors로 사용
dev_tensors = torch.cuda.LongTensor(dev_index) 

# best f1 score 초기값 셋팅
best_f1 = 0

# 학습 시작 (총 num_epochs 만큼)
for epoch in range(num_epochs):
  # model을 학습모드로 만든다.
  model.train()
  epoch_loss = 0

  # train_index를 train_tensors로 사용   
  train_tensors = torch.cuda.LongTensor(train_index)

  # batch size 단위로 학습 진행
  for i in range(0, len(train_tensors), batch_size):

    # batch 단위 데이터 생성
    batch_data = train_tensors[i:i+batch_size] # batch size 크기의 데이터가 batch_data
    batch_labels = torch.Tensor(train_labels[i:i+batch_size]).to(device)

    # 1. Bi-LSTM 순전파 
    outputs = model.get_lstm_outputs(batch_data)
    # 2. CRF 오차 계산
    loss = model.get_crf_loss(outputs, batch_labels)
    # 3. 역전파
    optimizer.zero_grad()
    # 4. 가중치 업데이트
    loss.backward()
    optimizer.step()

    # epoch loss
    epoch_loss = loss.item()

  # 매 epoch마다 dev 성능 측정
  # 모델을 평가하는 모드로 셋팅
  model.eval()
  with torch.no_grad():
    dev_preds = model.predict(dev_tensors) # dev 데이터의 BiLSTM-CRF 결과 출력 
    dev_p, dev_r, dev_f1 = evaluate_iob(dev_preds, dev_labels, label_itos)

    # save best model on dev data
    if dev_f1 > best_f1:
      best_model = model
      best_f1 = dev_f1

    print(f"Epoch {epoch+1}, F1-score: {dev_f1} , loss: {epoch_loss/len(train_tensors)}")

  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Epoch 1, F1-score: 0.6415917842905073 , loss: 0.034096574318068514
Epoch 2, F1-score: 0.7875141883723324 , loss: 0.015570472188590556
Epoch 3, F1-score: 0.8406558875122492 , loss: 0.009899579802008404
Epoch 4, F1-score: 0.8699556602744503 , loss: 0.007264439854711203
Epoch 5, F1-score: 0.8839072147244498 , loss: 0.006124919877501602
Epoch 6, F1-score: 0.8962117855060544 , loss: 0.005350402392991952
Epoch 7, F1-score: 0.9043305346384686 , loss: 0.004718324905633502
Epoch 8, F1-score: 0.9104633863624728 , loss: 0.004304358663912827
Epoch 9, F1-score: 0.9144993676302069 , loss: 0.0038903924221921514
Epoch 10, F1-score: 0.916881266729895 , loss: 0.0035609999287800013
Epoch 11, F1-score: 0.916452933101423 , loss: 0.0034007549319849013
Epoch 12, F1-score: 0.9170705145157401 , loss: 0.002875507442489851
Epoch 13, F1-score: 0.9206661206161112 , loss: 0.002541663699166726
Epoch 14, F1-score: 0.9232367413387282 , loss: 0.002341357453172851
Epoch 15, F1-score: 0.9218613774565289 , loss: 0.0023191

In [25]:
##### 테스트 세트로 시스템 평가하기
final_model = best_model # 최고의 dev 성능을 가진 BiLSTM-CRF 모델
test_tensors = torch.cuda.LongTensor(test_index)
test_preds = model.predict(test_tensors) # 최종 모델로 test 데이터 예측
test_p, test_r, test_f1 = evaluate_iob(test_preds, test_labels, label_itos) # 정확도 측정
print("Test F1-score: {:.2f}%".format(test_f1*100)) # 정확도 출력

Test F1-score: 90.05%
