# Named Entity Recognition: RNN vs Pre-Trained Transformer

## Setup

In [58]:
# Data
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# GloVe
import gensim.downloader

# DL
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Hugging Face
from datasets import load_dataset
from datasets import Dataset

## 1 RNN

### 1.1 Data Exploration & Analysis

In [None]:
# Load the pre-trained "glove-wiki-gigaword-300" word vectors through gensim's
# downloader API. NOTE(review): presumably downloaded on first use and served
# from a local cache afterwards - confirm before relying on offline runs.
glove_embeddings = gensim.downloader.load("glove-wiki-gigaword-300")

In [None]:
# Sanity check: (number of vectors in the model, dimensionality of each vector).
len(glove_embeddings), glove_embeddings.vector_size

(400000, 300)

In [None]:
# Load the "conll2003" dataset via Hugging Face `datasets`; the result is indexed
# by split name ('train', 'validation', 'test') in the cells below.
conll2003_dataset = load_dataset("conll2003")

Downloading builder script: 100%|██████████| 9.57k/9.57k [00:00<00:00, 9.56MB/s]
Downloading metadata: 100%|██████████| 3.73k/3.73k [00:00<00:00, 1.80MB/s]
Downloading readme: 100%|██████████| 12.3k/12.3k [00:00<00:00, 6.17MB/s]


Downloading and preparing dataset conll2003/conll2003 to C:/Users/markk/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data: 100%|██████████| 983k/983k [00:00<00:00, 2.29MB/s]
                                                                                         

Dataset conll2003 downloaded and prepared to C:/Users/markk/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 70.49it/s]


In [None]:
# Number of examples in each split, in the order the splits are stored.
[len(split) for split in conll2003_dataset.values()]

[14041, 3250, 3453]

In [None]:
# Inspect the feature schema of the training split (the 'ner_tags' entry is used
# below to recover the tag names).
# NOTE(review): the saved output under this cell ("9 9") does not look like a
# features repr - re-run the cell to refresh it.
conll2003_dataset['train'].features

9
9


In [None]:
# Tag names as declared by the 'ner_tags' feature; a tag's id is its position in
# this list.
ner_tag_names = conll2003_dataset['train'].features['ner_tags'].feature.names

# id -> tag first, then invert it to obtain tag -> id.
id2tag = dict(enumerate(ner_tag_names))
tag2id = {tag: idx for idx, tag in id2tag.items()}

tag2id, id2tag

({'O': 0,
  'B-PER': 1,
  'I-PER': 2,
  'B-ORG': 3,
  'I-ORG': 4,
  'B-LOC': 5,
  'I-LOC': 6,
  'B-MISC': 7,
  'I-MISC': 8},
 {0: 'O',
  1: 'B-PER',
  2: 'I-PER',
  3: 'B-ORG',
  4: 'I-ORG',
  5: 'B-LOC',
  6: 'I-LOC',
  7: 'B-MISC',
  8: 'I-MISC'})

In [None]:
# Count the distinct tokens of the train and validation splits (one number is
# printed per split, in that order).
#
# A set is grown in place with `update`, so each sentence costs time proportional
# to its own length. The previous version rebuilt `list(set(all_tokens + ...))`
# for every sentence, which re-hashed the whole vocabulary each time (quadratic).
for split_name in ('train', 'validation'):
    unique_tokens = set()
    for sequence in conll2003_dataset[split_name]:
        unique_tokens.update(sequence['tokens'])

    # Kept as a list under the original name; after the loop it holds the
    # distinct tokens of the last split processed (validation), as before.
    all_tokens = list(unique_tokens)
    print(len(all_tokens))

In [45]:
max_len = 500  # default number of leading examples taken from every split


def get_keys_from_DataSet(DS_NAME, keys=('tokens', 'ner_tags'), limit=None, dataset=None):
    """Return the requested columns for the first rows of one dataset split.

    Args:
        DS_NAME: Split name used to index the dataset, e.g. 'train'.
        keys: Column names to extract; the result follows this order.
        limit: Number of leading rows to keep. Defaults to the notebook-level
            ``max_len``.
        dataset: Mapping from split name to a sliceable split. Defaults to the
            notebook-level ``conll2003_dataset``.

    Returns:
        A list with one entry per name in ``keys``, each entry being that
        column restricted to the first ``limit`` rows.

    Raises:
        KeyError: If a requested column is missing from the sliced split
            (previously this silently produced ``None``).
    """
    if limit is None:
        limit = max_len
    if dataset is None:
        dataset = conll2003_dataset

    # Slice the split once and reuse it for every key instead of re-slicing
    # (and re-materialising the rows) for each requested column.
    head = dataset[DS_NAME][:limit]
    return [head[key] for key in keys]
 
# Split data: fetch the (tokens, ner_tags) pair for each split in one pass.
(train_data, train_label), (val_data, val_label), (test_data, test_label) = (
    get_keys_from_DataSet(split_name)
    for split_name in ('train', 'validation', 'test')
)

### 1.2 Dataset & Dataloader

In [51]:
# Every token occurrence in the training sentences, plus an '<unk>' placeholder
# entry.
vocab = [token for line in train_data for token in line]
vocab += ['<unk>']

# Create dictionaries to convert between tokens and indices.
# Both tables are derived from ONE sorted list of distinct tokens: this makes
# them exact inverses of each other by construction and keeps the indices
# identical across kernel restarts (plain set iteration order for strings changes
# from run to run).
unique_tokens = sorted(set(vocab))
token_to_index = {tok: i for i, tok in enumerate(unique_tokens)}
index_to_token = dict(enumerate(unique_tokens))

In [63]:
def ToDataset(data, label):
    """Build a `Dataset` with two columns from parallel sequences.

    Args:
        data: Values stored under the "input" column.
        label: Values stored under the "labels" column.

    Returns:
        The object produced by ``Dataset.from_dict`` for those two columns.
    """
    columns = {"input": data, "labels": label}
    return Dataset.from_dict(columns)
    
# Wrap each (inputs, labels) pair into its own dataset object.
train_dataset, val_dataset, test_dataset = (
    ToDataset(inputs, labels)
    for inputs, labels in (
        (train_data, train_label),
        (val_data, val_label),
        (test_data, test_label),
    )
)

In [66]:
def ToDataloader(Dataset, batch_size = 2, shuffle = False, num_workers = 4):
    """Wrap a dataset in a ``torch.utils.data.DataLoader``.

    Args:
        Dataset: The dataset to iterate over. The parameter name is kept for
            backward compatibility; inside this function it shadows the
            imported ``datasets.Dataset`` class.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples.
        num_workers: Number of worker processes used for loading
            (0 loads in the calling process).

    Returns:
        A ``DataLoader`` over the dataset that was passed in.
    """
    # Use the argument itself: the previous version always wrapped the global
    # `train_dataset`, so every loader iterated the training data.
    # NOTE(review): no collate_fn is supplied, so default collation is used;
    # variable-length token lists may need a custom collate_fn - confirm.
    return DataLoader(
        dataset=Dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )


# Call ToDataloader once per dataset object, keeping its default batch size,
# ordering and worker count.
train_dataloader, val_dataloader, test_dataloader = map(
    ToDataloader, (train_dataset, val_dataset, test_dataset)
)

### 1.3 Model 

#### 1.3.1 Definition

#### 1.3.2 Experiments

#### 1.3.3 Evaluation

## 2 Pre-Trained Transformer

### 2.1 Dataset & Dataloader

### 2.2 Fine-Tuning

### 2.3 Comparison to RNN

#### 2.3.1 Setups

#### 2.3.2 Approaches

#### 2.3.3 Error Analysis

## 3 Conclusions