In [7]:
from datasets import load_dataset

conll_dataset = load_dataset("conll2003")

In [8]:
GLOVE_FILEPATH = 'glove.6B/glove.6B.50d.txt'
EMBEDDING_DIM = 50

In [9]:
# Code to find max length.
def find_max(dataset,type='train'):
    lengths = []
    for i in range(len(dataset[type])):
        lengths.append(len(dataset[type][i]['tokens']))
    return max(lengths)

print(f'Max sequence length in train set: {find_max(conll_dataset,"train")}')
print(f'Max sequence length in validation set: {find_max(conll_dataset,"validation")}')
print(f'Max sequence length in test set: {find_max(conll_dataset,"test")}')

Max sequence length in train set: 113
Max sequence length in validation set: 109
Max sequence length in test set: 124


In [10]:
# Create a set of tokens
def create_set(dataset,type='train'):
    corpus = []
    for i in range(len(dataset[type])):
        corpus += dataset[type][i]['tokens']
    return set(corpus)

set_train = create_set(conll_dataset,'train')
set_test = create_set(conll_dataset,'test')
set_validation = create_set(conll_dataset,'validation')

setTokens = set.union(set_train,set_test,set_validation)

setTokens = set(map(lambda x: x.lower(),setTokens))

setTokens = list(setTokens)
print(f"Total vocab size: {len(setTokens)}")

Total vocab size: 26869


In [11]:
# Remove all the tokens not in glove embeddings

# Store all glove words.
glove_words = []
# Iterate through file storing glove words.
f = open(GLOVE_FILEPATH, encoding="utf-8")
for line in f:
  values = line.strip().split(' ')
  word = values[0]
  glove_words.append(word)
f.close()

# Iterate over setTokens and remove tokens not in glove.
for token in setTokens:
  if token not in glove_words:
    setTokens.remove(token)

# We assign index 0 for padding token and 1 for unknown token.
setTokens[0] = '<PAD>'
setTokens[1] = '<UNK>'

print(f'Vocab size after non-glove tokens are removed and unknown and padding token are added: {len(setTokens)}')

Vocab size after non-glove tokens are removed and unknown and padding token are added: 23432


In [13]:
# Create tokentoidx and idxtotoken dictionary 
token2idx = {}
for idx,word in enumerate(setTokens):
    token2idx.update({word:idx}) 

idx2token = {v:k for k,v in token2idx.items()}

len(token2idx), len(idx2token)

(23432, 23432)

# Create Embedding Matrix

In [2]:
import torch
import torch.nn as nn
import numpy as np
import pickle
import os

In [8]:
# Create Embedding matrix
embedding = nn.Embedding(len(token2idx),EMBEDDING_DIM)
for params in embedding.parameters():
    params.requires_grad = False

In [9]:
# Set padding and unknown embeddings.

# Set 0'th index as padding
embedding.weight[0] = torch.zeros((EMBEDDING_DIM))
# Set 1st index and unknown weight of glove
with open(GLOVE_FILEPATH) as f:
    for line in f:
        values = line.strip().split(' ')
unknown_vec = np.asarray(values[1:], dtype='float32')
unknown_vec = torch.from_numpy(unknown_vec)
embedding.weight[1] = unknown_vec

In [12]:
# Create final embedding matrix.
with open(GLOVE_FILEPATH) as f:
    count = 0
    for line in f:
        values = line.strip().split(' ')
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        vec = torch.from_numpy(vec)
        if word in token2idx:
            idx = token2idx[word]
            embedding.weight[idx] = vec
            count+=1

In [15]:
torch.save(embedding.state_dict(),f'conll-assets/conll.glove.6B.{EMBEDDING_DIM}d.pt')
# Save idxtotoken and tokentoidx
with open('conll-assets/token2idx.pkl', 'wb') as file:
    pickle.dump(token2idx, file)
with open('conll-assets/idx2token.pkl', 'wb') as file:
    pickle.dump(idx2token, file)