In [317]:
import numpy as np
import pandas as pd
import random
import torch

from poprogress import simple_progress as simp
from tqdm import tqdm
from collections import Counter
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence



In [109]:
# train_data = pd.read_csv("train.csv")
# val_data = pd.read_csv("val.csv")
# test1_data = pd.read_csv("test1.csv")
# test2_data = pd.read_csv("test2.csv")

# train_len = len(train_data)
# val_len = len(val_data)
# test1_len = len(test1_data)
# test2_len = len(test2_data)

# print("train_len: ",train_len)
# print("val_len: ",val_len)
# print("test1_len: ",test1_len)
# print("test2_len: ",test2_len)
# train_data.head(5)

In [36]:
all_data = pd.read_csv("all-data.csv")
all_len = len(all_data)
print("all_len: ",all_len)
all_data.head(5)

all_len:  21363


Unnamed: 0,raw_sentence,labels
0,EU rejects German call to boycott British lamb .,"['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MIS..."
1,Peter Blackburn,"['B-PER', 'I-PER']"
2,BRUSSELS 1996-08-22,"['B-LOC', 'O']"
3,The European Commission said on Thursday it di...,"['O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O..."
4,Germany 's representative to the European Unio...,"['B-LOC', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG'..."


In [180]:
def split_dataset(data, train_ratio, valid_ratio):

    pool = np.random.rand(len(data)) 
    mask1 = pool < train_ratio
    offset = train_ratio + valid_ratio
    mask2 = (pool >= train_ratio) * (pool < offset)
    train = data[mask1].reset_index(drop=True)
    valid = data[mask2].reset_index(drop=True)
    test = data[~(mask1 + mask2)].reset_index(drop=True)
    
    return train, valid, test

In [181]:
train_data, valid_data, test_data = split_dataset(all_data, 0.7, 0.15)
print("train_data_size: ",len(train_data))
print("valid_data_size: ",len(valid_data))
print("test_data_size: ",len(test_data))

train_data_size:  14941
valid_data_size:  3238
test_data_size:  3184


In [182]:
def get_label_unique(data):
    unique_label_list = []
    for label in simp(data["labels"]):
        labels =  label.replace('[','').replace(']','').split(',')
        for x in labels:
            tag = x.replace("'",'').replace(' ','')
            if tag not in unique_label_list:
                unique_label_list.append(tag)
    return unique_label_list

label_unique = sorted(get_label_unique(train_data))

label_to_id = {k: v for v,k in enumerate(label_unique)}
id_to_label = {k: v for k,v in enumerate(label_unique)}
print(label_to_id)
print(id_to_label)

  0%|          | 0/14941 [00:00<?, ?it/s]

100%|██████████| 14941/14941 [00:00<00:00, 63305.53it/s]

{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}
{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PER', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PER', 8: 'O'}





In [184]:
def get_tokens_labels(data, id):
    
    def get_sent_labels_list(data, id):
        labels_list = []
        label = data.loc[id, "labels"]
        labels =  label.replace('[','').replace(']','').split(',')
        for x in labels:
            tag = x.replace("'",'').replace(' ','')
            labels_list.append(tag)
        return labels_list
    
    def get_sent_tokens_list(data, id):
        tokens_list = []
        tokens = data.loc[id, "raw_sentence"].split()
        for token in tokens:
            tokens_list.append(token.lower())
        return tokens_list

    tokens_list = get_sent_tokens_list(data, id)
    labels_list = get_sent_labels_list(data, id)
    return tokens_list, labels_list

get_tokens_labels(all_data, 0)

(['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.'],
 ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'])

In [204]:
def get_data_seq(data):
    data_token_seq, data_label_seq = [], []
    for i in range(len(data)):
        a, b = get_tokens_labels(data, i)
        data_token_seq.append(a)
        data_label_seq.append(b)
    return data_token_seq, data_label_seq

train_token_seq, train_label_seq = get_data_seq(train_data)

token2cnt = Counter([token for sentence in train_token_seq for token in sentence])
label_set = sorted(set(label for sentence in train_label_seq for label in sentence))


In [252]:
for x in train_token_seq:
    for i in x:
        print(i)
    break

for x in train_label_seq:
    for i in x:
        print(i)
    break

peter
blackburn
B-PER
I-PER


In [265]:
def get_token2id(token2cnt, min_count = 1,add_pad = True, add_unk = True):
    '''
    Get mapping from tokens to indices to use with Embedding layer.
    
    param:
        - min_count : Do not mark number if number of words less then this value.
    '''
    token_to_id = {}

    if add_pad:
        token_to_id["<PAD>"] = len(token_to_id)
    if add_unk:
        token_to_id["<UNK>"] = len(token_to_id)

    for token, cnt in token2cnt.items():
        if cnt >= min_count:
            token_to_id[token] = len(token_to_id)

    return token_to_id
token_to_id = get_token2id(token2cnt)

In [309]:
# def nerDataset
def process_tokens(tokens, token2id, unk: str = "<UNK>"):
    return [token2id.get(token, token2id[unk]) for token in tokens]

def process_labels(labels,label2id):
    return [label2id[label] for label in labels]

class nerDataset(Dataset):

    def __init__(self, token_seq, label_seq, token2id, label2id, preprocess:bool = True):
        self.token2id = token2id
        self.label2id = label2id
        self.preprocess = preprocess
        
        if preprocess:
            self.token_seq = [process_tokens(tokens, token2id) for tokens in token_seq]
            self.label_seq = [process_labels(labels, label2id) for labels in label_seq]
        else:
            self.token_seq = token_seq 
            self.label_seq = label_seq  

    def __len__(self):
        return len(self.token_seq)

    def __getitem__(self, id):
        if self.preprocess:
            tokens = self.token_seq[id]
            labels = self.label_seq[id]
        else:
            tokens = process_tokens(self.token_seq[id], self.token2id) 
            labels = process_labels(self.label_seq[id], self.label2id) 

        lengths = [len(tokens)]

        return np.array(tokens), np.array(labels), np.array(lengths)

In [310]:
train_set = nerDataset(train_token_seq, train_label_seq, token_to_id, label_to_id, preprocess=True)

valid_token_seq, valid_label_seq = get_data_seq(valid_data)
valid_set = nerDataset(valid_token_seq, valid_label_seq, token_to_id, label_to_id, preprocess=True)

In [312]:
class nerCollator:

    def __init__(self, token_padding_value, label_padding_value, percentile = 100):
        self.token_padding_value = token_padding_value
        self.label_padding_value = label_padding_value
        self.percentile = percentile

    def __call__(self, batch):

        tokens, labels, lengths = zip(*batch)

        tokens = [list(i) for i in tokens]
        labels = [list(i) for i in labels]

        max_len = int(np.percentile(lengths, self.percentile))

        lengths = torch.tensor(
            np.clip(lengths, a_min=0, a_max=max_len),
            dtype=torch.long,
        ).squeeze(-1)

        for i in range(len(batch)):
            tokens[i] = torch.tensor(tokens[i][:max_len], dtype=torch.long)
            labels[i] = torch.tensor(labels[i][:max_len], dtype=torch.long)

        sorted_idx = torch.argsort(lengths, descending=True)
        print(tokens)
        print(self.token_padding_value)
        
        tokens = pad_sequence(
            tokens, padding_value=self.token_padding_value, batch_first=True
        )[sorted_idx]
        labels = pad_sequence(
            labels, padding_value=self.label_padding_value, batch_first=True
        )[sorted_idx]
        lengths = lengths[sorted_idx]

        return tokens, labels, lengths

In [319]:
train_coll_fn = nerCollator(0, label_to_id["O"], 100)

train_loader = DataLoader(
    dataset=train_set,
    batch_size=256,
    shuffle=False,
    collate_fn=train_coll_fn,
)

In [321]:
class Embedding(torch.nn.Module):

    def __init__(self, num_embeddings, embedding_dim):
        super(Embedding, self).__init__()

        self.embedding = torch.nn.Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embedding_dim,
        )

    def forward(self, x):
        return self.embedding(x)

In [323]:
embedding_layer = Embedding(
    num_embeddings=len(token_to_id),
    embedding_dim=128)

In [330]:
for a,b,c in train_loader:
    o = embedding_layer(a)
    print(o.shape)

torch.Size([256, 46, 128])
torch.Size([256, 40, 128])
torch.Size([256, 50, 128])
torch.Size([256, 48, 128])
torch.Size([256, 50, 128])
torch.Size([256, 51, 128])


torch.Size([256, 44, 128])
torch.Size([256, 51, 128])
torch.Size([256, 60, 128])
torch.Size([256, 44, 128])
torch.Size([256, 44, 128])
torch.Size([256, 57, 128])
torch.Size([256, 50, 128])
torch.Size([256, 44, 128])
torch.Size([256, 50, 128])
torch.Size([256, 38, 128])
torch.Size([256, 51, 128])
torch.Size([256, 50, 128])
torch.Size([256, 52, 128])
torch.Size([256, 55, 128])
torch.Size([256, 49, 128])
torch.Size([256, 52, 128])
torch.Size([256, 46, 128])
torch.Size([256, 47, 128])
torch.Size([256, 62, 128])
torch.Size([256, 50, 128])
torch.Size([256, 50, 128])
torch.Size([256, 47, 128])
torch.Size([256, 78, 128])
torch.Size([256, 55, 128])
torch.Size([256, 59, 128])
torch.Size([256, 49, 128])
torch.Size([256, 53, 128])
torch.Size([256, 55, 128])
torch.Size([256, 47, 128])
torch.Size([256, 113, 128])
torch.Size([256, 60, 128])
torch.Size([256, 48, 128])
torch.Size([256, 80, 128])
torch.Size([256, 44, 128])
torch.Size([256, 59, 128])
torch.Size([256, 60, 128])
torch.Size([256, 53, 128])


In [61]:
import torch

In [None]:
class DataSequence(torch.utils.data.Dataset):
    def __init__(self, df):
        
        self.tokenized_text = []
        self.tags = []
        for i in simp(range(len(df))):
            sent = df.loc[i,"raw_sentence"]
            tags = get_sent_labels_list(i, df)
            
            temp = tokenizer(sent, padding='max_length', max_length=512,
                                truncation=True, return_tensors="pt")
            self.tokenized_text.append(temp)
            self.tags.append(tags)
            
    def __len__(self):
        return len(self.tags)
        
    def get_batch_text(self, id):
        return self.tokenized_text[id]
    
    def get_bach_tag(self, id):
        return self.tags[id]
        
    def __getitem__(self, id):
        batch_text = self.get_batch_text(id)
        batch_tag = self.get_bach_tag(id)
        return batch_text, batch_tag