In [1]:
import numpy as np
import pandas as pd
import random
import torch

from poprogress import simple_progress as simp
from tqdm import tqdm
from collections import Counter
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence,pad_packed_sequence



In [2]:
# train_data = pd.read_csv("train.csv")
# val_data = pd.read_csv("val.csv")
# test1_data = pd.read_csv("test1.csv")
# test2_data = pd.read_csv("test2.csv")

# train_len = len(train_data)
# val_len = len(val_data)
# test1_len = len(test1_data)
# test2_len = len(test2_data)

# print("train_len: ",train_len)
# print("val_len: ",val_len)
# print("test1_len: ",test1_len)
# print("test2_len: ",test2_len)
# train_data.head(5)

In [3]:
all_data = pd.read_csv("all-data.csv")
all_len = len(all_data)
print("all_len: ",all_len)
all_data.head(5)

all_len:  21363


Unnamed: 0,raw_sentence,labels
0,EU rejects German call to boycott British lamb .,"['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MIS..."
1,Peter Blackburn,"['B-PER', 'I-PER']"
2,BRUSSELS 1996-08-22,"['B-LOC', 'O']"
3,The European Commission said on Thursday it di...,"['O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O..."
4,Germany 's representative to the European Unio...,"['B-LOC', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG'..."


In [4]:
def split_dataset(data, train_ratio, valid_ratio):

    pool = np.random.rand(len(data)) 
    mask1 = pool < train_ratio
    offset = train_ratio + valid_ratio
    mask2 = (pool >= train_ratio) * (pool < offset)
    train = data[mask1].reset_index(drop=True)
    valid = data[mask2].reset_index(drop=True)
    test = data[~(mask1 + mask2)].reset_index(drop=True)
    
    return train, valid, test

In [5]:
train_data, valid_data, test_data = split_dataset(all_data, 0.7, 0.15)
print("train_data_size: ",len(train_data))
print("valid_data_size: ",len(valid_data))
print("test_data_size: ",len(test_data))

train_data_size:  14977
valid_data_size:  3228
test_data_size:  3158


In [6]:
def get_label_unique(data):
    unique_label_list = []
    for label in simp(data["labels"]):
        labels =  label.replace('[','').replace(']','').split(',')
        for x in labels:
            tag = x.replace("'",'').replace(' ','')
            if tag not in unique_label_list:
                unique_label_list.append(tag)
    return unique_label_list

label_unique = sorted(get_label_unique(train_data))

label_to_id = {k: v for v,k in enumerate(label_unique)}
id_to_label = {k: v for k,v in enumerate(label_unique)}
print(label_to_id)
print(id_to_label)

100%|██████████| 14977/14977 [00:00<00:00, 26261.90it/s]

{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}
{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PER', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PER', 8: 'O'}





In [7]:
def get_tokens_labels(data, id):
    
    def get_sent_labels_list(data, id):
        labels_list = []
        label = data.loc[id, "labels"]
        labels =  label.replace('[','').replace(']','').split(',')
        for x in labels:
            tag = x.replace("'",'').replace(' ','')
            labels_list.append(tag)
        return labels_list
    
    def get_sent_tokens_list(data, id):
        tokens_list = []
        tokens = data.loc[id, "raw_sentence"].split()
        for token in tokens:
            tokens_list.append(token.lower())
        return tokens_list

    tokens_list = get_sent_tokens_list(data, id)
    labels_list = get_sent_labels_list(data, id)
    return tokens_list, labels_list

# get_tokens_labels(all_data, 0)

In [8]:
def get_data_seq(data):
    data_token_seq, data_label_seq = [], []
    for i in range(len(data)):
        a, b = get_tokens_labels(data, i)
        data_token_seq.append(a)
        data_label_seq.append(b)
    return data_token_seq, data_label_seq

train_token_seq, train_label_seq = get_data_seq(train_data)

token2cnt = Counter([token for sentence in train_token_seq for token in sentence])
label_set = sorted(set(label for sentence in train_label_seq for label in sentence))


In [9]:
def get_token2id(token2cnt, min_count = 1,add_pad = True, add_unk = True):
    '''
    Get mapping from tokens to indices to use with Embedding layer.
    
    param:
        - min_count : Do not mark number if number of words less then this value.
    '''
    token_to_id = {}

    if add_pad:
        token_to_id["<PAD>"] = len(token_to_id)
    if add_unk:
        token_to_id["<UNK>"] = len(token_to_id)

    for token, cnt in token2cnt.items():
        if cnt >= min_count:
            token_to_id[token] = len(token_to_id)

    return token_to_id
token_to_id = get_token2id(token2cnt)

In [10]:
# def nerDataset
def process_tokens(tokens, token2id, unk: str = "<UNK>"):
    return [token2id.get(token, token2id[unk]) for token in tokens]

def process_labels(labels,label2id):
    return [label2id[label] for label in labels]

class nerDataset(Dataset):

    def __init__(self, token_seq, label_seq, token2id, label2id, preprocess:bool = True):
        self.token2id = token2id
        self.label2id = label2id
        self.preprocess = preprocess
        
        if preprocess:
            self.token_seq = [process_tokens(tokens, token2id) for tokens in token_seq]
            self.label_seq = [process_labels(labels, label2id) for labels in label_seq]
        else:
            self.token_seq = token_seq 
            self.label_seq = label_seq  

    def __len__(self):
        return len(self.token_seq)

    def __getitem__(self, id):
        if self.preprocess:
            tokens = self.token_seq[id]
            labels = self.label_seq[id]
        else:
            tokens = process_tokens(self.token_seq[id], self.token2id) 
            labels = process_labels(self.label_seq[id], self.label2id) 

        lengths = [len(tokens)]

        return np.array(tokens), np.array(labels), np.array(lengths)

In [11]:
train_set = nerDataset(train_token_seq, train_label_seq, token_to_id, label_to_id, preprocess=True)

valid_token_seq, valid_label_seq = get_data_seq(valid_data)
valid_set = nerDataset(valid_token_seq, valid_label_seq, token_to_id, label_to_id, preprocess=True)

test_token_seq, test_label_seq = get_data_seq(test_data)
test_set = nerDataset(test_token_seq, test_label_seq, token_to_id, label_to_id, preprocess=True)

In [12]:
class nerCollator:

    def __init__(self, token_padding_value, label_padding_value, percentile = 100):
        self.token_padding_value = token_padding_value
        self.label_padding_value = label_padding_value
        self.percentile = percentile

    def __call__(self, batch):

        tokens, labels, lengths = zip(*batch)

        tokens = [list(i) for i in tokens]
        labels = [list(i) for i in labels]
        # 避免句子过长, 应该给个固定长度，暂时不给
        max_len = int(np.percentile(lengths, self.percentile))

        lengths = torch.tensor(np.clip(lengths, a_min=0, a_max=max_len), dtype=torch.long).squeeze(-1)

        for i in range(len(batch)):
            tokens[i] = torch.tensor(tokens[i][:max_len], dtype=torch.long)
            labels[i] = torch.tensor(labels[i][:max_len], dtype=torch.long)

        sorted_idx = torch.argsort(lengths, descending=True)
        # 打补丁
        tokens = pad_sequence(tokens, padding_value=self.token_padding_value, batch_first=True)[sorted_idx]
        labels = pad_sequence(labels, padding_value=self.label_padding_value, batch_first=True)[sorted_idx]
        lengths = lengths[sorted_idx]

        return tokens, labels, lengths

In [44]:
train_coll_fn = nerCollator(token_to_id["<UNK>"], label_to_id["O"], 100)

train_loader = DataLoader(
    dataset=train_set,
    batch_size=256,
    shuffle=False,
    collate_fn=train_coll_fn,
)

In [15]:
class Embedding(torch.nn.Module):

    def __init__(self, num_embeddings, embedding_dim):
        super(Embedding, self).__init__()

        self.embedding = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)

    def forward(self, x):
        return self.embedding(x)

In [16]:
embedding_layer = Embedding(num_embeddings=len(token_to_id), embedding_dim=128)

In [17]:
class DynamicRNN(torch.nn.Module):

    def __init__(
        self,
        rnn_unit: torch.nn.Module,
        input_size: int,
        hidden_size: int,
        num_layers: int,
        dropout: float,
        bidirectional: bool,
    ):
        super(DynamicRNN, self).__init__()
        self.rnn = rnn_unit(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
            batch_first=True,
        )

    def forward(self, x, x_length):
        packed_x = pack_padded_sequence(x, x_length.cpu(), batch_first=True, enforce_sorted=True)
        packed_rnn_out, _ = self.rnn(packed_x)
        rnn_out, _ = pad_packed_sequence(packed_rnn_out, batch_first=True)
        return rnn_out

In [18]:
rnn_layer = DynamicRNN(
    rnn_unit=torch.nn.LSTM,
    input_size=128,  # ref to emb_dim
    hidden_size=256,
    num_layers=1,
    dropout=0,
    bidirectional=True,
)

In [19]:
class LinearHead(torch.nn.Module):
    """
    Linear layer wrapper.
    """

    def __init__(self, linear_head):
        super(LinearHead, self).__init__()
        self.linear_head = linear_head

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear_head(x)

In [20]:
linear_head = LinearHead(
    linear_head=torch.nn.Linear(
        in_features=(2*256),
        out_features=len(label_to_id)))

In [21]:
class BiLSTM(torch.nn.Module):

    def __init__(self, embedding_layer, rnn_layer, linear_head):
        super(BiLSTM, self).__init__()
        self.embedding = embedding_layer  
        self.rnn = rnn_layer 
        self.linear_head = linear_head  

    def forward(self, x, x_length):
        embed = self.embedding(x) 
        rnn_out = self.rnn(embed, x_length) 
        logits = self.linear_head(rnn_out)  
        return logits

In [22]:
model = BiLSTM(
    embedding_layer=embedding_layer,
    rnn_layer=rnn_layer,
    linear_head=linear_head,
)#.to(device)

In [23]:
criterion = torch.nn.CrossEntropyLoss(reduction="none")  # hardcoded

optimizer_type = torch.optim.Adam
optimizer = optimizer_type(params=model.parameters(), lr=0.001, amsgrad=False)

# 训练

In [24]:
def masking(lengths: torch.Tensor) -> torch.Tensor:
    return torch.arange(end=lengths.max(), device=lengths.device).expand(size=(lengths.shape[0], lengths.max())) < lengths.unsqueeze(1)

In [25]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [26]:
def to_numpy(tensor: torch.Tensor) -> np.ndarray:
    """
    Convert torch.Tensor to np.ndarray.
    """
    return (tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy())


In [27]:
from sklearn.metrics import f1_score

def calculate_metrics(
    metrics,
    loss: float,
    y_true: np.ndarray,
    y_pred: np.ndarray,
    idx2label,
):# -> DefaultDict[str, List[float]]:
    """
    Calculate metrics on epoch.
    """

    metrics["loss"].append(loss)

    f1_per_class = f1_score(y_true=y_true, y_pred=y_pred, labels=range(len(idx2label)), average=None)
    f1_weighted = f1_score(y_true=y_true, y_pred=y_pred, average="weighted")
    for cls, f1 in enumerate(f1_per_class):
        metrics[f"f1 {idx2label[cls]}"].append(f1)
    metrics["f1-weighted"].append(f1_weighted)

    return metrics

In [28]:
from collections import defaultdict

In [45]:
def train_epoch(
    model,
    dataloader: DataLoader,
    criterion,
    optimizer,
    device: torch.device,
    clip_grad_norm: float,
    verbose: bool = True,
):
    """
    Training loop on one epoch.
    """

    metrics = defaultdict(list)
    # idx2label = {v: k for k, v in dataloader.dataset.label2idx.items()}

    if verbose:
        dataloader = tqdm(dataloader)

    model.train()

    for tokens, labels, lengths in simp(dataloader):
        tokens, labels, lengths = (
            tokens.to(device),
            labels.to(device),
            lengths.to(device),
        )

        mask = masking(lengths)

        # forward pass
        logits = model(tokens, lengths)
        loss_without_reduction = criterion(logits.transpose(-1, -2), labels)
        loss = torch.sum(loss_without_reduction * mask) / torch.sum(mask)

        # backward pass
        loss.backward()

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            model.parameters(),
            max_norm=clip_grad_norm,
            norm_type=2,
        )

        optimizer.step()
        optimizer.zero_grad()

        # make predictions
        y_true = to_numpy(labels[mask])
        y_pred = to_numpy(logits.argmax(dim=-1)[mask])

        # calculate metrics
        metrics = calculate_metrics(
            metrics=metrics,
            loss=loss.item(),
            y_true=y_true,
            y_pred=y_pred,
            idx2label=id_to_label,
        )

    return metrics

In [30]:
def validate_epoch(
    model,
    dataloader: DataLoader,
    criterion,
    device: torch.device,
    verbose: bool = True,
):# -> DefaultDict[str, List[float]]:
    """
    Validate loop on one epoch.
    """

    metrics = defaultdict(list)
    # idx2label = {v: k for k, v in dataloader.dataset.label2idx.items()}

    if verbose:
        dataloader = tqdm(dataloader)

    model.eval()

    for tokens, labels, lengths in dataloader:
        tokens, labels, lengths = (
            tokens.to(device),
            labels.to(device),
            lengths.to(device),
        )

        mask = masking(lengths)

        # forward pass
        with torch.no_grad():
            logits = model(tokens, lengths)
            loss_without_reduction = criterion(logits.transpose(-1, -2), labels)
            loss = torch.sum(loss_without_reduction * mask) / torch.sum(mask)

        # make predictions
        y_true = to_numpy(labels[mask])
        y_pred = to_numpy(logits.argmax(dim=-1)[mask])

        # calculate metrics
        metrics = calculate_metrics(
            metrics=metrics,
            loss=loss.item(),
            y_true=y_true,
            y_pred=y_pred,
            idx2label=id_to_label,
        )

    return metrics

In [31]:
def train_loop(
    model,
    train_loader: DataLoader,
    valid_loader: DataLoader,
    criterion,
    optimizer,
    device: torch.device,
    clip_grad_norm,
    n_epoch,
    tensorboard,
    logger,
    verbose,
    test_loader):
    
    for epoch in tqdm(range(n_epoch)):

        if verbose:
            logger.info(f"epoch [{epoch+1}/{n_epoch}]\n")

        train_metrics = train_epoch(
            model=model,
            dataloader=train_loader,
            criterion=criterion,
            optimizer=optimizer,
            device=device,
            clip_grad_norm=clip_grad_norm,
            verbose=verbose,
        )

        if verbose:
            for metric_name, metric_list in train_metrics.items():
                logger.info(f"train {metric_name}: {np.mean(metric_list)}")
            logger.info("\n")

        valid_metrics = validate_epoch(
            model=model,
            dataloader=valid_loader,
            criterion=criterion,
            device=device,
            verbose=verbose,
        )

        if verbose:
            for metric_name, metric_list in valid_metrics.items():
                logger.info(f"valid {metric_name}: {np.mean(metric_list)}")
            logger.info("\n")

    if test_loader is not None:
        # if tensorboard:
        #     writer.add_scalar("Loss/train", np.mean((train_metrics["loss"])), epoch)
        #     writer.add_scalar("Loss/val", np.mean((valid_metrics["loss"])), epoch)

        test_metrics = validate_epoch(
            model=model,
            dataloader=test_loader,
            criterion=criterion,
            device=device,
            verbose=verbose,
        )

        if verbose:
            for metric_name, metric_list in test_metrics.items():
                logger.info(f"test {metric_name}: {np.mean(metric_list)}")
            logger.info("\n")

In [32]:
valid_coll_fn = nerCollator(token_to_id["<UNK>"], label_to_id["O"], 100)

valid_loader = DataLoader(
    dataset=valid_set,
    batch_size=256,
    shuffle=False,
    collate_fn=valid_coll_fn,
)
test_coll_fn = nerCollator(token_to_id["<UNK>"], label_to_id["O"], 100)

test_loader = DataLoader(
    dataset=test_set,
    batch_size=256,
    shuffle=False,
    collate_fn=test_coll_fn,
)

In [441]:
train_loop(
        model=model,
        train_loader=train_loader,
        valid_loader=valid_loader,
        test_loader=test_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
        clip_grad_norm=0.1,
        n_epoch=2, 
        logger=False,
        verbose=False,
        tensorboard=False
    )



  0%|          | 0/2 [00:00<?, ?it/s]


AttributeError: 'bool' object has no attribute 'info'

In [43]:
verbose = True
n_epoch = 1
clip_grad_norm = 0.1
for epoch in range(n_epoch):

    metrics = defaultdict(list)
    model.train()

    for tokens, labels, lengths in tqdm(train_loader):
        tokens, labels, lengths = (
            tokens.to(device),
            labels.to(device),
            lengths.to(device),
        )

        mask = masking(lengths)

        # forward pass
        logits = model(tokens, lengths)
        loss_without_reduction = criterion(logits.transpose(-1, -2), labels)
        loss = torch.sum(loss_without_reduction * mask) / torch.sum(mask)

        # backward pass
        loss.backward()

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            model.parameters(),
            max_norm=clip_grad_norm,
            norm_type=2,
        )

        optimizer.step()
        optimizer.zero_grad()

        # make predictions
        y_true = to_numpy(labels[mask])
        y_pred = to_numpy(logits.argmax(dim=-1)[mask])

        # calculate metrics
        metrics = calculate_metrics(
            metrics=metrics,
            loss=loss.item(),
            y_true=y_true,
            y_pred=y_pred,
            idx2label=id_to_label,
        )
        
    # train_metrics = train_epoch(
    #     model=model,
    #     dataloader=train_loader,
    #     criterion=criterion,
    #     optimizer=optimizer,
    #     device=device,
    #     clip_grad_norm=0.1,
    #     verbose=verbose,
    # )

    # valid_metrics = validate_epoch(
    #     model=model,
    #     dataloader=valid_loader,
    #     criterion=criterion,
    #     device=device,
    #     verbose=verbose,
    # )

# if test_loader is not None:
#     # if tensorboard:
#     #     writer.add_scalar("Loss/train", np.mean((train_metrics["loss"])), epoch)
#     #     writer.add_scalar("Loss/val", np.mean((valid_metrics["loss"])), epoch)

#     test_metrics = validate_epoch(
#         model=model,
#         dataloader=test_loader,
#         criterion=criterion,
#         device=device,
#         verbose=verbose,
#     )


  0%|          | 0/59 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/spawn.py", line 120, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/spawn.py", line 130, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'nerDataset' on <module '__main__' (built-in)>
  0%|          | 0/59 [00:18<?, ?it/s]


KeyboardInterrupt: 