In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm, trange
import torch
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.feature_extraction import DictVectorizer
from transformers import AutoTokenizer,AutoModelForTokenClassification, AutoConfig

In [2]:
input_data = pd.read_csv("NER_data_marefa.csv")
input_data = input_data[input_data['sentence_id'] <=8]
tags = ["O", "job", "nationality", "person", "location","time", "event", "organization","product", "artwork"]
label_map = { v:index for index, v in enumerate(sorted(tags))}
map_label = { index:v for index, v in enumerate(sorted(tags))}

max_len = 250
batch_size = 1

tokenizer = AutoTokenizer.from_pretrained("marefa-nlp/marefa-ner")
function = lambda s: [s['sentence'].unique()[0], [label_map[i] for i in s["tag"].values.tolist()]]
grouped = input_data.groupby("sentence_id").apply(function)
items = [s for s in grouped]
sents = [item[0] for item in items]
labels = [item[1] for item in items]
tokenized_texts = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent)) for sent in sents]
X = pad_sequences(tokenized_texts,maxlen=max_len, value=tokenizer.pad_token_id, padding="post",dtype="long", truncating="post")
Y = pad_sequences(labels,maxlen=max_len, value=label_map["O"], padding="post",dtype="long", truncating="post")
#Y = to_categorical(Y, num_classes = len(tags))
attention_masks = np.array([[float(i!=1) for i in ii] for ii in X])

In [3]:
X.shape , Y.shape , attention_masks.shape

((8, 250), (8, 250), (8, 250))

In [4]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y,test_size=0.1)
Mask_train, Mask_valid, _, _ = train_test_split(attention_masks, X,test_size=0.1)

In [5]:
X_train = torch.tensor(X_train)
X_valid = torch.tensor(X_valid)
Y_train = torch.tensor(Y_train, dtype=torch.long)
Y_valid = torch.tensor(Y_valid, dtype=torch.long)
Mask_train = torch.tensor(Mask_train)
Mask_valid = torch.tensor(Mask_valid)

In [6]:
data_train = TensorDataset(X_train, Mask_train, Y_train)
data_train_sampler = RandomSampler(data_train)
DL_train = DataLoader(data_train, sampler=data_train_sampler, batch_size=batch_size)

data_valid = TensorDataset(X_valid, Mask_valid, Y_valid)
data_valid_sampler = SequentialSampler(data_valid)
DL_valid = DataLoader(data_valid, sampler=data_valid_sampler, batch_size=batch_size)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [8]:
class NER_model(nn.Module):
    def __init__(self,num_classes = 10):
        super(NER_model, self).__init__()
        
        config = AutoConfig.from_pretrained("marefa-nlp/marefa-ner")
        config.num_labels = num_classes
        self.bert = AutoModelForTokenClassification.from_config(config)
        ### New layers:
        self.linear = nn.Softmax(dim = -1)
        

    def forward(self, ids, mask):
        
        sequence_output = self.bert(input_ids=ids, attention_mask=mask)
        sequence_output = sequence_output.logits
        logits = self.linear(sequence_output)
        #y_hat = logits.argmax(-1)
        return logits#, y_hat

model = NER_model(num_classes = len(label_map))
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1000)

In [9]:
def flat_accuracy(preds, labels, mask):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat[mask] == labels_flat[mask]) / len(labels_flat[mask])

In [10]:
for epoch in range(1):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for i, batch in tqdm(enumerate(DL_train), total=len(DL_train)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        b_input_ids = b_input_ids.type(torch.LongTensor) 
        b_input_mask = b_input_mask.type(torch.LongTensor)
        b_labels = b_labels.type(torch.LongTensor)

        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        b_labels = b_labels.to(device)
        
        optimizer.zero_grad()
        # forward pass
        logits = model(b_input_ids,mask=b_input_mask)
        
        active_loss = b_input_mask.view(-1) == 1
        active_logits = logits.view(-1, logits.shape[-1])[active_loss]
        active_labels = b_labels.view(-1)[active_loss]
        loss = criterion(active_logits, active_labels)
        
        #loss = criterion(logits.view(-1, logits.shape[-1]), b_labels.view(-1))
        loss.backward()
        
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        optimizer.step()
        scheduler.step(loss)
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in DL_valid:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        b_input_ids = b_input_ids.type(torch.LongTensor) 
        b_input_mask = b_input_mask.type(torch.LongTensor)
        b_labels = b_labels.type(torch.LongTensor)

        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        b_labels = b_labels.to(device)
        active_loss=None
        with torch.no_grad():
            logits = model(b_input_ids,mask=b_input_mask)
            active_loss = b_input_mask.view(-1) == 1
            active_logits = logits.view(-1, logits.shape[-1])[active_loss]
            active_labels = b_labels.view(-1)[active_loss]
            tmp_eval_loss = criterion(active_logits, active_labels)
            #tmp_eval_loss = criterion(logits.view(-1, logits.shape[-1]), b_labels.view(-1))
        
        logits = logits.cpu().detach().numpy()
        label_ids = b_labels.cpu().detach().numpy()
        
        predictions.extend([list(p) for p in np.argmax(logits, axis=-1)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids, active_loss.cpu().detach().numpy())
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [[map_label[p_i] for p_i in p] for p in predictions]
    valid_tags = [[map_label[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]
    precision = precision_score(valid_tags[0],pred_tags[0],average="macro")
    recall =  recall_score(valid_tags[0],pred_tags[0],average="macro")
    f1 = f1_score(valid_tags[0],pred_tags[0],average="macro")
    print("precision: %.3f recall: %.3f f1: %.3f" % (precision, recall, f1))

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:23<00:00, 11.87s/it]


Train loss: 1.7975263254983085
Validation loss: 1.6638537645339966
Validation Accuracy: 0.7972972972972973
precision: 0.235 recall: 0.250 f1: 0.242


  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
def postprocesspred(sent,tags,scores,grouped):
    if grouped:
        a = sent.split()
        b = tokenizer.tokenize(sent)
        i = -1
        j = 0
        d = [-1]*len(a)
        e = [0.0]*len(a)
        f = [0]*len(a)
        while True:
            if b[j][0] == '▁':
                i+=1
                d[i] = tags[j]
                e[i] += scores[j]
                f[i] += 1
            j+=1
            if -1 not in d and (j>=len(b) or i>=len(a)):
                break
        g = np.array(e)/np.array(f)
        array = []
        words = tokenizer.tokenize(sent)
        for l,w,s in zip(d,a,g):
            array.append([w,map_label[l],s])
        return array
        return d
    else:
        array = []
        words = tokenizer.tokenize(sent)
        for l,w,s in zip(tags,words,scores):
            array.append([w,map_label[l],s])
        return array

def predict(texts,grouped=False):
    if isinstance(texts,str):
        texts = [texts]
    array = []
    for text in texts:
        x = pad_sequences([tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))],maxlen=max_len, value=tokenizer.pad_token_id, padding="post",dtype="long", truncating="post")
        mask = np.array([[float(i!=1) for i in ii] for ii in x])
        with torch.no_grad():
            model.eval()
            model.to(device)
            predictions = model(torch.tensor(x, dtype=torch.long), mask=torch.tensor(mask, dtype=torch.long))
            predictions = predictions.cpu().numpy()
            mask = mask.astype('int32')
            scores = np.max(predictions,-1).reshape(predictions.shape[0]*predictions.shape[1],)[np.where(mask.reshape(mask.shape[0]*mask.shape[1],) == 1)]
            predictions = np.argmax(predictions,-1).reshape(predictions.shape[0]*predictions.shape[1],)[np.where(mask.reshape(mask.shape[0]*mask.shape[1],) == 1)]
            array.append(postprocesspred(text,predictions,scores,grouped))
    if len(array) == 1:
        return array[0]
    return array

In [34]:
a = predict("تتمتع مصر بعدد من المزارات السياحية")
print(a)

[['▁ت', 'O', 0.99999976], ['تمتع', 'O', 0.99999976], ['▁مصر', 'O', 0.99999976], ['▁بعد', 'O', 0.99999976], ['د', 'O', 0.99999976], ['▁من', 'O', 0.99999976], ['▁الم', 'O', 0.99999976], ['زار', 'O', 0.99999976], ['ات', 'O', 0.99999976], ['▁السياحية', 'O', 0.99999976]]


In [35]:
a = predict("تتمتع مصر بعدد من المزارات السياحية",True)
print(a)

[['تتمتع', 'O', 0.9999997615814209], ['مصر', 'O', 0.9999997615814209], ['بعدد', 'O', 0.9999997615814209], ['من', 'O', 0.9999997615814209], ['المزارات', 'O', 0.9999997615814209], ['السياحية', 'O', 0.9999997615814209]]
