<a href="https://colab.research.google.com/github/Salma-Jamal/Prepare-Dataset-For-Ner/blob/main/Tactful_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install seqeval

In [None]:
!wget https://cdn.discordapp.com/attachments/778630432878362676/880535945919742002/test.csv
!wget https://cdn.discordapp.com/attachments/778630432878362676/880535949161922610/train.csv

In [5]:
import pandas as pd
import ast
import re 
import string
import seaborn as sns
import numpy as np
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import transformers
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from seqeval.metrics import f1_score, accuracy_score
import torch.nn as nn



In [6]:
train_df  = pd.read_csv('/content/train.csv')

In [7]:
train_df.annotations = train_df.annotations.apply(lambda annotations : ast.literal_eval(annotations))

In [8]:
train_df

Unnamed: 0,text,annotations
0,اريد تحويل 70 دقيقه لرقم 01100000011 من 011000...,"[{'start': 25, 'end': 36, 'entity': 'phonenumb..."
1,طيب لو عايز احول من باقة الانترنت بتاعتى لرقم ...,"[{'start': 46, 'end': 57, 'entity': 'phonenumb..."
2,ممكن أعرف متبقيلي ميجابايتس قد ايه؟,"[{'start': 18, 'end': 27, 'entity': 'type'}]"
3,شو سعر دقيقة التجوال فى السعودية,"[{'start': 24, 'end': 32, 'entity': 'country'}..."
4,عاوز اضيف دقايق تجوال للرقم ده 01100000011,"[{'start': 31, 'end': 42, 'entity': 'phonenumb..."
...,...,...
238,خليهم من الرقم ده 01100000012,"[{'start': 18, 'end': 29, 'entity': 'phonenumb..."
239,اريد السفر الى المملكة العربية السعودية,"[{'start': 15, 'end': 39, 'entity': 'country'}]"
240,اريد السفر الى المملكة المتحدة,"[{'start': 15, 'end': 30, 'entity': 'country'}]"
241,سعر التجوال فى المملكة العربية السعودية,"[{'start': 15, 'end': 39, 'entity': 'country'}]"


In [9]:
def pre_process_train(sentence,annotations,sen_num):
  lst = {}
  keyorder = sentence.split()
  for i in annotations:
    sen = sentence[i['start']:i['end']].split()
    if len(sen) > 1:
      # for l in range(len(sen)):
      #   if l == 0:
      #     lst[sen[l]] = 'B-'+i['entity']
      #   elif l == len(sen)-1:
      #     lst[sen[l]] = 'E-'+i['entity']
      #   else:
      #     lst[sen[l]] ='I-'+i['entity']
      for word in sen:
        lst[word] = i['entity']
    else:
      lst["".join(sen)] = i['entity']

  for word in sentence.split():
    if word not in lst.keys() and word+' ' not in lst.keys():
      lst[word] = 'O'     
  lst = sorted(lst.items(), key=lambda i:keyorder.index(i[0]))
  lst = [('sent'+str(sen_num),) + i for i in lst]
  return lst

In [10]:
%%time
for i in range(len(train_df.text.to_list())):
  lst = pre_process_train(train_df.text.to_list()[i],train_df.annotations.to_list()[i],i+1)
  if i == 0:
    df_train = pd.DataFrame(lst, columns=["Sentence","Word", "Tag"])
  else:
    df2 = pd.DataFrame(lst, columns=["Sentence","Word", "Tag"])
    df_train= df_train.append(df2)

CPU times: user 347 ms, sys: 27.4 ms, total: 375 ms
Wall time: 358 ms


In [11]:
df_train.Tag.value_counts()

O              1027
type            196
phonenumber     106
country          50
amount           29
Name: Tag, dtype: int64

In [12]:
class SentenceGetter(object):

  def __init__(self, data):
    self.n_sent = 1
    self.data = df_train
    self.empty = False
    agg_func = lambda s: [(w,  t) for w, t in zip(s["Word"].values.tolist(),
                                                      s["Tag"].values.tolist())]
    self.grouped = self.data.groupby("Sentence").apply(agg_func)
    self.sentences = [s for s in self.grouped]

  def get_next(self):
    try:
        s = self.grouped["Sentence: {}".format(self.n_sent)]
        self.n_sent += 1
        return s
    except:
        return None

In [13]:
getter = SentenceGetter(df_train)

In [14]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
sentences[0]

['اريد', 'تحويل', '70', 'دقيقه', 'لرقم', '01100000011', 'من', '01100000015']

In [15]:
labels = [[s[1] for s in sentence] for sentence in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'phonenumber', 'O', 'phonenumber']


In [16]:
tag_values = list(set(df_train["Tag"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [17]:
tag2idx

{'O': 4, 'PAD': 5, 'amount': 0, 'country': 1, 'phonenumber': 3, 'type': 2}

In [18]:
MAX_LEN = 30
bs = 16

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [None]:
tokenizer = BertTokenizer.from_pretrained('UBC-NLP/MARBERT')

In [21]:
def tokenize_and_preserve_labels(sentence, text_labels):
  tokenized_sentence = []
  labels = []
  for word, label in zip(sentence, text_labels):

      # Tokenize the word and count # of subwords the word is broken into
      tokenized_word = tokenizer.tokenize(word)
      n_subwords = len(tokenized_word)

      # Add the tokenized word to the final tokenized word list
      tokenized_sentence.extend(tokenized_word)

      # Add the same label to the new list of labels `n_subwords` times
      labels.extend([label] * n_subwords)

  return tokenized_sentence, labels

In [22]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [24]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [25]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [26]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [27]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [28]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=40, test_size=0.2)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=40, test_size=0.2)

In [29]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [30]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [None]:
model = BertForTokenClassification.from_pretrained(
    "UBC-NLP/MARBERT",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

In [37]:
model.cuda();

In [38]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)


In [39]:
from transformers import get_linear_schedule_with_warmup

epochs = 5
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


In [40]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []
#loss_fn = nn.CrossEntropyLoss(weight=weights.cuda())
for _ in trange(epochs, desc="Epoch"):
  # ========================================
  #               Training
  # ========================================
  # Perform one full pass over the training set.

  # Put the model into training mode.
  model.train()
  # Reset the total loss for this epoch.
  total_loss = 0

  # Training loop
  for step, batch in enumerate(train_dataloader):
    # add batch to gpu
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    # Always clear any previously calculated gradients before performing a backward pass.
    model.zero_grad()
    # forward pass
    # This will return the loss (rather than the model output)
    # because we have provided the `labels`.
    outputs = model(b_input_ids, token_type_ids=None,
                    attention_mask=b_input_mask,labels=b_labels)

    loss = outputs[0]
    loss.backward()
    # track train loss
    total_loss += loss.item()
    # Clip the norm of the gradient
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
    # update parameters
    optimizer.step()
    # Update the learning rate.
    scheduler.step()

  # Calculate the average loss over the training data.
  avg_train_loss = total_loss / len(train_dataloader)
  print("\nAverage train loss: {}".format(avg_train_loss))

  # Store the loss value for plotting the learning curve.
  loss_values.append(avg_train_loss)


  # ========================================
  #               Validation
  # ========================================
  # After the completion of each training epoch, measure our performance on
  # our validation set.

  # Put the model into evaluation mode
  model.eval()
  # Reset the validation loss for this epoch.
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  predictions , true_labels = [], []
  for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model not to compute or store gradients,
    # saving memory and speeding up validation
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        # This will return the logits rather than the loss because we have not provided labels.
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask, labels=b_labels)
    # Move logits and labels to CPU
    logits = outputs[1].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculate the accuracy for this batch of test sentences.
    eval_loss += outputs[0].mean().item()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(label_ids)

  eval_loss = eval_loss / len(valid_dataloader)
  validation_loss_values.append(eval_loss)
  print("\nValidation loss: {}".format(eval_loss))
  pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
  valid_tags = [tag_values[l_i] for l in true_labels
                                for l_i in l if tag_values[l_i] != "PAD"]
  print("\nValidation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
  print()


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]


Average train loss: 0.9399227316562946


Epoch:  20%|██        | 1/5 [00:03<00:15,  3.99s/it]


Validation loss: 0.3137972792610526

Validation Accuracy: 0.8421052631578947


Average train loss: 0.3370623978284689


Epoch:  40%|████      | 2/5 [00:07<00:11,  3.92s/it]


Validation loss: 0.17008583713322878

Validation Accuracy: 0.9258373205741627


Average train loss: 0.18780472874641418


Epoch:  60%|██████    | 3/5 [00:11<00:07,  3.91s/it]


Validation loss: 0.11075327708385885

Validation Accuracy: 0.9425837320574163


Average train loss: 0.09925885670460187


Epoch:  80%|████████  | 4/5 [00:15<00:03,  3.89s/it]


Validation loss: 0.12014660390559584

Validation Accuracy: 0.9545454545454546


Average train loss: 0.08603787823365285


Epoch: 100%|██████████| 5/5 [00:19<00:00,  3.90s/it]


Validation loss: 0.0982126557501033

Validation Accuracy: 0.9641148325358851






In [41]:
test = pd.read_csv('/content/test.csv')

In [42]:
test

Unnamed: 0,text,annotations
0,الرسالة من دولة اخرى بكام,"[{'start': 0, 'end': 7, 'entity': 'unit'}]"
1,ما هو استهلاك رقم 01100000022,"[{'start': 18, 'end': 29, 'entity': 'phonenumb..."
2,خليهم للرقم ده 01100000014,"[{'start': 15, 'end': 26, 'entity': 'phonenumb..."
3,حولي منه 50 رسالة,"[{'start': 9, 'end': 11, 'entity': 'amount'}, ..."
4,عايزة اعرف استهلاكي للدقايق,"[{'start': 20, 'end': 27, 'entity': 'unit'}]"
...,...,...
95,عايز احول للرقم ده 01100009011,"[{'start': 20, 'end': 31, 'entity': 'phonenumb..."
96,استهلاك رقم 01199000012,"[{'start': 12, 'end': 23, 'entity': 'phonenumb..."
97,عايز اتصل ب رقم 01100000019,"[{'start': 16, 'end': 27, 'entity': 'phonenumb..."
98,اتصلي ب 01100000099,"[{'start': 8, 'end': 19, 'entity': 'phonenumbe..."


In [43]:
test.annotations = test.annotations.apply(lambda annotations : ast.literal_eval(annotations))

In [44]:
def pre_process_sentence(sentence,annotations):
  lst = {}
  keyorder = sentence.split()
  for i in annotations:
    sen = sentence[i['start']:i['end']].split()
    if len(sen) > 1:
      for l in range(len(sen)):
        if l == 0:
          lst[sen[l]] =i['entity']#'B-'+i['entity']
        elif l == len(sen)-1:
          lst[sen[l]] =i['entity']# 'E-'+i['entity']
        else:
          lst[sen[l]] = i['entity']#'I-'+i['entity']
    else:
      lst["".join(sen)] = i['entity']

  for word in sentence.split():
    if word not in lst.keys() and word+' ' not in lst.keys():
      lst[word] = 'O'     
  lst = sorted(lst.items(), key=lambda i:keyorder.index(i[0]))
  #lst = [('sent'+str(sen_num),) + i for i in lst]
  return lst

In [45]:
s = pre_process_sentence(test.text.to_list()[30],test.annotations.to_list()[30])
s

[('انا', 'O'),
 ('مسافر', 'O'),
 ('علي', 'O'),
 ('تركيا', 'country'),
 ('الاسبوع', 'O'),
 ('القادم', 'O'),
 ('لمده', 'O'),
 ('اسبوع', 'O')]

In [46]:
%%time
for i in range(len(test.text.to_list())):
  lst = pre_process_sentence(test.text.to_list()[i],test.annotations.to_list()[i])
  if i == 0:
    df_test = pd.DataFrame(lst, columns=["Word", "Tag"])
  else:
    df2 = pd.DataFrame(lst, columns=["Word", "Tag"])
    df_test= df_test.append(df2)

CPU times: user 124 ms, sys: 5.31 ms, total: 130 ms
Wall time: 126 ms


In [48]:
df_test

Unnamed: 0,Word,Tag
0,الرسالة,unit
1,من,O
2,دولة,O
3,اخرى,O
4,بكام,O
...,...,...
2,01100000099,phonenumber
0,ممكن,O
1,تعمل,O
2,اتصال,O


In [49]:
all_tokens = []
all_labels = []
for i in range(100):
  print(test.text.to_list()[i])
  print(test.annotations.to_list()[i])
  tokenized_sentence = tokenizer.encode(test.text.to_list()[i])
  input_ids = torch.tensor([tokenized_sentence]).cuda()
  with torch.no_grad():
    output = model(input_ids)
  label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
  # join bpe split tokens
  tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
  new_tokens, new_labels = [], []
  for token, label_idx in zip(tokens, label_indices[0]):
      if token.startswith("##"):
          new_tokens[-1] = new_tokens[-1] + token[2:]
      else:
          new_labels.append(tag_values[label_idx])
          new_tokens.append(token)
  for token, label in zip(new_tokens, new_labels):
      if token == '[CLS]' or token == '[SEP]':
        continue
      else:
        print("{}\t{}".format(label, token))
        all_tokens.append(token)
        all_labels.append(label)
  print('*'*100)

الرسالة من دولة اخرى بكام
[{'start': 0, 'end': 7, 'entity': 'unit'}]
type	الرسالة
O	من
O	دولة
O	اخرى
O	بكام
****************************************************************************************************
ما هو استهلاك رقم 01100000022
[{'start': 18, 'end': 29, 'entity': 'phonenumber'}]
O	ما
O	هو
O	استهلاك
O	رقم
phonenumber	01100000022
****************************************************************************************************
خليهم للرقم ده 01100000014
[{'start': 15, 'end': 26, 'entity': 'phonenumber'}]
O	خليهم
O	للرقم
O	ده
phonenumber	01100000014
****************************************************************************************************
حولي منه 50 رسالة
[{'start': 9, 'end': 11, 'entity': 'amount'}, {'start': 12, 'end': 17, 'entity': 'unit'}]
O	حولي
O	منه
amount	50
type	رسالة
****************************************************************************************************
عايزة اعرف استهلاكي للدقايق
[{'start': 20, 'end': 27, 'entity': 'unit'}]
O	عايزة
O	اعرف
O	

In [50]:
df_predss = pd.DataFrame(list(zip(all_tokens, all_labels)),
               columns =['Word', 'Tag'])

In [51]:
df_predss.Tag.value_counts()

O              410
type            97
country         24
phonenumber     20
amount           5
Name: Tag, dtype: int64

In [52]:
df_test.Tag.value_counts()

O              406
unit            86
country         24
phonenumber     21
amount           7
Name: Tag, dtype: int64

In [53]:
df_predss[df_predss.Tag == 'country']

Unnamed: 0,Word,Tag
98,لتركيا,country
129,اﻷمارات,country
145,تركيا,country
158,تركيا,country
212,الامارات,country
237,تركيا,country
241,تركيا,country
253,السعودية,country
275,لبنان,country
370,الامارات,country
