In [1]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import io
from time import time
import argparse
import GPUtil
from torch.utils.data import TensorDataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange
import pandas as pd
import os
from transformers import *


I1113 21:37:03.601632 47090899140096 file_utils.py:39] PyTorch version 1.5.0 available.
I1113 21:37:03.602525 47090899140096 file_utils.py:55] TensorFlow version 2.2.0 available.


## Load the prediction samples from a TSV file

We provide a sample file to show the format. 

In [2]:
# Read the sample TSV (tab-separated, first row is the header) to inspect its format.
## please change the file path to your corresponding path
df = pd.read_csv("./narrow_country_b/inference_sample.tsv", sep='\t', header=0)

In [3]:
# Preview the first rows to check the columns of the sample file.
df.head()

Unnamed: 0,tweet_content
0,ياغاليين علي انتوشذا عمري وانتم زهي عينيايلي ب...
1,USER اهلا مؤمل انا الحمد الله تمام انت كيف
2,USER الله يرحمه ويغمد روحه الجنه يارب
3,USER وانا برضه لسه عاملهم NUM بقول للدكتور ايه...
4,USER هه وين نروحو نسيبو المليح والدوني NUM اكت...


## Specify the device

In [4]:
# Count the GPUs visible to PyTorch, then pick the device used below:
# CUDA when a GPU is available, otherwise the CPU.
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    print('Found GPU')
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

Found GPU
cuda


## Define a function for data preparation

In [5]:
def data_prepare(file_path, tokenizer, max_len = 32):
    '''
    Read tweets from a TSV file and encode them as fixed-length BERT inputs.

    file_path: the path to the input file.
                The input must be a tsv file whose first row is a header.
                The tweet text is loaded into a single column named 'content'.
                (The provided sample also carries a leading row-number column;
                pandas appears to use it as the index, since the loaded frame
                still reports one column.)
    tokenizer: BERT tokenizer (must provide `tokenize` and `convert_tokens_to_ids`)
    max_len: maximal number of word-piece tokens kept per tweet. [CLS] and [SEP]
                are added on top of that, so every encoded row has
                max_len + 2 positions.

    Returns a tuple (inputs, masks):
        inputs: integer tensor of token ids, padded with the [PAD] id
        masks:  float tensor of the same shape, 1.0 for real tokens and 0.0 for padding
    '''

    # if we are in predict mode, we will load one column (i.e., text).
    df = pd.read_csv(file_path, delimiter='\t',header=0, names=['content'])
    print("Data size ", df.shape)

    # Empty cells are parsed as NaN (a float) and purely numeric tweets as numbers;
    # coerce everything to str so the "[CLS] " concatenation below cannot raise.
    contents = df.content.fillna("").astype(str).values

    # Prepend [CLS], tokenize into word pieces, keep [CLS] plus at most max_len
    # tokens, and close the sequence with [SEP].
    tokenized_texts = [
        tokenizer.tokenize("[CLS] " + text)[:max_len+1] + ['[SEP]']
        for text in contents
    ]
    has_rows = len(tokenized_texts) > 0  # nothing to preview for an empty file
    if has_rows:
        print ("Tokenize the first sentence:\n",tokenized_texts[0])

    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    if has_rows:
        print ("Index numbers of the first sentence:\n",input_ids[0])

    # Pad our input sequence to the fixed length (i.e., max_len + 2) with index of [PAD] token
    pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
    input_ids = pad_sequences(input_ids, maxlen=max_len+2, dtype="long", truncating="post", padding="post", value=pad_ind)
    if has_rows:
        print ("Index numbers of the first sentence after padding:\n",input_ids[0])

    # Convert all of our data into torch tensors, the required datatype for our model
    inputs = torch.tensor(input_ids)

    # Attention mask: 1.0 for every real token, 0.0 for padding. Compare against the
    # tokenizer's actual [PAD] id rather than assuming that the pad id is 0.
    masks = (inputs != pad_ind).float()

    return inputs, masks


## Define a function for inference

In [6]:
def inference(model, iterator, ind2label):
    '''
    Predict a label and its softmax confidence for every sample in `iterator`.

    model: classification model; it is called as model(input_ids, input_mask) and
           the first element of its output is used as the logits.
    iterator: iterable of batches, each a pair (input_ids, input_mask) of tensors
    ind2label: dictionary mapping a predicted class index to its label

    Returns (output_label, output_prob): two lists with one entry per sample, the
    predicted label and the probability assigned to that label.
    '''

    model.eval()

    # Run on the device the model already lives on instead of relying on a
    # notebook-level `device` variable; fall back to CPU for parameter-less models.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # output lists
    output_prob = []
    output_label = []

    # No gradients are needed for prediction.
    with torch.no_grad():

        for batch in iterator:

            # Move the batch to the model's device and unpack it
            input_ids, input_mask = tuple(t.to(target_device) for t in batch)

            outputs = model(input_ids, input_mask)
            logits = outputs[0]

            # Normalise over dim 1 (assumed to be the class dimension)
            prob = F.softmax(logits, dim=1)

            # identify the predicted class and the probability
            probabilities, predicted = torch.max(prob.cpu(), 1)

            # put the probability of the predicted label to a list
            output_prob.extend(probabilities.tolist())

            # put all predicted labels to a list
            output_label.extend([ind2label[pred] for pred in predicted.tolist()])

    return output_label, output_prob

## Load the label-to-index dictionary and create index-to-label dictionary

In [7]:
## please change the file path to your corresponding path
# Read the label-to-index mapping. The context manager closes the file even if
# JSON parsing fails.
with open(os.path.join("./narrow_country_b/", "country2ind.json"), encoding="utf-8") as tmp_file:
    lab2ind = json.load(tmp_file)
# create index-to-label dictionary
ind2label = {v: k for k, v in lab2ind.items()}

In [8]:
# Sanity check: number of entries in the index-to-label dictionary.
print("Number of labels: ", len(ind2label))

Number of labels:  11


## Specify the maximum sequence length for the input.

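We use a maximum sequence length of 50 word-piece tokens in fine-tuning. The special `[CLS]` and `[SEP]` tokens are added on top of this limit, so each encoded input has 52 positions.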

In [9]:
# Maximum number of word-piece tokens kept per tweet. data_prepare adds [CLS] and
# [SEP] on top of this, so the encoded tensors are max_seq_length + 2 wide.
max_seq_length = 50

## Load the fine-tuned MARBERT checkpoint and vocabulary

In [10]:
# Directory that contains config.json, vocab.txt, and pytorch_model.bin
model_path = "./narrow_country_b/"
## please change the file path to your corresponding path

# Load the tokenizer from the checkpoint directory, without lower-casing the input
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=False)

# Load the fine-tuned MARBERT model with one output class per entry in lab2ind
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(lab2ind))

# Send the model to the selected device (CPU or GPU); as the last expression of
# the cell, this also displays the model architecture.
model.to(device)

I1113 21:37:07.543288 47090899140096 tokenization_utils_base.py:1167] Model name './narrow_country_b/' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, TurkuNLP/bert-base-finnish-cased-v1, TurkuNLP/bert-base-finnish-uncased-v1, wietsedv/bert-base-dutch-cased). Assuming './narrow_country_b/' is a path, a model identifier, or url to a directory containing tokenizer files.
I1113 21:37:07.544422 47090899140096 tokenization_utils_base.py:1197] Didn't find file ./narrow_country_b/added_tokens.json. We won't load it.
I1113 21:37:07.545130 47090899140096 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

## Load inference dataset

In [11]:
# Select a batch size
batch_size = 32

# Encode the tweets with the function defined above
## please change the file path to your corresponding path
inf_inputs, inf_masks = data_prepare(
    "./narrow_country_b/inference_sample.tsv",
    tokenizer,
    max_len=max_seq_length,
)

# Wrap the tensors in a dataset and iterate over it in file order (no shuffling),
# so that predictions line up with the input rows.
inf_data = TensorDataset(inf_inputs, inf_masks)
inf_dataloader = DataLoader(inf_data, batch_size=batch_size, shuffle=False)
                                     

Data size  (10, 1)
Tokenize the first sentence:
 ['[CLS]', 'ياغالي', '##ين', 'علي', 'انتو', '##ش', '##ذا', 'عمري', 'وانتم', 'زه', '##ي', 'عينيا', '##يلي', 'بعيدين', 'بعد', 'الوط', '##ا', 'عالش', '##مس', 'ونجوم', 'الثريا', '##ويز', '##هي', 'دليلي', 'لما', 'نجي', '##كم', 'مشتاق', 'ليكم', 'قبل', 'المساء', 'ودي', 'نجي', '##كم', '[SEP]']
Index numbers of the first sentence:
 [2, 19789, 1943, 1998, 6153, 1008, 1999, 3602, 4213, 5880, 1015, 37479, 10162, 44617, 2112, 3638, 1011, 37700, 2297, 41830, 41867, 29312, 2376, 82750, 2273, 29446, 2002, 6183, 22199, 2331, 4469, 3990, 29446, 2002, 3]
Index numbers of the first sentence after padding:
 [    2 19789  1943  1998  6153  1008  1999  3602  4213  5880  1015 37479
 10162 44617  2112  3638  1011 37700  2297 41830 41867 29312  2376 82750
  2273 29446  2002  6183 22199  2331  4469  3990 29446  2002     3     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


In [12]:
# Run the model over the inference dataloader; this returns one predicted label
# and one probability per sample.
output_label, output_prob = inference(model, inf_dataloader, ind2label)

In [13]:
# Raw results: predicted labels, then the probability of each predicted label.
print(output_label)
print(output_prob)

['ma', 'ae', 'sa', 'lb', 'sa', 'lb', 'sa', 'ae', 'sa', 'ae']
[0.3575429320335388, 0.6929617524147034, 0.3960374593734741, 0.7135670185089111, 0.4990292489528656, 0.7554435133934021, 0.9676510095596313, 0.9226142168045044, 0.4678232669830322, 0.954530656337738]


In [14]:
# Report each prediction with its probability, numbering the samples from 1.
for sample_no, (label, confidence) in enumerate(zip(output_label, output_prob), start=1):
    print("Sample {} is predicted as {} with {} of confidence.".format(sample_no, label, confidence))

Sample 1 is predicted as ma with 0.3575429320335388 of confidence.
Sample 2 is predicted as ae with 0.6929617524147034 of confidence.
Sample 3 is predicted as sa with 0.3960374593734741 of confidence.
Sample 4 is predicted as lb with 0.7135670185089111 of confidence.
Sample 5 is predicted as sa with 0.4990292489528656 of confidence.
Sample 6 is predicted as lb with 0.7554435133934021 of confidence.
Sample 7 is predicted as sa with 0.9676510095596313 of confidence.
Sample 8 is predicted as ae with 0.9226142168045044 of confidence.
Sample 9 is predicted as sa with 0.4678232669830322 of confidence.
Sample 10 is predicted as ae with 0.954530656337738 of confidence.
