In [None]:
import argparse
import io
import json
import os
from time import time

import GPUtil
import pandas as pd
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm, trange
from transformers import *


## Load the prediction samples from a TSV file

We provide a sample file to show the format. 

In [None]:
# Read the tab-separated sample file; its first row is treated as the header.
# Please change the file path to your corresponding path.
df = pd.read_csv(
    "./narrow_country_b/inference_sample.tsv",
    sep='\t',
    header=0,
)

In [None]:
# Preview the first rows to confirm the sample file was parsed as expected.
df.head()

## Specify the device

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('Found GPU')
# Number of CUDA devices visible to PyTorch (0 on a CPU-only machine).
n_gpu = torch.cuda.device_count()
print(device)

## Define a function for data preparation

In [None]:
def data_prepare(file_path, tokenizer, max_len = 32):
    '''
    Read tweets from a tsv file and encode them as fixed-length BERT inputs.

    file_path: the path to input file.
                The input must be a tsv file that includes only one column that is tweet text content.
                The first row must be header of column.
    tokenizer: BERT tokenizer; only tokenize() and convert_tokens_to_ids() are used
    max_len: maximal number of tokens kept per tweet, not counting [CLS] and [SEP]

    Returns a tuple (inputs, masks):
        inputs: long tensor of shape (number of tweets, max_len + 2) holding token ids,
                padded on the right with the id of the [PAD] token
        masks: float tensor of the same shape, 1.0 for real tokens and 0.0 for padding
    '''
    # [CLS] + at most max_len tokens + [SEP]
    seq_len = max_len + 2

    # In predict mode we load one column (i.e., text). Every cell is read verbatim as a
    # string: with pandas' default NA handling a tweet such as "NA" or "null" would become
    # NaN (and an all-numeric column would become integers), which breaks the string
    # concatenation below.
    df = pd.read_csv(file_path, delimiter='\t', header=0, names=['content'],
                     dtype=str, keep_default_na=False)
    print("Data size ", df.shape)

    # Create tweet lists
    contents = df.content.fillna('').tolist()

    # We need to add a special token at the beginning for BERT to work properly.
    content = ["[CLS] " + text for text in contents]

    # Convert the text into tokens of the BERT vocabulary, truncate sequences that are
    # longer than the pre-defined maximal length, then add the closing special token.
    tokenized_texts = [tokenizer.tokenize(text)[:max_len + 1] + ['[SEP]'] for text in content]

    # Index of the [PAD] token, used to fill sequences up to seq_len
    pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]

    input_ids = []
    attention_masks = []
    for position, tokens in enumerate(tokenized_texts):
        # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
        ids = list(tokenizer.convert_tokens_to_ids(tokens))[:seq_len]
        n_pad = seq_len - len(ids)

        if position == 0:
            print ("Tokenize the first sentence:\n", tokens)
            print ("Index numbers of the first sentence:\n", ids)

        input_ids.append(ids + [pad_ind] * n_pad)
        # The mask is derived from the true length rather than from "id > 0", so it stays
        # correct when [PAD] is not id 0 or when a real token happens to have id 0.
        attention_masks.append([1.0] * len(ids) + [0.0] * n_pad)

    if input_ids:
        print ("Index numbers of the first sentence after padding:\n", input_ids[0])

    # Convert all of our data into torch tensors, the required datatype for our model.
    # The explicit reshape keeps the (rows, seq_len) shape even when the file has no rows.
    inputs = torch.tensor(input_ids, dtype=torch.long).reshape(len(input_ids), seq_len)
    masks = torch.tensor(attention_masks, dtype=torch.float).reshape(len(attention_masks), seq_len)

    return inputs, masks


## Define a function for inference

In [None]:
def inference(model, iterator, ind2label, device=None):
    '''
    Predict a label and its probability for every sample produced by iterator.

    model: sequence classifier; called as model(input_ids, input_mask) and expected to
           return a sequence whose first element is the logits (one row per sample)
    iterator: iterable of (input_ids, input_mask) tensor batches, e.g. a DataLoader
    ind2label: dictionary mapping a class index to its label
    device: device the batches are moved to; when None, the device of the model's
            first parameter is used (CPU if the model has no parameters)

    Returns a tuple (output_label, output_prob): two lists, in iteration order, holding
    the predicted label of each sample and the softmax probability of that label.
    '''
    if device is None:
        # Follow the model's own placement instead of depending on a notebook-level global.
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    # output lists
    output_prob = []
    output_label = []

    with torch.no_grad():
        for batch in iterator:
            # Move the batch to the target device and unpack it
            input_ids, input_mask = (t.to(device) for t in batch)

            outputs = model(input_ids, input_mask)
            logits = outputs[0]

            # identify the predicted class and its probability
            prob = F.softmax(logits, dim=1)
            probabilities, predicted = torch.max(prob, dim=1)

            # put the probability of the predicted label to a list
            output_prob.extend(probabilities.cpu().tolist())

            # put all predicted labels to a list
            output_label.extend(ind2label[pred] for pred in predicted.cpu().tolist())

    return output_label, output_prob

## Load the label-to-index dictionary and create index-to-label dictionary

In [None]:
# Load the label-to-index dictionary saved with the checkpoint.
## please change the file path to your corresponding path
# The context manager closes the file even if the JSON cannot be parsed.
with open(os.path.join("./narrow_country_b/", "label2ind.json"), encoding="utf-8") as tmp_file:
    lab2ind = json.load(tmp_file)

# create index-to-label dictionary
ind2label = {v: k for k, v in lab2ind.items()}

# Two labels sharing one index would silently drop a class from the inverse mapping.
assert len(ind2label) == len(lab2ind), "label2ind.json maps several labels to the same index"

In [None]:
# Sanity check: report how many classes the index-to-label dictionary contains.
print("Number of labels: ", len(ind2label))

## Specify the maximum sequence length for the input.

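We use a maximum sequence length of 50 tokens in fine-tuning. The limit is applied after tokenization, so it counts tokens rather than whitespace-separated words.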

In [None]:
# Maximum number of tokens kept per tweet; passed to data_prepare() as max_len.
max_seq_length = 50

## Load the fine-tuned MARBERT checkpoint and vocabulary

In [None]:
# Directory that contains config.json, vocab.txt, and pytorch_model.bin.
# Please change the file path to your corresponding path.
model_path = "./narrow_country_b/"

# Tokenizer built from the checkpoint's vocabulary; input text is not lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    model_path,
    do_lower_case=False,
)

# Fine-tuned MARBERT classifier with one output per entry of lab2ind.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(lab2ind),
)

# Move the model to the selected device (CPU or GPU).
model.to(device)

## Load inference dataset

In [None]:
# Select a batch size
batch_size = 32

# Encode the inference file with the data_prepare helper.
# Please change the file path to your corresponding path.
inf_inputs, inf_masks = data_prepare(
    "./narrow_country_b/inference_sample.tsv",
    tokenizer,
    max_seq_length,
)

# Pair ids with masks and serve them in file order (no shuffling), so the
# predictions line up with the rows of the input file.
inf_data = TensorDataset(inf_inputs, inf_masks)
inf_dataloader = DataLoader(inf_data, batch_size=batch_size, shuffle=False)
                                     

In [None]:
# Run the model over the whole inference set: one predicted label and its softmax
# probability per sample, in input order (the dataloader is not shuffled).
output_label, output_prob = inference(model, inf_dataloader, ind2label)

In [None]:
# Show the raw prediction lists: labels on the first line, probabilities on the second.
print(output_label, output_prob, sep="\n")

In [None]:
# Report every prediction together with its 1-based sample number.
for sample_no, (label, confidence) in enumerate(zip(output_label, output_prob), start=1):
    print("Sample {} is predicted as {} with {} of confidence.".format(sample_no, label, confidence))