In [1]:
import torch

# Select the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda:0")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1060 6GB


## 2. Loading Data

In [2]:
import pickle as pkl
import pandas as pd

In [3]:
# Load the pickled table of news sentences.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('./data/news_pickle/all_sentence_0716.pkl', 'rb') as sentence_file:
    df_all = pkl.load(sentence_file)

In [4]:
# Report how many rows were loaded, then preview ten random ones.
n_sentences = len(df_all)
print('Number of training sentences: {:,}\n'.format(n_sentences))

df_all.sample(10)

Number of training sentences: 110,486



Unnamed: 0,sentences
82938,While the government was preparing to ban mass...
43227,PARIS (Reuters) - An 80-year-old Chinese touri...
106927,"Three Formula One Grand Prix, in Azerbaijan, S..."
19374,"Today the University of Washington in Seattle,..."
113350,"""I'm not just asking you to help New York,"" he..."
7710,There were 211 people on the initial manifest ...
44633,"Rospotrebnadzor, the country’s consumer safety..."
94668,"To avert a spike in unemployment, the governme..."
115311,"China will ban entry to all foreigners, includ..."
61872,Health officials and front-line medical staff ...


In [5]:
# Pull the raw sentence strings out of the table as a NumPy array.
sentences = df_all['sentences'].values

In [6]:
# Inspect the sentence array (the notebook displays its repr).
sentences

array(['The new Chinese virus which has already spread abroad "is still preventable and controllable", China says.',
       'Its National Health Commission warned, however, that close monitoring was needed given the source, transmission and mutation methods were unknown.',
       'Two people are known to have died from the respiratory illness which appeared in Wuhan city in December.',
       ...,
       'On Tuesday, the presidency announced it would release 1,420 prisoners in an amnesty to alleviate crowding in prisons.',
       'According to the statement, President Kais Saied also ordered increased sanitation measures in jails.',
       'Saied last week ordered the army to deploy in the streets to force people to respect the lockdown.Â\xa0'],
      dtype=object)

# 3. Tokenization & Input Formatting

In [7]:
from transformers import BertTokenizer

# Tokenizer for the pretrained uncased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

## Add Entity Tokens to the Tokenizer

In [8]:
# Load the pickled collection of entity strings.
# NOTE(review): this path uses './Data' while the sentence pickle is read from
# './data' — confirm both resolve on a case-sensitive file system.
with open('./Data/wiki_entity/entity_list_0716.pkl','rb') as entity_file:
    entities = pkl.load(entity_file)

In [9]:
# Keep only the entities made of a single whitespace-delimited word.
new_entity = [name for name in entities if len(name.split()) == 1]

In [10]:
# Register the single-word entities with the tokenizer; add_tokens reports how
# many tokens were newly added.
# NOTE(review): a model used with this tokenizer presumably needs its token
# embeddings resized to the new vocabulary size — confirm downstream.
num_added_toks = tokenizer.add_tokens(new_entity)

In [11]:
# Vocabulary ids of the entity tokens, in the same order as new_entity.
encoded_entity = tokenizer.convert_tokens_to_ids(new_entity)

In [None]:
# Sanity-check the tokenizer on one sample sentence: raw text, tokens, ids.
sample_sentence = sentences[6]
sample_tokens = tokenizer.tokenize(sample_sentence)
sample_token_ids = tokenizer.convert_tokens_to_ids(sample_tokens)

print('Original: ', sample_sentence)
print('Tokenized: ', sample_tokens)
print('Token IDs: ', sample_token_ids)

In [None]:
# Persist the tokenizer (including the added entity tokens) as a pickle.
with open('./Data/tokenizer.pkl','wb') as tokenizer_file:
    pkl.dump(tokenizer, tokenizer_file)

## Tokenize Dataset

In [None]:
from tqdm import tqdm_notebook
import copy
import random
from random import randint

In [None]:
# Masked-LM preprocessing settings.
MAX_LEN = 16     # max_length handed to encode_plus (sequences are padded to it)
NUM_MASKS = 2    # number of [MASK] tokens placed in every sentence
MASK_SEED = 42   # seed of the generator that picks the fallback positions


def _pick_mask_positions(token_ids, seq_len, entity_ids, rng, num_masks=NUM_MASKS):
    """Choose the token positions of one encoded sentence to mask.

    Entity tokens are preferred: the first ``num_masks`` positions whose id is
    in ``entity_ids`` are taken. If fewer entity tokens exist, the remainder is
    drawn at random from the other content positions. Position 0 ([CLS]) and
    position ``seq_len - 1`` ([SEP]) are never selected, and neither is padding.

    Args:
        token_ids: list of int token ids for one sentence (specials and padding included).
        seq_len: number of non-padding tokens, i.e. the attention-mask sum.
        entity_ids: set of token ids that count as entities.
        rng: ``random.Random`` instance used for the fallback draw.
        num_masks: how many positions to return.

    Returns:
        A list of ``num_masks`` distinct positions, or ``None`` when the
        sentence has fewer than ``num_masks`` content tokens.
    """
    content_positions = range(1, seq_len - 1)

    positions = []
    for pos in content_positions:
        if token_ids[pos] in entity_ids:
            positions.append(pos)
            if len(positions) == num_masks:
                return positions

    missing = num_masks - len(positions)
    # Exclude what is already masked so the random draw never lands on it.
    candidates = [pos for pos in content_positions if pos not in positions]
    if len(candidates) < missing:
        return None
    return positions + rng.sample(candidates, missing)


# Look up the [MASK] id from the tokenizer instead of hard-coding 103.
mask_token_id = tokenizer.mask_token_id
# Set membership is O(1); the entity list can be large.
entity_ids = set(encoded_entity)
# Seed once, outside the loop: reseeding per sentence would give every
# sentence of the same length the same "random" positions.
rng = random.Random(MASK_SEED)

input_ids = []
attention_masks = []
label_masks = []

for sent in tqdm_notebook(sentences):
    # Best effort: a sentence that cannot be encoded is reported and skipped.
    try:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
    except Exception as err:
        print('Skipping sentence (encoding failed: {}): {}'.format(err, sent))
        continue

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']

    # Labels keep the unmasked ids.
    # NOTE(review): they hold the original id at every position, not only the
    # masked ones — confirm the training loss expects that.
    label_mask = input_id.clone()

    seq_len = int(attention_mask[0].sum())
    positions = _pick_mask_positions(
        input_id[0].tolist(), seq_len, entity_ids, rng, NUM_MASKS
    )
    if positions is None:
        print('Skipping sentence (fewer than {} maskable tokens): {}'.format(NUM_MASKS, sent))
        continue

    for pos in positions:
        input_id[0, pos] = mask_token_id

    input_ids.append(input_id)
    label_masks.append(label_mask)
    attention_masks.append(attention_mask)


# Stack the per-sentence (1, MAX_LEN) tensors into (num_sentences, MAX_LEN).
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.cat(label_masks, dim=0)


print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print('label:', labels[0])

In [None]:
# Write the three tensors back-to-back into one pickle file. To read them,
# call pkl.load three times on the same handle, in this order:
# input_ids, attention_masks, labels.
with open('./Data/encoded_token/en_tokens_50k.pkl','wb') as out_file:
    for tensor in (input_ids, attention_masks, labels):
        pkl.dump(tensor, out_file)