In [58]:
import pandas as pd
import torch
from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer,
                          XLNetConfig,XLNetForSequenceClassification,XLNetTokenizer,
                          RobertaConfig,RobertaForSequenceClassification,RobertaTokenizer
                          )
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from box import Box

In [15]:
# Both splits are header-less, tab-separated files with the same four columns,
# so the column names are declared once and reused for each read.
tsv_columns = ['ids', 'label', 'alpha', 'sentence']
train = pd.read_csv('/home/tony/NLP_task/task1/train.tsv', sep='\t', header=None, names=tsv_columns)
val = pd.read_csv('/home/tony/NLP_task/task1/val.tsv', sep='\t', header=None, names=tsv_columns)

In [12]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,ids,label,alpha,sentence
0,703,0,a,Yet when she and her husband put the house up ...
1,8632,0,a,We will compare Wyndham Destinations to relate...
2,3699,0,a,"With the appropriate safeguards, controls, and..."
3,7543,0,a,Thomas Cook had a rescue deal in place with in...
4,9266,0,a,HomeTrading for BeginnersMarket AnalysisThe St...


In [16]:
# Pull the raw text and the target column out of each split as arrays.
train_sents = train['sentence'].values
train_labels = train['label'].values
val_sents = val['sentence'].values
val_labels = val['label'].values

In [67]:
import transformers

# 'xlnet-base-cased' is a case-sensitive checkpoint, so the text must keep its
# original casing: do_lower_case is therefore False here (it was True, which
# lower-cased every sentence before tokenization).
tokenizer_class = transformers.XLNetTokenizer
tokenizer = tokenizer_class.from_pretrained('xlnet-base-cased', do_lower_case=False)

I0512 21:45:16.900147 139874644301568 tokenization_utils.py:1011] loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at /home/tony/.cache/torch/transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8


In [25]:
def sent_tokenize(sentences, tokenizer, MAX_LEN=128):
    """Encode sentences into stacked input-id and attention-mask tensors.

    Every item is converted with ``str()`` and passed to
    ``tokenizer.encode_plus`` with special tokens added, ``max_length`` set to
    ``MAX_LEN``, padding to that length requested, and PyTorch tensors
    returned.

    Args:
        sentences: Iterable of sentences. Non-string items are stringified
            (so a missing value would be encoded as its ``str()`` form).
        tokenizer: Object exposing ``encode_plus`` whose result contains
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        MAX_LEN: Length forwarded to the tokenizer as ``max_length``.

    Returns:
        ``(ids, attention_masks)``: the per-sentence tensors concatenated
        along dimension 0. For an empty ``sentences`` input, two empty int64
        tensors of shape ``(0, MAX_LEN)`` are returned.
    """
    ids, attention_masks = [], []
    for sent in sentences:
        encoded_sent = tokenizer.encode_plus(
            str(sent),
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    if not ids:
        # torch.cat rejects an empty list, so build the empty result directly.
        return (torch.empty((0, MAX_LEN), dtype=torch.long),
                torch.empty((0, MAX_LEN), dtype=torch.long))

    return torch.cat(ids, dim=0), torch.cat(attention_masks, dim=0)

def model_and_tokenizer(model_type, pretrain_weight, num_labels=2, do_lower_case=None):
    """Load a sequence-classification model and its matching tokenizer.

    Args:
        model_type: One of ``'bert'``, ``'xlm'``, ``'xlnet'``,
            ``'distilbert'`` or ``'roberta'``. The historical misspelling
            ``'robetra'`` is still accepted.
        pretrain_weight: Checkpoint name or path passed to both
            ``from_pretrained`` calls.
        num_labels: Number of output classes for the classification head.
        do_lower_case: Value forwarded to the tokenizer. When ``None`` it is
            ``False`` for checkpoints whose name marks them as cased (contains
            ``'cased'`` but not ``'uncased'``) and ``True`` otherwise.

    Returns:
        ``(model, tokenizer)`` as produced by the selected classes'
        ``from_pretrained`` methods.

    Raises:
        KeyError: If ``model_type`` is not a supported key.
    """
    MODEL_CLASSES = {
        'bert': (BertForSequenceClassification, BertTokenizer),
        'xlm': (XLMForSequenceClassification, XLMTokenizer),
        'xlnet': (XLNetForSequenceClassification, XLNetTokenizer),
        'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer),
        'roberta': (RobertaForSequenceClassification, RobertaTokenizer),
        # Misspelled key from the earlier version, kept so old calls still work.
        'robetra': (RobertaForSequenceClassification, RobertaTokenizer),
    }
    if model_type not in MODEL_CLASSES:
        raise KeyError(
            'unknown model_type {!r}; expected one of {}'.format(
                model_type, sorted(MODEL_CLASSES)
            )
        )
    model_class, tokenizer_class = MODEL_CLASSES[model_type]

    if do_lower_case is None:
        # Lower-casing a case-sensitive checkpoint discards information the
        # model relies on, so only skip it when the name explicitly says
        # "cased"; every other name keeps the previous default of True.
        weight_name = str(pretrain_weight).lower()
        explicitly_cased = 'cased' in weight_name and 'uncased' not in weight_name
        do_lower_case = not explicitly_cased

    model = model_class.from_pretrained(
        pretrain_weight,
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    tokenizer = tokenizer_class.from_pretrained(pretrain_weight, do_lower_case=do_lower_case)
    return model, tokenizer

In [23]:
# Instantiate the XLNet classifier together with its tokenizer.
model, tokenizer = model_and_tokenizer(model_type='xlnet', pretrain_weight='xlnet-base-cased')

I0512 20:25:57.541467 139874644301568 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at /home/tony/.cache/torch/transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.69e5e35e0b798cab5e473f253752f8bf4d280ee37682281a23eed80f6e2d09c6
I0512 20:25:57.542689 139874644301568 configuration_utils.py:321] Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "

In [36]:
# Encode both splits into input-id / attention-mask tensors.
train_inputs, train_masks = sent_tokenize(sentences=train_sents, tokenizer=tokenizer)
val_inputs, val_masks = sent_tokenize(sentences=val_sents, tokenizer=tokenizer)

# Labels become 64-bit integer tensors.
train_labels = torch.tensor(train_labels, dtype=torch.int64)
val_labels = torch.tensor(val_labels, dtype=torch.int64)

In [69]:
# Sanity check: one shape per line for ids, masks and labels.
print(train_inputs.size(), train_masks.size(), train_labels.size(), sep='\n')

torch.Size([8669, 128])
torch.Size([8669, 128])
torch.Size([8669])


In [40]:
# Mini-batch size used by the training loader.
BATCH_SIZE = 16

# Bundle ids, masks and labels so each dataset item is an aligned triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)

# Draw training examples in random order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
)

In [47]:
# Pull a single batch from the training loader to inspect its contents.
iterator = iter(train_dataloader)
batch = next(iterator)
# Element 1 is the attention-mask tensor (second tensor passed to TensorDataset).
# NOTE(review): the displayed masks begin with 0s and end with 1s, which
# suggests the tokenizer pads on the left — confirm this is expected.
batch[1]

tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])

In [49]:
# Split the batch into its three tensors: input ids, attention mask, labels.
b_input_ids, b_mask, b_label = batch

In [54]:
# Shapes of the unpacked batch tensors, one per line.
print(b_input_ids.size(), b_mask.size(), b_label.size(), sep='\n')

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16])
