# 事前学習済みのベクトルを得る

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange

import run_classifier

from pytorch_pretrained_bert.modeling import BertForSequenceClassification

from pytorch_pretrained_bert.tokenization import BertTokenizer

In [2]:
# Run configuration shared by the cells below.
args = dict(
    bert_model='bert-base-uncased',  # checkpoint name given to both from_pretrained() calls
    local_rank=-1,                   # -1 selects RandomSampler; also part of the model cache dir name
    data_dir='glue_data/ARD/',       # directory handed to the processor to read training examples
    max_seq_length=128,              # passed to convert_examples_to_features
    train_batch_size=32,             # DataLoader batch size
)

In [3]:
# Instantiate the ARD dataset processor provided by the run_classifier module;
# it is used below to read the training examples and the label list.
processor = run_classifier.ArdProcessor()

In [4]:
# Load the tokenizer (vocabulary) that belongs to the checkpoint named in
# args['bert_model'], so tokenization matches the pretrained model.
tokenizer = BertTokenizer.from_pretrained(args['bert_model'])

12/07/2018 21:43:54 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/watarukudo/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [5]:
# Load the pretrained checkpoint into a sequence-classification model.
# The downloaded archive is cached under ./models/pretrained_model_<local_rank>.
model = BertForSequenceClassification.from_pretrained(args["bert_model"],
                cache_dir='./models/pretrained_model_{}'.format(args["local_rank"]))

12/07/2018 21:43:55 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at ./models/pretrained_model_-1/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
12/07/2018 21:43:55 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file ./models/pretrained_model_-1/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/hn/53g2k8zj2zx2p_17pryml5zh0000gn/T/tmphtzhqh0l
12/07/2018 21:43:58 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 1

In [6]:
# Read the training examples from the data directory via the processor.
train_examples = processor.get_train_examples(args["data_dir"])

# Label set as reported by the processor; passed to feature conversion below.
label_list = processor.get_labels()

In [7]:
# Tokenize every example and convert it into model-ready features.
# Each feature exposes input_ids, input_mask, segment_ids and label_id,
# which the next cell turns into tensors.
train_features = run_classifier.convert_examples_to_features(
            train_examples, label_list, args['max_seq_length'], tokenizer)

12/07/2018 21:44:04 - INFO - run_classifier -   *** Example ***
12/07/2018 21:44:04 - INFO - run_classifier -   guid: train-1
12/07/2018 21:44:04 - INFO - run_classifier -   tokens: [CLS] this is an excellent camera . i have the powers ##hot a ##60 and have worked a bit with the a ##70 . the only difference on these cameras is that the a ##70 has more mega ##pi ##x ##els , other than that they are the same . the cameras offer various features , each with several different ways to manipulate the camera , including shutter speed and aperture . for the begin ##ner , it offers a few pre ##set options that allow a quick switch to the desired need of the user . the manual is very clear and easy to apply . i would recommend this camera to someone wanting a point and click camera , as well as a more advanced [SEP]
12/07/2018 21:44:04 - INFO - run_classifier -   input_ids: 101 2023 2003 2019 6581 4950 1012 1045 2031 1996 4204 12326 1037 16086 1998 2031 2499 1037 2978 2007 1996 1037 19841 1012 1

12/07/2018 21:44:04 - INFO - run_classifier -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/07/2018 21:44:04 - INFO - run_classifier -   label: 9 (id = 9)
12/07/2018 21:44:04 - INFO - run_classifier -   *** Example ***
12/07/2018 21:44:04 - INFO - run_classifier -   guid: train-5
12/07/2018 21:44:04 - INFO - run_classifier -   tokens: [CLS] i tried three different head ##phones : sen ##nh ##eis ##er 280 pro ##so ##und was very good , especially after some burn - in . but very uncomfortable . way to tight - - really cl ##amp down hard on your head . this produces great isolation , and they ' re well built , but this is useless if you can ' t wear them . note : i don ' t have a big head . audio tech ##nica at ##h - m3 ##0 ##the sound on these was just okay , and t

In [8]:
# Stack each per-example feature field into one int64 tensor covering the
# whole training set (one row per example).
all_input_ids, all_input_mask, all_segment_ids, all_label_ids = (
    torch.tensor([getattr(feature, field) for feature in train_features],
                 dtype=torch.long)
    for field in ("input_ids", "input_mask", "segment_ids", "label_id")
)

In [9]:
# Bundle the four tensors into a single dataset; each item is the tuple
# (input_ids, input_mask, segment_ids, label_id) for one example.
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

In [10]:
# Choose how examples are drawn from train_data:
# local_rank == -1 means a plain single-process run with random shuffling,
# any other value selects the distributed sampler.
if args['local_rank'] == -1:
    train_sampler = RandomSampler(train_data)
else:
    # DistributedSampler is not among the names imported at the top of the
    # notebook, so this branch used to fail with NameError; import it here.
    from torch.utils.data.distributed import DistributedSampler
    train_sampler = DistributedSampler(train_data)

In [11]:
# Iterate over train_data in batches of args["train_batch_size"], in the
# order produced by train_sampler.
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args["train_batch_size"])

In [12]:
# All tensors are placed on the CPU.
# NOTE(review): the model itself is never moved with .to(device) in the
# visible cells, so switching this to a GPU device would also require that.
device = torch.device("cpu")

In [14]:
# Materialise every mini-batch from the loader (moved to `device`) so that
# individual batches can be inspected in later cells.
batches = []
# NOTE(review): pooled_outputs stays empty in this cell because the per-batch
# forward pass that would fill it is commented out below.
pooled_outputs = []
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
    batch = tuple(t.to(device) for t in batch)
    batches.append(batch)
    # Disabled per-batch forward pass, kept for reference:
    #input_ids, input_mask, segment_ids, label_ids = batch
    #pooled_output = model.get_pooled_output(input_ids, segment_ids, input_mask, label_ids)
    #pooled_outputs.append(pooled_output)


Iteration:   0%|          | 0/1250 [00:00<?, ?it/s][A
Iteration:   6%|▌         | 69/1250 [00:00<00:01, 686.31it/s][A
Iteration:  16%|█▌        | 194/1250 [00:00<00:01, 792.69it/s][A
Iteration:  26%|██▌       | 321/1250 [00:00<00:01, 893.37it/s][A
Iteration:  36%|███▋      | 454/1250 [00:00<00:00, 990.61it/s][A
Iteration:  46%|████▌     | 571/1250 [00:00<00:00, 1037.72it/s][A
Iteration:  58%|█████▊    | 731/1250 [00:00<00:00, 1159.64it/s][A
Iteration:  71%|███████   | 890/1250 [00:00<00:00, 1260.88it/s][A
Iteration:  83%|████████▎ | 1034/1250 [00:00<00:00, 1308.86it/s][A
Iteration:  96%|█████████▌| 1195/1250 [00:00<00:00, 1386.60it/s][A
Iteration: 100%|██████████| 1250/1250 [00:00<00:00, 1330.57it/s][A

In [17]:
# Unpack the first collected batch; the order matches the TensorDataset
# columns (input_ids, input_mask, segment_ids, label_ids).
input_ids, input_mask, segment_ids, label_ids = batches[0]

In [18]:
%time pooled_output = model.get_pooled_output(input_ids, segment_ids, input_mask, label_ids)

CPU times: user 20.8 s, sys: 3.08 s, total: 23.9 s
Wall time: 11.9 s


(32, 768)

In [None]:
# Grab the raw tensors backing the dataset, in the order they were passed
# to TensorDataset.
train_data_tensors = train_data.tensors

In [None]:
# Place every dataset tensor on `device`. The name is rebound to the moved
# tuple, so this cell reads its own previous value.
train_data_tensors =  tuple(t.to(device) for t in train_data_tensors)

In [None]:
# Unpack the full-dataset tensors. This rebinds the names that previously
# held the first batch only.
input_ids, input_mask, segment_ids, label_ids = train_data_tensors

In [None]:
# Compute pooled vectors for the whole training set.
# Pushing every example through the model in one call is impractical (a
# single batch of 32 already takes ~12 s wall time), so process the data in
# mini-batches. Dropout and autograd are disabled because this is feature
# extraction, not training.
model.eval()
pooled_chunks = []
with torch.no_grad():
    for start in trange(0, input_ids.size(0), args["train_batch_size"], desc="Pooling"):
        stop = start + args["train_batch_size"]
        pooled_chunks.append(model.get_pooled_output(
            input_ids[start:stop], segment_ids[start:stop],
            input_mask[start:stop], label_ids[start:stop]))
# Assumes get_pooled_output returns one (batch, hidden) tensor per call — TODO confirm.
pooled_output = torch.cat(pooled_chunks, dim=0)

In [None]:
from sklearn.model_selection import train_test_split
# The package name was misspelled ("sklean"), and scikit-learn has no `MLP`
# class: the multi-layer perceptron classifier is MLPClassifier.
from sklearn.neural_network import MLPClassifier

# Keep the short name the original import tried to bind.
MLP = MLPClassifier

In [None]:
# Convert features and labels to NumPy arrays for scikit-learn.
# torch tensors have no .to_numpy(); detach from any autograd graph and move
# to host memory before calling .numpy(). If the pooled output is already an
# array-like rather than a tensor, fall back to np.asarray.
if torch.is_tensor(pooled_output):
    X = pooled_output.detach().cpu().numpy()
else:
    X = np.asarray(pooled_output)
y = label_ids.detach().cpu().numpy()

In [None]:
# Hold out part of the pooled vectors for evaluation. train_test_split needs
# the arrays to split (calling it with no arguments raises); the default
# test fraction is used, and the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)