https://ripshun.com/2020/11/26/%E5%AE%9E%E6%88%98-%E4%BD%BF%E7%94%A8bert%E5%AE%9E%E7%8E%B0%E5%A4%9A%E5%88%86%E7%B1%BB/

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook are given relative to ./drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
path_train = './drive/MyDrive/AI_&_EdgeComputing_Program/NLP/NLP專案/WSDM-testing dataset/train.csv'
path_test = './drive/MyDrive/AI_&_EdgeComputing_Program/NLP/NLP專案/WSDM-testing dataset/test.csv'

# Data source: https://www.kaggle.com/c/fake-news-pair-classification-challenge/data
# Model: BERT + linear classifier
df_train = pd.read_csv(path_train)

# Mask of rows to discard: either title is missing, or the second title is
# an empty string or the literal '0'.
empty_title = (
    df_train[['title2_zh', 'title1_zh']].isnull().any(axis=1)
    | df_train['title2_zh'].isin(['', '0'])
)
df_train = df_train[~empty_title]

# Drop samples whose titles are longer than MAX_LENGTH characters.
MAX_LENGTH = 30
for column in ('title1_zh', 'title2_zh'):
    df_train = df_train[df_train[column].map(len) <= MAX_LENGTH]

# Train on only 1% of the data to show how far BERT gets with few samples.
SAMPLE_FRAC = 0.01
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=6666)

# Keep only the columns the model needs, on a fresh 0..n-1 index.
df_train = df_train[['title1_zh', 'title2_zh', 'label']].reset_index(drop=True)
df_train.columns = ['text_a', 'text_b', 'label']

# Save as TSV so the PyTorch dataset can read it back.
df_train.to_csv("train.tsv", sep="\t", index=False)

print("训练样本数量：", len(df_train))
df_train.head()

训练样本数量： 2657


Unnamed: 0,text_a,text_b,label
0,晚上吃苹果就成毒苹果了吗,早上吃金苹果，晚上吃毒苹果，苹果真不能晚上吃吗？,agreed
1,吃酱油会变黑？伤口会留疤？,经常吃酱油会变黑，这件事终于有答案了！,agreed
2,加湿器加自来水堪比雾霾,华为金立OPPO：我们手机明年要涨价！网友：有小米就够了,unrelated
3,有谁希望丧尸病毒爆发，外星人入侵，世界巨,丧尸病毒爆发之后 逃上一个小岛是否是一个最好的方案,unrelated
4,山药好吃又营养，这么做还能补充维C、降血压,常吃这3种食物，把血液垃圾清理的一干二净，还能降低血压,unrelated


In [None]:
# Share of each label in the sampled training set.
df_train['label'].value_counts().div(len(df_train))

unrelated    0.675574
agreed       0.292811
disagreed    0.031615
Name: label, dtype: float64

In [None]:
# Keep the two titles plus the row id, renamed to the column layout that
# FakeNewsDataset reads back from test.tsv.
df_test = pd.read_csv(path_test)[["title1_zh", "title2_zh", "id"]]
df_test = df_test.rename(
    columns={"title1_zh": "text_a", "title2_zh": "text_b", "id": "Id"}
)
df_test.to_csv("test.tsv", sep="\t", index=False)

print("預測樣本數：", len(df_test))
df_test.head()

預測樣本數： 80126


Unnamed: 0,text_a,text_b,Id
0,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,321187
1,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,321190
2,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,321189
3,萨达姆被捕后告诫美国的一句话，发人深思,被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！,321193
4,萨达姆被捕后告诫美国的一句话，发人深思,中国川贝枇杷膏在美国受到热捧？纯属谣言！,321191


In [None]:
from torch.utils.data import Dataset
!pip install transformers tqdm boto3 requests regex -q
from transformers import BertTokenizer #詳見下一區塊，從huggingface github載入了BertTokenizer的 class information
!pip install pysnooper -q
import pysnooper

# Name of the pretrained checkpoint; the same name is used later when the
# classification model is loaded.
PRETRAINED_MODEL_NAME = "bert-base-chinese"  
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)# load the tokenizer that belongs to the pretrained checkpoint
# The dataset class below uses this tokenizer to turn each title pair into BERT inputs.
class FakeNewsDataset(Dataset):
    """Title-pair dataset read from ``train.tsv`` or ``test.tsv``.

    Indexing returns ``(tokens_tensor, segments_tensor, label_tensor)`` for
    the sequence ``[CLS] text_a [SEP] text_b [SEP]``:

    * ``tokens_tensor``   - vocabulary ids of every token in the sequence.
    * ``segments_tensor`` - 0 for ``[CLS] text_a [SEP]``, 1 for
      ``text_b [SEP]``.
    * ``label_tensor``    - class id from ``label_map`` in ``"train"`` mode,
      ``None`` in ``"test"`` mode (no label is read from the test file).
    """

    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]
        self.mode = mode
        # Missing cells become empty strings so the tokenizer always
        # receives text.
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        # Text label -> integer class id.
        self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        if self.mode != "test":
            # The training file holds exactly text_a, text_b, label.
            text_a, text_b, label = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(self.label_map[label])
        else:
            # Only the first two columns are text; no label is available.
            text_a, text_b = self.df.iloc[idx, :2].values
            label_tensor = None

        # First segment: [CLS] + tokens of text_a + [SEP].
        first_segment = ["[CLS]"] + self.tokenizer.tokenize(text_a) + ["[SEP]"]
        # Second segment: tokens of text_b + [SEP].
        second_segment = self.tokenizer.tokenize(text_b) + ["[SEP]"]

        # Map every token of the full sequence to its vocabulary id.
        token_ids = self.tokenizer.convert_tokens_to_ids(
            first_segment + second_segment
        )
        tokens_tensor = torch.tensor(token_ids)

        # Segment ids tell BERT which sentence each position belongs to.
        segment_ids = [0] * len(first_segment) + [1] * len(second_segment)
        segments_tensor = torch.tensor(segment_ids, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        return self.len
    
# Build the training dataset from train.tsv. FakeNewsDataset defines no
# __repr__, so the print below only shows the default object representation.
trainset = FakeNewsDataset("train", tokenizer=tokenizer)
print(trainset)

[K     |████████████████████████████████| 2.5MB 28.8MB/s 
[K     |████████████████████████████████| 133kB 53.8MB/s 
[K     |████████████████████████████████| 3.3MB 47.5MB/s 
[K     |████████████████████████████████| 901kB 51.4MB/s 
[K     |████████████████████████████████| 7.7MB 33.6MB/s 
[K     |████████████████████████████████| 81kB 11.9MB/s 
[31mERROR: botocore 1.20.112 has requirement urllib3<1.27,>=1.25.4, but you'll have urllib3 1.24.3 which is incompatible.[0m
[?25h

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268943.0, style=ProgressStyle(descripti…


<__main__.FakeNewsDataset object at 0x7f15e5dd1a10>


上一個 block 匯入了 BertTokenizer 並呼叫了 BertTokenizer.from_pretrained 函式（本筆記實際是從 transformers 匯入，其前身為 pytorch_pretrained_bert）。以下是 pytorch_pretrained_bert 原始碼中的 BertTokenizer 類別，供參考：

```
class BertTokenizer(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""

    def __init__(self, vocab_file, do_lower_case=True, max_len=None,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                              never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            raise ValueError(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
            )
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens
 
    @classmethod
    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
        else:
            vocab_file = pretrained_model_name
        if os.path.isdir(vocab_file):
            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except FileNotFoundError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
        return tokenizer
```



In [None]:
import torch
sample_idx = 0
print(trainset)  # default object repr; see class FakeNewsDataset

# Raw text and label of the chosen row, straight from the DataFrame.
raw_row = trainset.df.iloc[sample_idx]
text_a, text_b, label = raw_row.values

# The same row after FakeNewsDataset.__getitem__ converted it to tensors.
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# Map the ids back to tokens to check the [CLS] a [SEP] b [SEP] layout.
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)
print(combined_text)

<__main__.FakeNewsDataset object at 0x7f15e5dd1a10>
[CLS]晚上吃苹果就成毒苹果了吗[SEP]早上吃金苹果，晚上吃毒苹果，苹果真不能晚上吃吗？[SEP]


In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# Passed to DataLoader as collate_fn: turns a list of per-sample tensors into
# one zero-padded batch plus its attention mask.
def create_mini_batch(samples):
    """Collate ``(tokens, segments, label)`` samples into a padded batch.

    Args:
        samples: List of ``(tokens_tensor, segments_tensor, label_tensor)``
            triples; ``label_tensor`` is ``None`` for unlabeled data.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``.
        The first three are 2-D with one row per sample, padded with zeros
        up to the longest sequence in this batch. ``masks_tensors`` is 1
        where the token id is non-zero and 0 on padding. ``label_ids``
        stacks the labels along a new first dimension, or is ``None`` when
        the first sample carries no label.
    """
    # Only the first sample is inspected to decide whether labels exist.
    first_label = samples[0][2]
    if first_label is None:
        label_ids = None
    else:
        label_ids = torch.stack([sample[2] for sample in samples])

    # Zero-pad every sequence to the same length.
    tokens_tensors = pad_sequence(
        [sample[0] for sample in samples], batch_first=True
    )
    segments_tensors = pad_sequence(
        [sample[1] for sample in samples], batch_first=True
    )

    # Attention mask: 1 on real tokens, 0 on zero padding (int64).
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

# Number of samples per training mini-batch.
BATCH_SIZE = 64


trainloader = DataLoader(trainset,batch_size=BATCH_SIZE,collate_fn=create_mini_batch) # training data served in batches of BATCH_SIZE
# num_workers is not passed, so it stays at its default (0 according to the
# original author's note). If the CPU can handle it, raising it lets batches be
# prepared ahead of time and speeds up training.
# The author's rule of thumb: set it to the number of CPU cores.
# collate_fn: create_mini_batch was adapted from the paper and from LeeMing's
# article so that we control how each batch is assembled.


In [None]:
from transformers import BertForSequenceClassification
from IPython.display import clear_output

# # Prepare model
PRETRAINED_MODEL_NAME = "bert-base-chinese" # pretrained checkpoint whose weights and config are loaded
# Three target classes, matching FakeNewsDataset.label_map
# (agreed / disagreed / unrelated).
NUM_LABELS = 3

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

# Clear the cell output produced while loading the model.
clear_output()
#model.config

#我們呼叫的BertForSequenceClassification

1. 参数：

    config：指定的bert模型的預訓練參數

    num_labels：label (類別)的數量 (num)
2. 輸入：

    input_ids：訓練集，torch.LongTensors(int 64)，shape是[batch_size,sequence_length]

    **token_type_ids：optional，当训练集是两句话时才有**(Adopted)

    **attention_mask：optional，当使用mask才有** (Adopted)

    labels：每筆資料的標籤，torch.LongTensor 類型（與 input_ids 型別相同），shape 是 [batch_size]

3. 輸出：

    if labels != None（訓練時）：output 為分類的 cross-entropy loss

    if labels == None（評估／推論時）：output 為 logits（尚未經過 softmax，並非機率），shape 為 [batch_size, num_labels]

```
#我們呼叫的BertForSequenceClassification
###BertForSequenceClassification class, code as below：


class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels=2, ...):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config, ...)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
          ...

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, ...):
        outputs = self.bert(input_ids, token_type_ids, attention_mask, ...)
        ...
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        elif self.output_attentions:
            return all_attentions, logits
        return logits
```



# PreTrainedBertModel

(BertForSequenceClassification 為繼承自PreTrainedBertModel的子類)



```
class PreTrainedBertModel(nn.Module):
    """ An abstract class to handle weights initialization and
        a simple interface for dowloading and loading pretrained models.
    """
    def __init__(self, config, *inputs, **kwargs):
        super(PreTrainedBertModel, self).__init__()
        if not isinstance(config, BertConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
                "To create a model from a Google pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    self.__class__.__name__, self.__class__.__name__
                ))
        self.config = config

    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    @classmethod
    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
        """
        参数:
            预训练模型名称，可选: 
                    . `bert-base-uncased`
                    . `bert-large-uncased`
                    . `bert-base-cased`
                    . `bert-large-cased`
                    . `bert-base-multilingual-uncased`
                    . `bert-base-multilingual-cased`
                    . `bert-base-chinese`
        """
        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
        else:
            archive_file = pretrained_model_name
        # redirect to the cache, if necessary
        try:
            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
        except FileNotFoundError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name,
                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
                    archive_file))
            return None
        if resolved_archive_file == archive_file:
            logger.info("loading archive file {}".format(archive_file))
        else:
            logger.info("loading archive file {} from cache at {}".format(
                archive_file, resolved_archive_file))
        tempdir = None
        if os.path.isdir(resolved_archive_file):
            serialization_dir = resolved_archive_file
        else:
            # Extract archive to temp dir
            tempdir = tempfile.mkdtemp()
            logger.info("extracting archive file {} to temp dir {}".format(
                resolved_archive_file, tempdir))
            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
                archive.extractall(tempdir)
            serialization_dir = tempdir
        # Load config
        config_file = os.path.join(serialization_dir, CONFIG_NAME)
        config = BertConfig.from_json_file(config_file)
        logger.info("Model config {}".format(config))
        # Instantiate model.
        model = cls(config, *inputs, **kwargs)
        if state_dict is None:
            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
            state_dict = torch.load(weights_path)

        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(
                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')
        load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
        if len(missing_keys) > 0:
            logger.info("Weights of {} not initialized from pretrained model: {}".format(
                model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info("Weights from pretrained model not used in {}: {}".format(
                model.__class__.__name__, unexpected_keys))
        if tempdir:
            # Clean up temp dir
            shutil.rmtree(tempdir)
        return model
```



In [None]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` and return the predicted class ids.

    The model is switched to evaluation mode for the duration of the call,
    so dropout does not perturb the predictions, and its previous
    train/eval mode is restored afterwards. Callers in the middle of a
    training loop are therefore unaffected.

    Args:
        model: Module called as ``model(input_ids=..., token_type_ids=...,
            attention_mask=...)``; the logits are read from element 0 of
            its output.
        dataloader: Iterable of batches
            ``(tokens, segments, masks, labels)``; ``labels`` may be
            ``None`` when ``compute_acc`` is false.
        compute_acc: When true, also compare predictions with the labels.

    Returns:
        A 1-D tensor with the predicted class id of every sample (``None``
        if the dataloader yields no batch). When ``compute_acc`` is true,
        a ``(predictions, accuracy)`` tuple; accuracy is 0.0 when no sample
        was seen.
    """
    batch_predictions = []
    correct = 0
    total = 0

    # Follow the model's own device instead of assuming "cuda:0".
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # inference only: no gradients are recorded
            for data in dataloader:
                # A None label entry (unlabeled data) is dropped here.
                data = [t.to(device) for t in data if t is not None]

                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                logits = outputs[0]
                pred = logits.argmax(dim=1)

                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                batch_predictions.append(pred)
    finally:
        # Give the model back in the mode the caller left it in.
        model.train(was_training)

    predictions = torch.cat(batch_predictions) if batch_predictions else None

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions
    
# Use the first GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device:", device)

model = model.to(device)

# Accuracy on the training data before any fine-tuning, as a baseline.
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cuda:0
classification acc: 0.5724501317275122


In [None]:
# Fine-tune every parameter of the model with Adam. A larger learning rate can
# be used early on and a smaller one for selected later epochs.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 7
for epoch in range(EPOCHS):
    # Re-enter training mode every epoch, because the accuracy check at the
    # end of the previous epoch switched the model to evaluation mode.
    model.train()

    # Sum (not mean) of the batch losses over the epoch.
    running_loss = 0.0
    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [
            t.to(device) for t in data
        ]

        optimizer.zero_grad()
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)

        # With labels supplied, element 0 of the output is the loss.
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Measure accuracy with dropout disabled. This also leaves the model in
    # evaluation mode once the last epoch finishes, ready for inference.
    model.eval()
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

[epoch 1] loss: 4.047, acc: 0.982
[epoch 2] loss: 3.284, acc: 0.982
[epoch 3] loss: 1.995, acc: 0.989
[epoch 4] loss: 1.595, acc: 0.992
[epoch 5] loss: 1.180, acc: 0.988
[epoch 6] loss: 1.059, acc: 0.988
[epoch 7] loss: 1.362, acc: 0.991


In [None]:
# Inference must run in evaluation mode; otherwise dropout stays active after
# training and makes the submitted predictions noisy.
model.eval()

testset = FakeNewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256,
                        collate_fn=create_mini_batch)

predictions = get_predictions(model, testloader)

# Reverse of label_map: integer class id -> text label.
index_map = {v: k for k, v in testset.label_map.items()}

df = pd.DataFrame(
    {"Category": [index_map[class_id] for class_id in predictions.tolist()]}
)
# testloader was built without shuffling, so prediction i belongs to row i of
# testset.df and the two frames can be joined side by side.
df_pred = pd.concat([testset.df.loc[:, ["Id"]], df.loc[:, 'Category']], axis=1)
df_pred.to_csv('bert_1_prec_training_samples.csv', index=False)
df_pred.head()

Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,agreed
3,321193,unrelated
4,321191,unrelated


In [None]:
# Predict in evaluation mode so dropout does not perturb the results.
model.eval()
predictions = get_predictions(model, trainloader)

# index_map (class id -> text label) comes from the test-prediction cell.
df = pd.DataFrame(
    {"predicted": [index_map[class_id] for class_id in predictions.tolist()]}
)
# trainloader was built without shuffling, so prediction i belongs to row i of
# trainset.df.
df1 = pd.concat([trainset.df, df.loc[:, 'predicted']], axis=1)

# Correctly predicted 'disagreed' samples whose first title is shorter than
# 10 characters.
disagreed_tp = ((df1.label == 'disagreed')
                & (df1.label == df1.predicted)
                & (df1.text_a.apply(len) < 10))
df1[disagreed_tp].head()

Unnamed: 0,text_a,text_b,label,predicted
25,关于植物除甲醛,"医学博士拆穿，市面那些关于装修除甲醛谣言,毫无科学依据",disagreed,disagreed
1605,李天一已被安排出国,李天一即将提前出狱？官方辟谣：仍在服刑！,disagreed,disagreed
2491,李天一已被安排出国,北京市监狱管理局：李天一提前出狱消息不实系谣言，仍在监狱服刑,disagreed,disagreed
2571,沈阳两名女子偷孩子,两名女子偷孩子 沈阳网警辟谣：假的！,disagreed,disagreed


In [None]:
import numpy as np


# Classify one hand-written title pair with the fine-tuned model.
text_a = "李天一已被安排出国"
text_b = "李天一即将提前出狱？官方：是的！"

# Build "[CLS] text_a [SEP] text_b [SEP]", the same layout that
# FakeNewsDataset.__getitem__ produces.
word_pieces = ["[CLS]"]
tokens_a = tokenizer.tokenize(text_a)
word_pieces += tokens_a + ["[SEP]"]
len_a = len(word_pieces)  # length of "[CLS] text_a [SEP]"

tokens_b = tokenizer.tokenize(text_b)
word_pieces += tokens_b + ["[SEP]"]
len_b = len(word_pieces) - len_a  # length of "text_b [SEP]"

# Map every token to its vocabulary id (a flat Python list).
ids = tokenizer.convert_tokens_to_ids(word_pieces)

# unsqueeze(0) adds the batch dimension: (seq_len,) -> (1, seq_len).
tokens_tensor = torch.tensor(ids).unsqueeze(0)
print('tokens_tensor.shape:',tokens_tensor.shape)
print(type(tokens_tensor))

# Segment ids: 0 for the first sentence, 1 for the second, also (1, seq_len).
segments_tensor = torch.tensor([0] * len_a + [1] * len_b,dtype=torch.long).unsqueeze(0)
print('shape of segments_tensor:',segments_tensor.shape)

# Attention mask: start from zeros, then put 1 wherever the token id is not
# the padding id 0.
masks_tensors = torch.zeros(tokens_tensor.shape,dtype=torch.long)
print('Before mask: \n',masks_tensors)

# Keep the mask 2-D (batch, seq_len), exactly like create_mini_batch does for
# the batch path; no extra dimension is added.
masks_tensors = masks_tensors.masked_fill(tokens_tensor != 0, 1)
print('After mask: \n',masks_tensors)
print('After mask shape: \n',masks_tensors.shape)

# Inference: evaluation mode turns dropout off and no_grad skips gradient
# tracking. All inputs are moved to the device the model lives on.
model.eval()
with torch.no_grad():
    outputs = model(input_ids=tokens_tensor.to(device),
                    token_type_ids=segments_tensor.to(device),
                    attention_mask=masks_tensors.to(device))

# Element 0 of the output holds the logits (raw scores, not probabilities),
# one row per sample and one column per class.
logits = outputs[0]

# Index of the largest logit in each row is the predicted class id.
pred = logits.argmax(dim=1)

label_map = {0:'agreed', 1: 'disagreed', 2: 'unrelated'}
print(f'\n outputs:{outputs}')
print(label_map[pred.cpu().tolist()[0]])




tokens_tensor.shape: torch.Size([1, 28])
<class 'torch.Tensor'>
shape of segments_tensor: torch.Size([1, 28])
Before mask: 
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])
After mask: 
 tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1]]])
After mask shape: 
 torch.Size([1, 1, 28])

 outputs:SequenceClassifierOutput(loss=None, logits=tensor([[ 2.1363, -1.2171, -0.5522]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
agreed
