In [2]:
import os
import pandas as pd
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

PRETRAINED_MODEL_NAME = "bert-base-chinese"  # 指定繁簡中文 BERT-BASE 預訓練模型

# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch 版本：", torch.__version__)

PyTorch 版本： 1.12.1


In [3]:

#torch.cuda.is_available()


# torch.cuda.device_count()


#torch.cuda.current_device()


#torch.cuda.device(0)

torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1650 SUPER'

In [4]:

# import glob
# import json

# files = os.path.join("D:/nlp/restaurants/restaurants/", "*.csv")

# # list of merged files returned
# files = glob.glob(files)

# print("Resultant CSV after joining all CSV files at a particular location...");

# # joining files with concat and read_csv
# df = pd.concat(map(pd.read_csv, files), ignore_index=True)
# print(df)

# df.to_csv("merge.csv")


In [4]:
df = pd.read_csv("0917.csv")
df = df[['comment', 'category']]

df = df.dropna()
df.head()

Unnamed: 0,comment,category
0,非常難找車位,5.0
1,好吃好喝好讚,1.0
2,經濟實惠,4.0
3,店由一對老夫婦經營很親切慈祥,3.0
4,價格實惠,4.0


In [5]:
df2 = pd.read_csv("0916_1.csv")
df2 = df2[['comment', 'category']]

df2 = df2.dropna()
df2.head()

Unnamed: 0,comment,category
1,[\蜂蜜香蕉冰沙這杯清爽的水果冰沙，滿適合不喝咖啡因的朋友，可以喝到濃郁的香蕉味，,1.0
2,"\""飲品價格110元起餐點則是大約在250元上下算是中等價位",4.0
3,"\""不過蜂蜜應該只是輔佐而已，味道不太明顯。\""",1.0
4,"\""焦糖摩卡冰沙點之前原本考慮了一下，擔心喝起來超甜膩，但想不到意外的好喝，裡面還可以吃到不...",1.0
5,"\""不過看到別人點的潛艇堡感覺真可口，害我口水直流啊。原味舒芙蕾原本送來的舒芙蕾很澎，是我們...",1.0


In [6]:
df3 = pd.concat([df, df2], axis=0, ignore_index=True)
df3 = df3.dropna()

df3.tail(10)

df3 = df3[~(df3.comment.apply(lambda x : len(x)) > 40)]

df3.to_csv("train.csv", index = False)

# idempotence, 將處理結果另存成 tsv 供 PyTorch 使用
df3.to_csv("train.tsv", sep="\t", index=False)
df3.category.value_counts() / len(df3)

2.0    0.325855
3.0    0.317308
1.0    0.172009
4.0    0.129274
5.0    0.055556
Name: category, dtype: float64

In [9]:
# df_test = df.sample(frac=0.1, random_state=555)
# df_test
# df_train = df[~(df_test)]


In [7]:
df_test = pd.read_csv("test.csv")
df_test = df_test.loc[:, ["comment", "category"]]
df_test = df_test.dropna()
df_test = df_test[~(df_test.comment.apply(lambda x : len(x)) > 40)]
df_test.to_csv("test.tsv", sep="\t", index=False)

print("預測樣本數：", len(df_test))
df_test.head()

預測樣本數： 140


Unnamed: 0,comment,category
0,杯子喝到有抹布的味道,1
1,醬料區桌上也都擦拭的很乾淨,2
2,環境整體來說都很乾淨舒適,2
3,服務及態度都非常好,3
4,但外部店員服務的態度一樣親切,3


In [8]:
"""
實作一個可以用來讀取訓練 / 測試集的 Dataset，這是你需要徹底了解的部分。
此 Dataset 每次將 tsv 裡的一筆成對句子轉換成 BERT 相容的格式，並回傳 3 個 tensors：
- tokens_tensor：兩個句子合併後的索引序列，包含 [CLS] 與 [SEP]
- segments_tensor：可以用來識別兩個句子界限的 binary tensor
- label_tensor：將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None
"""
from torch.utils.data import Dataset
 
    
class FakeNewsDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = { 1 : 1, 2 : 2, 3 : 3, 4 : 4, 5: 0}
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "train1":
            comment = self.df.iloc[idx, :1].values
            label_tensor = None
        else:
            comment, category = self.df.iloc[idx, :].values
            # 將 label 文字也轉換成索引方便轉換成 tensor
            label_id = self.label_map[category]
            label_tensor = torch.tensor(label_id)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(comment)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)

        
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        segments_tensor = torch.tensor([0] * len_a , dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞
trainset = FakeNewsDataset("train", tokenizer=tokenizer)

In [9]:
sample_idx = 300

# 將原始文本拿出做比較
comment, category = trainset.df.iloc[sample_idx].values

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

print(f"""[原始文本]
句子 1：{comment}

分類  ：{category}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

[原始文本]
句子 1：有小小的區域劃分（有包廂感）

分類  ：2.0

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([ 101, 3300, 2207, 2207, 4638, 1281, 1818, 1205, 1146, 8020, 3300, 1259,
        2439, 2697, 8021,  102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

label_tensor   ：2

--------------------

[還原 tokens_tensors]
[CLS]有小小的區域劃分（有包廂感）[SEP]



In [10]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 吃我們上面定義的 `FakeNewsDataset`，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# 初始化一個每次回傳 64 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 32
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [11]:
# 載入一個可以做中文多分類任務的模型，n_class = 5
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 5

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

# high-level 顯示此模型裡的 modules
# print("""
# name            module
# ----------------------""")
# for name, module in model.named_children():
#     if name == "bert":
#         for n, _ in module.named_children():
#             print(f"{name}:{n}")
#     else:
#         print("{:15} {}".format(name, module))

In [22]:
model.config

BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache

In [12]:
"""
定義一個可以針對特定 DataLoader 取得模型預測結果以及分類準確度的函式
之後也可以用來生成上傳到 Kaggle 競賽的預測結果


"""

def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cuda:0
classification acc: 0.20085470085470086


In [13]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")


整個分類模型的參數量：102271493
線性分類器的參數量：3845



In [20]:
%%time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 2  # 幸運數字
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))
    

[epoch 1] loss: 1.368, acc: 0.990
[epoch 2] loss: 0.754, acc: 0.998
CPU times: total: 37.6 s
Wall time: 38 s


In [15]:
sample_idx = 1
testset = FakeNewsDataset("test", tokenizer=tokenizer)

# 將原始文本拿出做比較
comment, category = testset.df.iloc[sample_idx].values

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = testset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

print(f"""[原始文本]
句子 1：{comment}

分類  ：{category}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

[原始文本]
句子 1：醬料區桌上也都擦拭的很乾淨

分類  ：2

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([ 101, 7016, 3160, 1281, 3430,  677,  738, 6963, 3092, 2887, 4638, 2523,
         746, 3912,  102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

label_tensor   ：2

--------------------

[還原 tokens_tensors]
[CLS]醬料區桌上也都擦拭的很乾淨[SEP]



In [21]:
%%time
# 建立測試集。這邊我們可以用跟訓練時不同的 batch_size，看你 GPU 多大
testset = FakeNewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=32, 
                        collate_fn=create_mini_batch)

# 用分類模型預測測試集
t, predictions = get_predictions(model, testloader, compute_acc=True)
print(predictions)

# 用來將預測的 label id 轉回 label 文字
#index_map = {v: k for k, v in testset.label_map.items()}

# 生成 Kaggle 繳交檔案
# df = pd.DataFrame({"Category": predictions.tolist()})
# df['Category'] = df.Category.apply(lambda x: index_map[x])
# df_pred = pd.concat([testset.df.loc[:, ["Id"]], 
#                           df.loc[:, 'Category']], axis=1)
#df_pred.to_csv('bert_1_prec_training_samples.csv', index=False)
#df_pred.head()

0.9571428571428572
CPU times: total: 969 ms
Wall time: 972 ms


In [22]:

model2 = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=1e-5)

checkpoint = torch.load('check.pt')
model2.load_state_dict(checkpoint['model_state_dict'])
optimizer2.load_state_dict(checkpoint['optimizer_state_dict'])
epoch2 = checkpoint['epoch']
loss2 = checkpoint['loss']

torch.save(model, '0917.pt')

# model2.load_state_dict(torch.load('state.pt'))



In [49]:
torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, 'check.pt')

In [18]:
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)

