# LLM-Bert
    - BertModel and tokenizer
    - Bert 是一个encoder-only的pretrained模型

In [64]:
import torch
from transformers import BertTokenizer, BertModel

In [2]:
import tqdm as notebook_tqdm

In [3]:
check_point = '/Users/hanlinwang/Downloads/bert-base-chinese/' 
tokenizer = BertTokenizer.from_pretrained(check_point)
model = BertModel.from_pretrained(check_point)

## Bert模型的参数
    - 三种embeddding层构成，包括了word_embedding(词嵌入)\token_type_embedding(多句情况下的分句标志)\position_embedding 

In [4]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

## bert-base-uncased结构、参数设置
- BertModel(
  - (embeddings): BertEmbeddings(
    - (word_embeddings): Embedding(30522, 768, padding_idx=0)
    - (position_embeddings): Embedding(512, 768)
    - (token_type_embeddings): Embedding(2, 768)
    - (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    - (dropout): Dropout(p=0.1, inplace=False)
  - )
  - (encoder): BertEncoder(
    - (layer): ModuleList(
      - (0-11): 12 x BertLayer(
        - (attention): BertAttention(
          - (self): BertSelfAttention(
            - (query): Linear(in_features=768, out_features=768, bias=True)
            - (key): Linear(in_features=768, out_features=768, bias=True)
            - (value): Linear(in_features=768, out_features=768, bias=True)
            - (dropout): Dropout(p=0.1, inplace=False)
          - )
          - (output): BertSelfOutput(
            - (dense): Linear(in_features=768, out_features=768, bias=True)
            - (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            - (dropout): Dropout(p=0.1, inplace=False)
          - )
        - )
        - (intermediate): BertIntermediate(
          - (dense): Linear(in_features=768, out_features=3072, bias=True)
          - (intermediate_act_fn): GELUActivation()
        - )
        - (output): BertOutput(
          - (dense): Linear(in_features=3072, out_features=768, bias=True)
          - (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          - (dropout): Dropout(p=0.1, inplace=False)
        - )
      - )
    - )
  - )
  - (pooler): BertPooler(
    - (dense): Linear(in_features=768, out_features=768, bias=True)
    - (activation): Tanh()
  - )
- )

## 样例测试
    - 经过tokenizer之后的变量的类型以及它的属性
    - 属性有：
        - input_ids: list 字编码
        - attention_mask: list 掩码
        - token_type_ids: list 分词编码

In [43]:
text = 'what is the fox,that is fork.'
encoder_input = tokenizer(text, return_tensors='pt') # 这里要注意

In [46]:
encoder_input

{'input_ids': tensor([[ 101, 2054, 2003, 1996, 4419, 1010, 2008, 2003, 9292, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [44]:
encoder_outputs = model(**encoder_input,output_hidden_states=True)

In [45]:
print(encoder_outputs.pooler_output.size(),encoder_outputs.last_hidden_state.size())

torch.Size([1, 768]) torch.Size([1, 11, 768])


### token embedding

In [74]:
input_ids = encoder_input['input_ids']
input_ids

tensor([[ 101, 2054, 2003, 1996, 4419,  102]])

In [75]:
token_embed = model.embeddings.word_embeddings(input_ids)
token_embed

tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [ 0.0387,  0.0035, -0.0619,  ...,  0.0192, -0.0217, -0.0888],
         [-0.0360, -0.0246, -0.0257,  ...,  0.0034, -0.0018,  0.0269],
         [-0.0446,  0.0061, -0.0022,  ..., -0.0363, -0.0004, -0.0306],
         [-0.0700, -0.0145, -0.0065,  ..., -0.0648, -0.0418, -0.0185],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [76]:
token_embed.shape

torch.Size([1, 6, 768])

### segment embedding \ token_type_embedding

In [77]:
token_type_embed = encoded_input['token_type_ids']
token_type_embed

tensor([[0, 0, 0, 0, 0, 0, 0, 0]])

In [78]:
segment_embed = model.embeddings.token_type_embeddings(token_type_embed)
segment_embed

tensor([[[ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         ...,
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
         [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086]]],
       grad_fn=<EmbeddingBackward0>)

In [79]:
segment_embed.shape

torch.Size([1, 8, 768])

### position embedding

In [80]:
pos_ids = torch.arange(input_ids.shape[1])
pos_ids

tensor([0, 1, 2, 3, 4, 5])

In [81]:
pos_embed = model.embeddings.position_embeddings(pos_ids)
pos_embed

tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
          6.8312e-04,  1.5441e-02],
        [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
          2.9753e-02, -5.3247e-03],
        [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
          1.8741e-02, -7.3140e-03],
        [-4.1949e-03, -1.1852e-02, -2.1180e-02,  ...,  2.2455e-02,
          5.2826e-03, -1.9723e-03],
        [-5.6087e-03, -1.0445e-02, -7.2288e-03,  ...,  2.0837e-02,
          3.5402e-03,  4.7708e-03],
        [-3.0871e-03, -1.8956e-02, -1.8930e-02,  ...,  7.4045e-03,
          2.0183e-02,  3.4077e-03]], grad_fn=<EmbeddingBackward0>)

In [82]:
pos_embed.shape

torch.Size([6, 768])

### input embed

In [83]:
input_embed = token_embed + segment_embed + pos_embed

RuntimeError: The size of tensor a (6) must match the size of tensor b (8) at non-singleton dimension 1

# textclassfication

In [67]:
import pandas as pd
train_data=pd.read_csv('/Users/hanlinwang/Documents/GitHub/hugg-llm/data/cnews/cnews_train.txt',sep='\t',names=['label','content'])
# test_data=pd.read_csv('cnews/cnews.test.txt',sep='\t',names=['content'])

In [82]:
train_data['len'] = [len(train_data.loc[i,'content']) for i in range(len(train_data))]

In [95]:
train_data[train_data['len']>512]['len'].count()

31252

In [88]:
train_data['len'].value_counts().sort_values(ascending=False)

len
93      71
78      68
77      68
106     61
74      60
        ..
4385     1
4885     1
3736     1
5587     1
3212     1
Name: count, Length: 4025, dtype: int64

In [61]:
import torch
from torch.utils.data import DataLoader

from transformers import AutoTokenizer


class MLMDataLoader:
    def __init__(
        self,
        dataset,
        batch_size=16,
        max_length=512,
        shuffle=True,
        drop_last=True,
        device=None,
        tokenizer_name='bert-base-chinese'
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.dataset = dataset
        self.batch_size = batch_size
        self.max_length = max_length
        self.shuffle = shuffle
        self.drop_last = drop_last

        if device is None:
            self.device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu'
            )
        else:
            self.device = device

        self.loader = DataLoader(
            dataset=self.dataset,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            shuffle=self.shuffle,
            drop_last=self.drop_last
        )

    def collate_fn(self, data):
        sents = [i[0] for i in data]
        labels = [i[1] for i in data]

        data = self.tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=sents,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            return_length=True
        )
        input_ids = data['input_ids'].to(self.device)
        attention_mask = data['attention_mask'].to(self.device)
        token_type_ids = data['token_type_ids'].to(self.device)
        labels = torch.LongTensor(labels).to(self.device)

        return input_ids, attention_mask, token_type_ids, labels

    def __iter__(self):
        for data in self.loader:
            yield data

    def __len__(self):
        return len(self.loader)

In [59]:
import torch  
from torch.utils.data import Dataset, DataLoader  
  
class NewsDataset(Dataset):  
    def __init__(
        self, 
        categories, 
        contents, 
        tokenizer,
        check_point_name,
    ):  
        self.tokenizer = tokenizer.pretrained_from(check_point_name) 
        self.categories = categories
        self.contents = contents   
  
    def __len__(self):  
        return len(self.categories)  
  
    def __getitem__(self, idx):  
        category = self.categories[idx]  
        content = self.contents[idx]  
  
        encoding = tokenizer(self.contents, return_tensors='pt') 
        input_ids = torch.tensor(encoding['input_ids'].flatten())  
        attention_mask = encoding['attention_mask'].flatten()  
        category = torch.tensor(category, dtype=torch.long)  
  
        return input_ids,  attention_mask, category  

In [23]:
import torch.nn as nn
class Bertmulticls(nn.Module):
    def __init__(self, BertModel, check_point_name, hidden_size1, hidden_size2,dropout_rate):
        super(Bertmulticls, self).__init__()  
        self.bert = BertModel
        self.linear1 = nn.Linear(768,hidden_size1)
        self.linear2 = nn.Linear(hidden_size1,hidden_size2)
        self.linear3 = nn.Linear(hidden_size2,1)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()
    def forward(self,input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)  
        sequence_output = outputs.last_hidden_state  
        cls_output = sequence_output[:, 0, :]
        x = encoder_outputs.pooler_output
        x = self.relu(self.linear1(x))
        x = self.dropout1(x)
        x = self.relu(self.linear2(x))
        x = self.dropout2(x)
        x = self.linear3(x)
        return x

In [57]:
def train(model, criterion, optimizer, train_iter, epoches):
    model.train()
    for epoch in range(epoches):
        epoch_loss = 0.0
        for input_ids, attention_mask,label in train_iter:
            optimizer.zero_grad()
            cls = model(input_ids = input_ids,attention_mask = attention_mask)
            loss = criterion(cls,label)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        average_loss = epoch_loss / len(train_iter)
        print(f"Epoch {epoch+1}/{epoches},Loss:{average_loss}")

In [16]:
check_point_name = '/Users/hanlinwang/Downloads/bert-base-chinese/' 
categories = train_data['label']
contents = train_data['content']
batch_size = 16
hidden_size1 = 256
hidden_size2 = 64
dropout_rate = 0.2

In [40]:
import torch.nn as nn  
from torch.nn import CrossEntropyLoss
from transformers import BertTokenizer, BertModel
from transformers import AdamW
tokenizer = BertTokenizer.from_pretrained(check_point_name)
BertModel = BertModel.from_pretrained(check_point_name,num_labels = 10)
dataset_train = NewsDataset(categories, contents, tokenizer)  
train_iter = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)  
model = Bertmulticls(BertModel=BertModel,
    check_point_name = check_point_name, 
    hidden_size1 = hidden_size1, 
    hidden_size2 = hidden_size2,
    dropout_rate = dropout_rate
)
criterion = CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

In [60]:
train(BertModel, criterion, optimizer, train_iter, 5)

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [7]:
def read_category(y_train):
    """读取分类目录，固定"""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    categories = [x for x in categories]
    cat_to_id = dict(zip(categories, range(len(categories))))
    label_id = []
    for i in range(len(y_train)):
        label_id.append(cat_to_id[y_train[i]])
    return label_id

train_target=train_data['label']  
train_data['label']=read_category(train_target)

In [62]:
train_data[0]

KeyError: 0

In [49]:
len(train_data['content'][0])

746

In [51]:
test = tokenizer(train_data['content'][0])

In [54]:
len(test.input_ids)

739