## 代码的测试

### tensorboard demo       

In [1]:
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# 创建一个 SummaryWriter 实例
writer = SummaryWriter('runs/experiment_1')

# 模拟训练过程
for epoch in range(100):
    # 假设我们在每个 epoch 计算了一个 loss
    loss = np.random.random()
    
    # 将 loss 写入 TensorBoard
    writer.add_scalar('Loss/train', loss, epoch)

# 关闭 writer
writer.close()

### masking_function

In [7]:
import numpy as np
from typing import List, Tuple, Union
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

def masking_function(
        text: str,
        tokenizer: TokenizerType,
        mask_prob: float,
        random_replace_prob: float,
        unmask_replace_prob: float,
        max_length: int,
) -> Tuple[List[int], List[int]]:
    tokenized_ids = ([tokenizer.bos_token_id] +
                     tokenizer.encode(text,
                                      add_special_tokens=False,
                                      truncation=True,
                                      max_length=max_length - 2) +
                     [tokenizer.eos_token_id])
    seq_len = len(tokenized_ids)
    tokenized_ids = np.array(tokenized_ids)
    subword_mask = np.full(len(tokenized_ids), False)

    # Masking the BOS and EOS token leads to slightly worse performance
    low = 1
    high = len(subword_mask) - 1
    mask_choices = np.arange(low, high)
    num_subwords_to_mask = max(
        int((mask_prob * (high - low)) + np.random.rand()), 1)
    subword_mask[np.random.choice(mask_choices,
                                  num_subwords_to_mask,
                                  replace=False)] = True

    # Create the labels first
    labels = np.full(seq_len, tokenizer.pad_token_id)
    labels[subword_mask] = tokenized_ids[subword_mask]

    tokenized_ids[subword_mask] = tokenizer.mask_token_id

    # Now of the masked tokens, choose how many to replace with random and how many to unmask
    rand_or_unmask_prob = random_replace_prob + unmask_replace_prob
    if rand_or_unmask_prob > 0:
        rand_or_unmask = subword_mask & (np.random.rand(len(tokenized_ids)) <
                                         rand_or_unmask_prob)
        if random_replace_prob == 0:
            unmask = rand_or_unmask
            rand_mask = None
        elif unmask_replace_prob == 0:
            unmask = None
            rand_mask = rand_or_unmask
        else:
            unmask_prob = unmask_replace_prob / rand_or_unmask_prob
            decision = np.random.rand(len(tokenized_ids)) < unmask_prob
            unmask = rand_or_unmask & decision
            rand_mask = rand_or_unmask & (~decision)
        if unmask is not None:
            tokenized_ids[unmask] = labels[unmask]
        if rand_mask is not None:
            weights = np.ones(tokenizer.vocab_size)
            weights[tokenizer.all_special_ids] = 0
            probs = weights / weights.sum()
            num_rand = rand_mask.sum()
            tokenized_ids[rand_mask] = np.random.choice(tokenizer.vocab_size,
                                                        num_rand,
                                                        p=probs)
    return tokenized_ids.tolist(), labels.tolist()

In [8]:
from transformers import AutoTokenizer
import numpy as np
from typing import List, Tuple

# 首先导入masking_function
# 假设masking_function已经定义在当前文件中

def demo_masking_function():
    # 初始化tokenizer
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    
    # 设置参数
    text = "The quick brown fox jumps over the lazy dog."
    mask_prob = 0.15
    random_replace_prob = 0.1
    unmask_replace_prob = 0.1
    max_length = 20
    
    # 调用masking_function
    masked_tokens, labels = masking_function(
        text=text,
        tokenizer=tokenizer,
        mask_prob=mask_prob,
        random_replace_prob=random_replace_prob,
        unmask_replace_prob=unmask_replace_prob,
        max_length=max_length
    )
    
    # 打印结果
    print("原始文本:")
    print(text)
    print("\n掩码后的token IDs:")
    print(masked_tokens)
    print("\n对应的label IDs:")
    print(labels)
    
    # 解码masked_tokens,以便查看实际的掩码效果
    decoded_masked = tokenizer.decode(masked_tokens)
    print("\n解码后的掩码文本:")
    print(decoded_masked)
    
    # 显示哪些位置被掩码了
    mask_positions = [i for i, (m, l) in enumerate(zip(masked_tokens, labels)) if m != l]
    print("\n被掩码的位置:")
    print(mask_positions)

if __name__ == "__main__":
    np.random.seed(42)  # 为了结果可复现
    demo_masking_function()

Downloading: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.73MB/s]
Downloading: 100%|██████████| 25.0/25.0 [00:00<00:00, 205kB/s]

原始文本:
The quick brown fox jumps over the lazy dog.

掩码后的token IDs:
[0, 133, 2119, 6219, 50264, 13855, 81, 5, 22414, 2335, 4, 2]

对应的label IDs:
[1, 1, 1, 1, 23602, 1, 1, 1, 1, 1, 1, 1]

解码后的掩码文本:
<s>The quick brown<mask> jumps over the lazy dog.</s>

被掩码的位置:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]





## 训练报错及解决方法

### error: write() argument must be str, not RunningCommand

```python
try:
    gitlog = sh.git.log("-1", format="%H", _tty_out=False, _fg=False)
    with (exp_dir / "githash.log").open("w") as handle:
        handle.write(str(gitlog))
except sh.ErrorReturnCode_128:
    logger.info("Seems like the code is not running from"
                " within a git repo, so hash will"
                " not be stored. However, it"
                " is strongly advised to use"
                " version control.")

### 下载Dataset出现错误

应该是网络问题，出现"ConnectionError: Couldn't reach https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip"报错

解决方法:  
目前只能从官网下载数据集，然后加载。[wikitext-2-v1](https://huggingface.co/datasets/Salesforce/wikitext/tree/main/wikitext-2-v1)  
当然官网也有hugginface也有说明，数据是存储在git lfs上的，现将模型跑通，后面补充学习git lfs的知识

In [2]:
# 测试下读取
from datasets import load_dataset

wikitext_dataset = load_dataset("parquet", 
                                data_files={"train": "data/train-00000-of-00001.parquet"},
                                split="train")

print(wikitext_dataset)

Using custom data configuration default-f0e3d46d0fd4a82d
Reusing dataset parquet (/Users/jiatianyu/.cache/huggingface/datasets/parquet/default-f0e3d46d0fd4a82d/0.0.0/9296ce43568b20d72ff8ff8ecbc821a16b68e9b8b7058805ef11f06e035f911a)


Dataset({
    features: ['text'],
    num_rows: 36718
})


### package包的问题

错误信息:packaging.version.InvalidVersion: Invalid version: '0.10.1,<0.11'  

解决方法:
```shell
pip install packaging==21.3
```

## 训练效果对比

设备上3090，单卡，常规使用GPU训练每3s大约10个epoch，通过deepspeed加速后可以达到每s大约10个epoch，速度提升明显，当然batchsize设置的不太合理，因为文本比较小  

<img src="./images/训练效果.png" alt="Broadcast" style="zoom:60%;"> 