In [1]:
from padiff import *
from padiff import create_model
from paddle.io import Dataset, BatchSampler, DataLoader
from onnx2pytorch import ConvertModel
import onnx
import torch
import paddle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def traverse_layers_paddle(layer):
    """Print one line per sublayer of ``layer``: its full name followed by the sublayer itself.

    Args:
        layer: Any object exposing ``sublayers()``; each returned item must
            provide ``full_name()``.

    Returns:
        None. The listing is written to stdout.
    """
    for child in layer.sublayers():
        child_name = child.full_name()
        print(child_name, child)

In [3]:
class SimpleModule(torch.nn.Module):
  """Toy torch model consisting of a single ``Linear(100, 10)`` layer."""

  def __init__(self):
    super().__init__()
    self.linear1 = torch.nn.Linear(100, 10)

  def forward(self, x):
    """Apply ``linear1`` to ``x`` and return the result."""
    return self.linear1(x)

class SimpleLayer(paddle.nn.Layer):
  """Toy Paddle layer consisting of a single ``Linear(100, 10)`` sublayer."""

  def __init__(self):
    super().__init__()
    self.linear1 = paddle.nn.Linear(100, 10)

  def forward(self, x):
    """Apply ``linear1`` to ``x`` and return the result."""
    # A second Linear(10, 10) stage was sketched here earlier and left disabled.
    return self.linear1(x)

# One instance of the toy model per framework.
module, layer = SimpleModule(), SimpleLayer()

# Draw one random float32 array with Paddle, then expose the same values to both
# frameworks: inp[0] holds the torch kwargs, inp[1] the paddle kwargs.
inp = paddle.rand((1, 100, 100)).numpy().astype("float32")
inp = (dict(x=torch.as_tensor(inp)), dict(x=paddle.to_tensor(inp)))

W0627 14:15:47.042629 27237 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 8.0, Driver API Version: 11.8, Runtime API Version: 11.7
W0627 14:15:47.043684 27237 gpu_resources.cc:149] device: 0, cuDNN Version: 8.6.


In [4]:
class RandomDataset(Dataset):
    """Dataset of ``num_samples`` items, each a freshly drawn ``paddle.rand((100, 100))`` tensor."""

    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __len__(self):
        """Number of items the dataset reports."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return a new random (100, 100) tensor; ``idx`` is not consulted."""
        return paddle.rand((100, 100))
    
# Ten random samples (RandomDataset reports len() == 10).
dataset = RandomDataset(10)

# Serve the samples one per batch, shuffled, using two worker processes.
# NOTE(review): RandomDataset.__getitem__ calls paddle.rand inside the workers; Paddle's
# DataLoader docs advise against GPU tensor operations in subprocesses — confirm this
# runs on CPU there, or return numpy arrays from the dataset instead.
loader = DataLoader(dataset,
                    batch_size=1,
                    shuffle=True,
                    drop_last=True,
                    num_workers=2)

# The loss is simply the mean of the model output.
loss_fn = paddle.mean

In [21]:
# Wrap a fresh SimpleLayer with create_model (imported from padiff).
model = create_model(SimpleLayer())

# One forward/backward pass per batch produced by the loader.
for data in loader():
    output = model(data)
    loss = loss_fn(output)
    # Backward is driven through the wrapper rather than by calling loss.backward().
    model.backward(loss)

    # presumably dumps the recorded data every `per_step` steps — confirm against padiff docs
    model.try_dump(per_step=10)  

In [20]:
# Wrap a fresh SimpleLayer and run 100 forward passes on the fixed paddle input
# stored at inp[1]['x'] (no backward pass in this cell).
model = create_model(SimpleLayer())
for i in range(100):
    output = model(inp[1]['x']) 
    # presumably dumps the recorded data every `per_step` steps — confirm against padiff docs
    model.try_dump(per_step=2) 

In [1]:
# Command-line style arguments: "-c <config file>" followed by one "-o <override>" pair
# per configuration override.
# NOTE(review): the config path is an absolute path on the author's machine.
my_args = [
    "-c",
    "/root/paddlejob/workspace/work/taozewei/padiff/PaddleFleetX/ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml",
    *(
        part
        for override in (
            "Model.fused_softmax_with_triangular=false",
            "Model.fuse_attn_qkv=false",
        )
        for part in ("-o", override)
    ),
]

In [2]:
from padiff import *

import os
import sys
import copy

import paddle
from paddle.distributed import fleet
import paddle.distributed as dist

# __dir__ = os.path.dirname(os.path.abspath(__file__))
# sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))

from ppfleetx.utils import config
from ppfleetx.utils.log import logger
from ppfleetx.data import build_dataloader
from ppfleetx.models import build_module
from ppfleetx.core import EagerEngine
from ppfleetx.distributed.apis import env

def set_default_flags(flags):
    """Apply each Paddle flag in ``flags`` unless an environment variable of the same name exists.

    Args:
        flags: Mapping of flag name to flag value.

    Returns:
        None. Flags are applied one at a time through ``paddle.set_flags``.
    """
    for name, value in flags.items():
        if os.getenv(name) is not None:
            # The environment already defines this flag; leave it untouched.
            continue
        paddle.set_flags({name: value})

# Parse the argument list held in `my_args`, then load the config file it names with the
# requested overrides applied (show=False: do not print it at load time).
args = config.parse_args(my_args)
cfg = config.get_config(args.config, overrides=args.override, show=False)

# Select the device named in the config; the distributed environment is only
# initialised when more than one process is participating.
paddle.set_device(cfg["Global"]["device"])
if dist.get_world_size() > 1:
    env.init_dist_env(cfg)

# Seed before the module is built (presumably so parameter init is reproducible — confirm).
env.set_seed(cfg.Global.seed)

module = build_module(cfg)
config.print_config(cfg)

# Random token ids in [0, 10000), one sequence of length 1024.
# NOTE(review): `input` shadows the builtin and is rebound to a torch tensor by a later
# cell, so cells using `input` depend on execution order.
input = paddle.randint(0, 10000, (1, 1024))

# Pull the underlying model out of the built module and run a single forward pass.
my_345m_model = module.model
out = my_345m_model(input)

  from .autonotebook import tqdm as notebook_tqdm
[32m[2023-06-27 13:10:56,430] [INFO][0m - The global seed is set to 2049 and local seed is set to 2050.[0m
[32m[2023-06-27 13:10:56,431] [INFO][0m - Found gpt2-vocab.json in cache_dir: /root/.cache/ppfleetx/.[0m
[32m[2023-06-27 13:10:56,432] [INFO][0m - Found gpt2-merges.txt in cache_dir: /root/.cache/ppfleetx/.[0m
[32m[2023-06-27 13:10:56,433] [INFO][0m - loading vocabulary file http://fleet.bj.bcebos.com/datasets/gpt/gpt2-vocab.json from cache at /root/.cache/ppfleetx/gpt2-vocab.json[0m
[32m[2023-06-27 13:10:56,433] [INFO][0m - loading merges file http://fleet.bj.bcebos.com/datasets/gpt/gpt2-merges.txt from cache at /root/.cache/ppfleetx/gpt2-merges.txt[0m
[32m[2023-06-27 13:10:56,505] [INFO][0m - Model Size: 0.35 B[0m
W0627 13:10:56.510839 11400 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 8.0, Driver API Version: 11.8, Runtime API Version: 11.7
W0627 13:10:56.511888 11400 gpu_resources.cc:1

In [32]:
# Wrap the Paddle GPT model with create_model; this rebinds the global `model`.
model = create_model(my_345m_model)

In [33]:
# Ten forward/backward passes through the wrapped GPT model.
# NOTE(review): `input` is whichever tensor was bound last (a paddle cell and a torch cell
# both assign it) — confirm the paddle tensor is in scope when this runs.
for i in range(10):
    out = model(input)
    # NOTE(review): the SimpleLayer cell drives backward via model.backward(loss); here
    # backward is called directly on the raw output — confirm which form padiff expects.
    out.backward()
    # presumably dumps the recorded data every `per_step` steps — confirm against padiff docs
    model.try_dump(per_step=10)

In [28]:
import collections
import json
import math
import os
import urllib.request
import re
import sys
import torch

def download(url, file=None, directory=None):
    """Fetch ``url`` into a local cache directory once and return the local path.

    Args:
        url: Address handed to ``urllib.request.urlretrieve``.
        file: Local file name. Defaults to the last ``/``-separated component of ``url``.
        directory: Directory that holds the cached file. Defaults to the directory of
            the notebook this helper was written for, which keeps the behaviour of
            existing two-argument callers unchanged.

    Returns:
        Path of the local file. If it already exists it is returned without any
        network access.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises. A partially written download is
        removed before the exception propagates, so a failed or interrupted transfer is
        never mistaken for a valid cached file on the next call.
    """
    file = file if file else url.split('/')[-1]
    if directory is None:
        directory = os.path.dirname("/root/paddlejob/workspace/work/taozewei/padiff/test.ipynb")
    path = os.path.join(directory, file)
    if os.path.isfile(path):
        return path

    def reporthook(count, block_size, total_size):
        if total_size > 0:
            # The final block is usually short, so count * block_size can overshoot the
            # real size; clamp the figure to 100.
            percent = str(min(100, int(100 * count * block_size / total_size))) + '%'
        else:
            # urlretrieve passes -1 when the size is unknown and 0 for an empty
            # resource; neither can be turned into a percentage.
            percent = '?%'
        print('\r\033[KDownloading ' + file + ' (' + percent + ')', end='', flush=True)

    # Download next to the target and rename only on success, so `path` never
    # refers to a truncated file.
    partial = path + '.part'
    try:
        urllib.request.urlretrieve(url, partial, reporthook=reporthook)
        os.replace(partial, path)
    except BaseException:
        # BaseException so that a KeyboardInterrupt in the notebook also cleans up.
        if os.path.isfile(partial):
            os.remove(partial)
        raise
    finally:
        # Erase the progress line whether or not the transfer succeeded.
        print('\r\033[K', end='', flush=True)
    return path

class BPETokenizer:
    """Byte-level BPE tokenizer built from the GPT-2 124M ``encoder.json`` and ``vocab.bpe`` files.

    Attributes set by ``__init__``:
        encoder / decoder: token string -> id and the inverse mapping.
        bpe_ranks: symbol pair -> merge priority (lower rank merges first).
        byte_encoder / byte_decoder: byte value -> single character and the inverse.
    """

    def __init__(self):
        base_url = 'https://openaipublic.blob.core.windows.net/gpt-2/models/124M/'

        with open(download(base_url + 'encoder.json'), 'r', encoding='utf-8') as handle:
            self.encoder = json.load(handle)
        self.decoder = {index: token for token, index in self.encoder.items()}

        with open(download(base_url + 'vocab.bpe'), 'r', encoding='utf-8') as handle:
            # Drop the first line and the trailing element (presumably a header line and
            # the empty string left after the final newline).
            merge_lines = handle.read().split('\n')[1:-1]
        self.bpe_ranks = {}
        for rank, line in enumerate(merge_lines):
            self.bpe_ranks[tuple(line.split())] = rank
        assert len(self.encoder) == 50257 and len(self.bpe_ranks) == 50000

        # Bytes in these ranges map to the character with the same code point ...
        direct = list(range(33, 127)) + list(range(161, 256))
        # ... every other byte is shifted to code points starting at 256.
        shifted = list(range(0, 33)) + list(range(127, 161))
        self.byte_encoder = {byte: chr(byte) for byte in direct}
        for offset, byte in enumerate(shifted):
            self.byte_encoder[byte] = chr(2**8 + offset)
        self.byte_decoder = {char: byte for byte, char in self.byte_encoder.items()}

    def encode(self, text, allowed_special=None):
        """Split ``text`` into chunks, BPE-merge each chunk, and return the list of token ids.

        ``'<|endoftext|>'`` is only accepted when it is contained in ``allowed_special``;
        otherwise an ``AssertionError`` is raised.
        """
        pattern = (r"""<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d| ?""" +
                   r"""\w+| ?\d+| ?[^\s\w\d]+|\s+(?!\S)|\s+""")
        pieces = re.findall(pattern, text, re.UNICODE)

        def merge_symbols(piece):
            if piece == '<|endoftext|>':
                assert allowed_special and piece in allowed_special
                return [piece]
            symbols = tuple(''.join(self.byte_encoder[byte] for byte in piece.encode('utf-8')))
            while len(symbols) != 1:
                candidates = set(zip(symbols, symbols[1:]))
                best = min(candidates, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
                if best not in self.bpe_ranks:
                    # No adjacent pair is a known merge: the chunk is fully merged.
                    break
                left, right = best
                merged = []
                pos = 0
                last = len(symbols) - 1
                # Single left-to-right pass that fuses every occurrence of (left, right).
                while pos <= last:
                    if pos < last and symbols[pos] == left and symbols[pos + 1] == right:
                        merged.append(left + right)
                        pos += 2
                    else:
                        merged.append(symbols[pos])
                        pos += 1
                symbols = tuple(merged)
            return symbols

        ids = []
        for piece in pieces:
            ids.extend(self.encoder[symbol] for symbol in merge_symbols(piece))
        return ids

    def decode(self, tokens):
        """Map token ids back to text; undecodable byte sequences become U+FFFD."""
        text = ''.join([self.decoder[token] for token in tokens])
        raw = bytearray(self.byte_decoder[char] for char in text)
        return raw.decode('utf-8', errors='replace')

class GPT2Config:
    """Hyper-parameters for one of the four named GPT-2 sizes.

    Raises ``KeyError`` when ``model_type`` is not one of ``'gpt2'``, ``'gpt2-medium'``,
    ``'gpt2-large'`` or ``'gpt2-xl'``.
    """

    def __init__(self, model_type):
        # (n_layer, n_head, n_embd) per model size.
        presets = {
            'gpt2':        (12, 12, 768),   # 124M params
            'gpt2-medium': (24, 16, 1024),  # 350M params
            'gpt2-large':  (36, 20, 1280),  # 774M params
            'gpt2-xl':     (48, 25, 1600),  # 1558M params
        }
        self.type = model_type
        self.n_layer, self.n_head, self.n_embd = presets[model_type]
        self.vocab_size = 50257
        self.block_size = 1024
        self.url = f'https://huggingface.co/{model_type}/resolve/main/pytorch_model.bin'

class Attention(torch.nn.Module):
    """Multi-head self-attention with a lower-triangular (causal) mask.

    ``config`` must provide ``n_embd``, ``n_head`` and ``block_size``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        # One projection produces query, key and value side by side.
        self.c_attn = torch.nn.Linear(width, 3 * width)
        self.c_proj = torch.nn.Linear(width, width)
        self.n_head = config.n_head
        self.n_embd = width
        limit = config.block_size
        # Lower-triangular ones, stored as a buffer named 'bias' and sliced per call.
        mask = torch.tril(torch.ones(limit, limit)).view(1, 1, limit, limit)
        self.register_buffer('bias', mask)

    def forward(self, x):
        """Attend over ``x`` of size (batch, context, embedding); the output has the same size."""
        batch, steps, width = x.size()
        head_dim = width // self.n_head

        def to_heads(t):
            # (batch, context, embedding) -> (batch, head, context, head_dim)
            return t.view(batch, steps, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        visible = self.bias[:, :, :steps, :steps]
        # Masked positions get -inf so softmax assigns them zero weight.
        scores = scores.masked_fill(visible == 0, float('-inf'))
        weights = torch.nn.functional.softmax(scores, dim=-1)
        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, steps, width)
        return self.c_proj(merged)

class Block(torch.nn.Module):
    """Transformer block: LayerNorm + attention and LayerNorm + MLP, each added back residually."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = torch.nn.LayerNorm(width)
        self.ln_2 = torch.nn.LayerNorm(width)
        self.attn = Attention(config)
        # Feed-forward stack: expand to 4x width, tanh-approximated GELU, project back.
        feed_forward = collections.OrderedDict()
        feed_forward['c_fc'] = torch.nn.Linear(width, 4 * width)
        feed_forward['act'] = torch.nn.GELU(approximate='tanh')
        feed_forward['c_proj'] = torch.nn.Linear(4 * width, width)
        self.mlp = torch.nn.Sequential(feed_forward)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        after_attention = x + self.attn(self.ln_1(x))
        return after_attention + self.mlp(self.ln_2(after_attention))

class GPT2(torch.nn.Module):
    """GPT-2 style decoder whose ``forward`` returns logits for every input position.

    The model is built from ``GPT2Config(model_type)``. Weights are randomly
    initialised (pretrained-weight loading is disabled below), and the token
    embedding shares its weight Parameter with ``lm_head``.
    """

    def __init__(self, model_type):
        super().__init__()
        config = GPT2Config(model_type)
        self.block_size = config.block_size
        self.transformer = torch.nn.ModuleDict(dict(
            wte = torch.nn.Embedding(config.vocab_size, config.n_embd),
            wpe = torch.nn.Embedding(config.block_size, config.n_embd),
            h = torch.nn.Sequential(*[Block(config) for _ in range(config.n_layer)]),
            ln_f = torch.nn.LayerNorm(config.n_embd)
        ))
        self.lm_head = torch.nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: the embedding table and the output projection are one Parameter.
        self.transformer.wte.weight = self.lm_head.weight
        # Pretrained-weight loading, currently disabled:
        # state_file = download(config.url, config.type + '.bin')
        # state_dict = torch.load(state_file)
        # transposed = [ '.c_attn.weight', '.c_fc.weight', '.c_proj.weight' ]
        # for key, value in state_dict.items():
        #     if any(key.endswith(w) for w in transposed):
        #         state_dict[key] = value.t()
        # self.transformer.load_state_dict(state_dict)

    def forward(self, x: torch.Tensor, temperature: float = 0.1, top_k: int = 40):
        """Return the logits for token ids ``x``.

        Args:
            x: Integer token ids; a (1, 1024) input yields a (1, 1024, 50257) output
                for 'gpt2-medium'.
            temperature: Accepted for signature compatibility; not used here.
            top_k: Accepted for signature compatibility; not used here.

        Returns:
            Tensor of logits with a trailing dimension of ``vocab_size``.
        """
        # NOTE(review): truncation to block_size and the position embedding (`wpe`) are
        # disabled, so the logits carry no positional information — confirm this is
        # intended for the comparison being run.
        # x = torch.narrow(x, 1, 0, min(x.size(1), self.block_size))
        # pos = torch.arange(x.size()[1], dtype=torch.long, device=x.device).unsqueeze(0)
        # pos_embbeding = self.transformer.wpe(pos)
        x = self.transformer.wte(x)
        x = self.lm_head(self.transformer.ln_f(self.transformer.h(x)))
        return x

    def sample_next_token(self, x: torch.Tensor, temperature: float = 0.1, top_k: int = 40):
        """Sample one next-token id per sequence from the top-k filtered distribution.

        This is the sampling logic that previously sat, unreachable, after the
        ``return`` in ``forward``.

        Args:
            x: Integer token ids, as for ``forward``.
            temperature: Divisor applied to the last-position logits before softmax.
            top_k: Number of highest-scoring tokens kept; capped at the vocabulary size.

        Returns:
            Tensor of sampled token ids with one column per sequence.
        """
        logits = torch.select(self(x), dim=1, index=-1) / temperature
        # Smallest logit that is still inside the top-k set, kept as a column for broadcasting.
        min_top_k = torch.topk(logits, min(top_k, logits.size(-1))).values[:, [-1]]
        logits = torch.where(logits >= min_top_k, logits, -float('Inf'))
        probs = torch.nn.functional.softmax(logits, dim=-1)
        return torch.multinomial(probs, num_samples=1)

In [29]:
# Build the 'gpt2-medium' sized torch model (random weights) and move it to the GPU.
torch_345m_model = GPT2("gpt2-medium").cuda()

In [30]:
# Random token ids in [0, 10000), one sequence of length 1024, on the GPU.
# NOTE(review): rebinds the global `input` (also used for the paddle tensor) and shadows the builtin.
input = torch.randint(0, 10000, (1, 1024)).cuda()

In [32]:
# Run one forward pass and display only the shape of the returned logits.
torch_345m_model(input).shape

torch.Size([1, 1024, 50257])

In [22]:
class base:
    """Plain object whose instances start with ``a == 1`` and ``b == 2``."""

    def __init__(self) -> None:
        # Instance attributes, so each object gets its own copy.
        self.a, self.b = 1, 2

In [23]:
class son1(base):
    """Subclass of ``base`` that adds nothing of its own."""

class son2(base):
    """Second subclass of ``base`` that adds nothing of its own."""

In [25]:
# Instance of the first subclass; base.__init__ gives it a == 1 and b == 2.
a = son1()

In [26]:
# Overwrite the attribute on this one instance only.
a.a = 2

In [27]:
# Instance of the second subclass, created after `a.a` was changed.
b = son2()

In [29]:
# Still 1: the attribute is set per instance in base.__init__, so changing `a.a` did not affect `b`.
b.a

1