In [None]:
import sys
import os
import numpy as np
import tqdm

import mmap
import re
from argparse import ArgumentParser

import transformers
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import torch
from sfm.models.scigpt.scigpt import ScigptModel
from sfm.models.scigpt.config import ScigptConfig
from sfm.utils import arg_utils
from argparse import ArgumentParser

import multiprocessing as mp
from sfm.utils.science_tokens import SCIENCE_TAG_TOKENS, SCIENCE_TOKENS

from sfm.logging import logger

import struct
from multiprocessing import Lock


In [None]:

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

def get_args_and_tokenizer(use_llama=False):
    parser = ArgumentParser()
    cfg_classes = [ScigptConfig]
    parser = arg_utils.add_dataclass_to_parser(cfg_classes, parser)
    args = parser.parse_args(args=[])
    args.load_ckpt = False
    args.strategy = "DDP"
    args.encoder_layers = 33
    args.encoder_embed_dim = 1280
    args.encoder_ffn_embed_dim = 5120
    args.encoder_attention_heads = 20
    args.infer = True
    args.bf16 = True

    tokenizer = AutoTokenizer.from_pretrained("/home/v-zekunguo/hai1data/llama/Meta-Llama-3-8B/original")
    args.save_dir = "/home/v-zekunguo/hai1data/nlm/output/llama3_stageB/global_step1600/"
    args.save_dir = "/home/v-zekunguo/hai1data/llama/Meta-Llama-3-8B/original"
    args.llm_model_name_or_path = "/home/v-zekunguo/hai1data/llama/Meta-Llama-3-8B/original"

    special_tokens_dict = dict()
    if tokenizer.pad_token is None:
        special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
    if tokenizer.eos_token is None:
        special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
    if tokenizer.bos_token is None:
        special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
    if tokenizer.unk_token is None:
        special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

    # special_tokens_dict["additional_special_tokens"] = SCIENCE_TAG_TOKENS
    tokenizer.add_special_tokens(special_tokens_dict)
    tokenizer.tag_re = re.compile(f'{"|".join(SCIENCE_TAG_TOKENS)}')
    tokenizer.smiles_re = re.compile(
        "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    )

    tokenizer.add_special_tokens(
        {
            "pad_token": "[PAD]",
            "unk_token":"<unk>",
        },

    )

    tokenizer.add_tokens(SCIENCE_TAG_TOKENS)
    tokenizer.add_tokens(SCIENCE_TOKENS)
    extra_tokens = []
    # protein
    for i in range(26):
        extra_tokens.append(f"<a>{chr(65 + i)}")

    # DNA, RNA, including ambiguous bases
    for c in "ACTGURYSWKMBDHVN":
        extra_tokens.append(f"<d>{c}")
        extra_tokens.append(f"<r>{c}")

    # materials, non-elements
    for c in "0123456789()+-":
        extra_tokens.append(f"<i>{c}")
    for i in range(26):
        extra_tokens.append(f"<i>{chr(65 + i)}")
        extra_tokens.append(f"<i>{chr(97 + i)}")

    tokenizer.add_tokens(extra_tokens)
    tokenizer.split_special_tokens = True  # Ensure _tokenize() can access special tokens

    logger.info(f"Tokenizer has {len(tokenizer)} tokens")

    args.vocab_size=len(tokenizer)

    return args, tokenizer

args, tokenizer = get_args_and_tokenizer()
print(type(tokenizer))

In [3]:
args.vocab_size=130304

In [4]:
# Loading the extended trained model
ckpt_dict = {}

model = ScigptModel(args)
model.decoder.resize_token_embeddings(args.vocab_size)
model_dict = model.state_dict()
print(f"model_dict: {model_dict.keys()}")
print(model_dict['decoder.model.layers.0.mlp.gate_proj.weight'].shape)
print(model_dict['decoder.model.layers.0.mlp.up_proj.weight'].shape)
weight1_size=model_dict['decoder.model.layers.0.mlp.gate_proj.weight'].size(0)
weight2_size=model_dict['decoder.model.layers.0.mlp.up_proj.weight'].size(0)
layer0 = torch.load(os.path.join(args.save_dir, "layer_00-model_00-model_states.pt"), map_location=torch.device("cpu"))
for k, v in layer0.items():
    if k=='word_embeddings.weight':
        ckpt_dict['decoder.model.embed_tokens.weight'] = v

for l in range(0, 32):
    l_index = str(l + 1).zfill(2)
    layer = torch.load(os.path.join(args.save_dir, f"layer_{l_index}-model_00-model_states.pt"), map_location=torch.device("cpu"))
    for k in layer:
        if "dummy" in k or 'rotary_emb' in k:
            continue
        if k=="self_attention.layernorm_qkv.layer_norm_weight":
            ckpt_dict[f"decoder.model.layers.{l}.input_layernorm.weight"] = layer[k]
        elif k=='self_attention.layernorm_qkv.query_weight':
            ckpt_dict[f"decoder.model.layers.{l}.self_attn.q_proj.weight"] = layer[k]
        elif k=='self_attention.layernorm_qkv.key_weight':
            ckpt_dict[f"decoder.model.layers.{l}.self_attn.k_proj.weight"] = layer[k]
        elif k=='self_attention.layernorm_qkv.value_weight':
            ckpt_dict[f"decoder.model.layers.{l}.self_attn.v_proj.weight"] = layer[k]
        elif k=='self_attention.proj.weight':
            ckpt_dict[f"decoder.model.layers.{l}.self_attn.o_proj.weight"] = layer[k]
        elif k=='layernorm_mlp.layer_norm_weight':
            ckpt_dict[f"decoder.model.layers.{l}.post_attention_layernorm.weight"] = layer[k]
        elif k=='layernorm_mlp.fc1_weight':
            weight1,weight2=torch.split(layer[k], [weight1_size, weight2_size], dim=0)
            ckpt_dict[f"decoder.model.layers.{l}.mlp.gate_proj.weight"] = weight1
            ckpt_dict[f"decoder.model.layers.{l}.mlp.up_proj.weight"] = weight2
        elif k=='layernorm_mlp.fc2_weight':
            ckpt_dict[f"decoder.model.layers.{l}.mlp.down_proj.weight"] = layer[k]
    del layer

layer = torch.load(os.path.join(args.save_dir, "layer_33-model_00-model_states.pt"), map_location=torch.device("cpu"))
ckpt_dict["decoder.model.norm.weight"] = layer["norm.weight"]

layer = torch.load(os.path.join(args.save_dir, "layer_34-model_00-model_states.pt"), map_location=torch.device("cpu"))
ckpt_dict["decoder.lm_head.weight"] = layer["lm_head.weight"]

print(f"ckpt_dict: {ckpt_dict.keys()}")
model_dict.update(ckpt_dict)
model.load_state_dict(model_dict)



model_dict: odict_keys(['decoder.model.embed_tokens.weight', 'decoder.model.layers.0.self_attn.q_proj.weight', 'decoder.model.layers.0.self_attn.k_proj.weight', 'decoder.model.layers.0.self_attn.v_proj.weight', 'decoder.model.layers.0.self_attn.o_proj.weight', 'decoder.model.layers.0.mlp.gate_proj.weight', 'decoder.model.layers.0.mlp.up_proj.weight', 'decoder.model.layers.0.mlp.down_proj.weight', 'decoder.model.layers.0.input_layernorm.weight', 'decoder.model.layers.0.post_attention_layernorm.weight', 'decoder.model.layers.1.self_attn.q_proj.weight', 'decoder.model.layers.1.self_attn.k_proj.weight', 'decoder.model.layers.1.self_attn.v_proj.weight', 'decoder.model.layers.1.self_attn.o_proj.weight', 'decoder.model.layers.1.mlp.gate_proj.weight', 'decoder.model.layers.1.mlp.up_proj.weight', 'decoder.model.layers.1.mlp.down_proj.weight', 'decoder.model.layers.1.input_layernorm.weight', 'decoder.model.layers.1.post_attention_layernorm.weight', 'decoder.model.layers.2.self_attn.q_proj.weight

<All keys matched successfully>

In [3]:
ckpt_dict = {}
# Load the original llama3 model
model = ScigptModel(args)

model_dict = model.state_dict()
print(f"model_dict: {model_dict.keys()}")

layer0 = torch.load(os.path.join(args.save_dir, "layer_00-model_states.pt"), map_location=torch.device("cpu"))
for k, v in layer0.items():
    new_k = "decoder.model." + k
    ckpt_dict[new_k] = v

for l in range(0, 32):
    l_index = str(l + 1).zfill(2)
    layer = torch.load(os.path.join(args.save_dir, f"layer_{l_index}-model_states.pt"), map_location=torch.device("cpu"))
    for k in layer:
        if "dummy" in k or 'rotary_emb' in k:
            continue
        ckpt_dict[f"decoder.model.layers.{l}.{k}"] = layer[k]
    del layer

layer = torch.load(os.path.join(args.save_dir, "layer_33-model_states.pt"), map_location=torch.device("cpu"))
ckpt_dict["decoder.model.norm.weight"] = layer["norm.weight"]

layer = torch.load(os.path.join(args.save_dir, "layer_34-model_states.pt"), map_location=torch.device("cpu"))
ckpt_dict["decoder.lm_head.weight"] = layer["lm_head.weight"]

print(f"ckpt_dict: {ckpt_dict.keys()}")
model_dict.update(ckpt_dict)
model.load_state_dict(model_dict)



model_dict: odict_keys(['decoder.model.embed_tokens.weight', 'decoder.model.layers.0.self_attn.q_proj.weight', 'decoder.model.layers.0.self_attn.k_proj.weight', 'decoder.model.layers.0.self_attn.v_proj.weight', 'decoder.model.layers.0.self_attn.o_proj.weight', 'decoder.model.layers.0.mlp.gate_proj.weight', 'decoder.model.layers.0.mlp.up_proj.weight', 'decoder.model.layers.0.mlp.down_proj.weight', 'decoder.model.layers.0.input_layernorm.weight', 'decoder.model.layers.0.post_attention_layernorm.weight', 'decoder.model.layers.1.self_attn.q_proj.weight', 'decoder.model.layers.1.self_attn.k_proj.weight', 'decoder.model.layers.1.self_attn.v_proj.weight', 'decoder.model.layers.1.self_attn.o_proj.weight', 'decoder.model.layers.1.mlp.gate_proj.weight', 'decoder.model.layers.1.mlp.up_proj.weight', 'decoder.model.layers.1.mlp.down_proj.weight', 'decoder.model.layers.1.input_layernorm.weight', 'decoder.model.layers.1.post_attention_layernorm.weight', 'decoder.model.layers.2.self_attn.q_proj.weight

<All keys matched successfully>

In [None]:
device = torch.device("cuda")

model = model.to(torch.bfloat16).to(device)

model.eval()


In [6]:
import lmdb
from sfm.data.prot_data.util import bstr2obj
# load data
file_path='/home/v-zekunguo/hai1data/nlm/valid_lmdb/valid.patent.v2.txt.lmdb'
env = lmdb.open(
    file_path, subdir=True, readonly=True, lock=False, readahead=False
)
txn = env.begin(write=False)

print(env.stat())
count=0
metadata = bstr2obj(txn.get("metadata".encode()))
cur_len, cur_keys = metadata["size"], metadata["keys"]
print(cur_len)

{'psize': 4096, 'depth': 2, 'branch_pages': 1, 'leaf_pages': 38, 'overflow_pages': 33698, 'entries': 3745}
3744


In [None]:
import numpy as np
# Calculate loss
print(metadata.keys())
loss_list=[]
print(metadata['processed_seq_len'])
for key in cur_keys:
    value = txn.get(str(key).encode())
    input_ids = np.frombuffer(value, dtype=np.uint32)
    input_tensor = torch.from_numpy(input_ids.astype(np.int64)).unsqueeze(0).to(device)
    labels = input_tensor.clone()
    out = model.decoder(input_tensor, labels=labels)
    input_tensor.to("cpu")
    labels.to("cpu")
    print(out.loss.cpu().item())
    loss_list.append(out.loss.cpu().item())
    out = None
    torch.cuda.empty_cache()

    del out
print(sum(loss_list) / len(loss_list))

In [24]:
tokenizer.decode(input_ids)

'<|begin_of_text|>Funny you should ask. I was just looking at an article on a "Frozen" clear coat. Gives a matte-like breakup of reflections - and ease the eyestrain on the "Earl Scheib". Maybe the paint supply store could help?\\nGene--Funny you should ask! Just yesterday I got my car back with the brand new repaired & painted front bumper. Same problem. The color matches, but the bumper is way too shiny with the clear coat they had to put on it. I am wondering the same thing--how to dull it a little to match the rest of my old paint job.\\nThere are white Scotchbrite pads that might be able to take the gloss down without scratching. I think they are available at HD and Lowes.\\nIf that won\'t work maybe do the whole car w/ 0000 steel wool?\\nI just got done rubbing my bumper out. It looks pretty good in that it now doesn\'t look out of place on my car.\\nBe gentle. Practice on a non-visible area like under the nose first. Don\'t use a heavy hand or rub too long. Just enough to bust t

In [27]:
tokenizer.decode(input_tensor[0])

'<|begin_of_text|>Funny you should ask. I was just looking at an article on a "Frozen" clear coat. Gives a matte-like breakup of reflections - and ease the eyestrain on the "Earl Scheib". Maybe the paint supply store could help?\\nGene--Funny you should ask! Just yesterday I got my'

In [14]:
tokenizer.encode("Football is a ", return_tensors="pt")
input_tensor

tensor([[128000,  99886,    347,   3904,   8105,     12,   1187,     13,    578,
          24549,    315,    904,    832,    315,  30227,    347,  12843,   8105,
             12,     16]])

In [26]:
output = model.decoder.generate(
    input_ids=torch.tensor(input_tensor).to(device),
    num_beams=5,
    max_new_tokens=512,
    num_return_sequences=1,
    return_dict_in_generate=True,
    output_scores=True,
    do_sample=True,
    top_p=0.95,
    repetition_penalty=1.5,
)
res = tokenizer.decode(output.sequences[0], skip_special_tokens=False)
print(res)

  input_ids=torch.tensor(input_tensor).to(device),
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Funny you should ask. I was just looking at an article on a "Frozen" clear coat. Gives a matte-like breakup of reflections - and ease the eyestrain on the "Earl Scheib". Maybe the paint supply store could help?\nGene--Funny you should ask! Just yesterday I got my hands on a sample of the new Dupli-Color Satin Clear Coat. It\'s a 2:1 mix of satin clear and regular clear, and it\'s designed to be used as a topcoat over Dupli-Color\'s line of satin colors. I haven\'t tried it yet, so I can\'t tell you how well it works, but it looks pretty good. It\'s available at most auto parts stores.\nI\'ve also heard good things about PPG\'s Ditzler line of flat clear coats, but I don\'t know if they\'re available anywhere other than PPG dealerships.\nHope this helps!\nMike
Funny you should ask. I was just looking at an article on a "Frozen" clear coat. Gives a matte-like breakup of reflections - and ease the eyestrain on the "Earl Scheib". Maybe the paint supply store could help?\nG

In [11]:
input_ids=torch.tensor(tokenizer.encode("Football is a ", return_tensors="pt")).to(device)
labels = input_ids.clone()
out = model.decoder(input_ids,labels=labels)

  input_ids=torch.tensor(tokenizer.encode("Football is a ", return_tensors="pt")).to(device)


In [7]:
output = model.decoder.generate(
    input_ids=torch.tensor(tokenizer.encode("Football is a ", return_tensors="pt")).to(device),
    num_beams=5,
    max_new_tokens=512,
    num_return_sequences=1,
    return_dict_in_generate=True,
    output_scores=True,
    do_sample=True,
    top_p=0.95,
    repetition_penalty=1.5,
)
res = tokenizer.decode(output.sequences[0], skip_special_tokens=False)
print(res)

  input_ids=torch.tensor(tokenizer.encode("Football is a ", return_tensors="pt")).to(device),
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Football is a 24/7 sport, and you can't just turn it off," he said. "It's not like baseball or basketball where you can take a few days off and then come back to it. You've got to be ready to go all the time. It's a completely different mindset."\nHe added: "I feel like I'm in the best shape I've ever been in. I'm stronger, faster, quicker and more explosive than I've ever been. I'm looking forward to getting out there and showing what I can do."\nAs for his future with the Red Sox, Pedroia said: "I don't know what's going to happen. I don't know what's going to happen. I don't know what's going to happen next year. I don't know what's going to happen the year after that. I don't know what's going to happen the year after that. I don't know what's going to happen the year after that. I don't know what's going to happen the year after that. I don't know what's going to happen the year after that. I don't know what's going to happen the year after that. I don't know what's going to happe

In [None]:
out