In [1]:
import os.path as osp
import sys

# `osp.abspath('')` resolves to the current working directory; its parent
# directory is used as the project root for everything below.
path = osp.dirname(osp.abspath(''))
# Make both the project root and its `open_biomed` sub-directory importable.
sys.path.extend([path, osp.join(path, "open_biomed")])
# Echo the resolved root as the cell output.
path

'/mnt/luoyz_dair/OpenBioMed'

BioMedGPT-10B is composed of a language model (BioMedGPT-LM-7B), a molecule encoder (GraphMVP), a protein encoder (ESM2-3B) and two modality adaptors. For inference, download `config.json`, `special_tokens_map.json`, `tokenizer_config.json`, `tokenizer.json`, `tokenizer.model` for BioMedGPT-LM-7B [here](https://huggingface.co/PharMolix/BioMedGPT-LM-7B) and put them under `OpenBioMed/ckpts/text_ckpts/biomedgpt-lm-7b`. Download `config.json`, `tokenizer_config.json`, `vocab.txt` for ESM2-3B [here](https://huggingface.co/facebook/esm2_t36_3B_UR50D) and put them under `OpenBioMed/ckpts/protein_ckpts/esm2-3b`. Approximately 20GB of GPU memory is required to load BioMedGPT-10B with fp16.

In [2]:
# load biomedgpt model
import json
import torch
from open_biomed.utils import fix_path_in_config
from open_biomed.models.multimodal import BioMedGPTV

# Read the model configuration. The context manager guarantees the file handle
# is closed as soon as parsing finishes (the bare `open(...)` leaked it).
with open("../configs/encoders/multimodal/biomedgptv.json", "r") as config_file:
    config = json.load(config_file)
# NOTE(review): judging by the printed config, this appears to rewrite the
# checkpoint paths in `config` so they are anchored at `path` -- confirm
# against the helper's implementation.
fix_path_in_config(config, path)
print("Config: ", config)

# NOTE: the GPU index is hard-coded; adjust it to a device available on your machine.
device = torch.device("cuda:3")
config["network"]["device"] = device
model = BioMedGPTV(config["network"])
# Deserialize the checkpoint onto the CPU first. Without `map_location`, tensors
# are restored onto whichever device they were saved from, which may not exist
# here or may not be the GPU selected above. The weights are moved to `device`
# by the explicit `.to(device)` below.
ckpt = torch.load("../ckpts/fusion_ckpts/biomedgpt_10b.pth", map_location="cpu")
model.load_state_dict(ckpt)
model = model.to(device)
# Switch to inference mode.
model.eval()
print("Finish loading model")

  from .autonotebook import tqdm as notebook_tqdm


Config:  {'name': 'biomedgptv', 'data': {'mol': {'modality': ['structure'], 'featurizer': {'structure': {'name': 'BaseGNN'}}}, 'protein': {'modality': ['structure'], 'featurizer': {'structure': {'name': 'transformertok', 'transformer_type': 'esm', 'model_name_or_path': '/mnt/luoyz_dair/OpenBioMed/ckpts/protein_ckpts/esm2-3b', 'max_length': 1024}}}}, 'network': {'mol': {'gin_hidden_dim': 300, 'gin_num_layers': 5, 'drop_ratio': 0.0, 'max_n_nodes': 256, 'freeze': True}, 'protein': {'model_name_or_path': '/mnt/luoyz_dair/OpenBioMed/ckpts/protein_ckpts/esm2-3b', 'lora': False, 'freeze': True, 'use_float16': True}, 'llm': {'model_name_or_path': '/mnt/luoyz_dair/OpenBioMed/ckpts/text_ckpts/biomedgpt-lm-7b', 'use_float16': True}}}
Finish loading model


In [3]:
# chat
from open_biomed.utils.chat_utils import Conversation

# System prompt passed to the conversation; adjacent string literals are
# concatenated by the parser, yielding one single-line prompt.
prompt_sys = (
    "You are working as an excellent assistant in chemistry and molecule discovery. "
    "Below a human expert gives the representation of a molecule or a protein. Answer questions about it. "
)

# Build the conversation around the loaded model, reusing the data section of
# the model config as the processor configuration.
chat = Conversation(
    model=model,
    processor_config=config["data"],
    device=device,
    system=prompt_sys,
    roles=("Human", "Assistant"),
    sep="###",
    max_length=2048,
)

# Attach a molecule (given as a SMILES string) and ask about it.
molecule_smiles = "CC(=CCC1=CC(=CC(=C1O)CC=C(C)C)/C=C/C(=O)C2=C(C=C(C=C2)O)O)C"
chat.append_molecule(molecule_smiles)

question = "Please describe this molecule."
print("Human: ", question)
chat.ask(question)
# Only the first element of the returned value is shown.
reply = chat.answer()[0]
print("Assistant: ", reply)

Human:  Please describe this molecule.
Assistant:  The molecule is a member of the class of flavonoids that is 3-hydroxyflavone substituted by a 3-hydroxy-4-methoxyphenyl group at position 7. It has a role as a plant metabolite. It is a member of flavonoids, a member of 3-hydroxyflavones and a member of 3-hydroxyflavones. It derives from a 3-hydroxyflavone.


In [9]:
# Reset the conversation before switching to a protein query
# (presumably this drops the previous turns -- verify in `Conversation.reset`).
chat.reset()

# Attach a protein, given as an amino-acid sequence, and ask about it.
protein_sequence = "MNLNRFERYPLTFGPSPITPLKRLSQHLGGKVELYAKREDCNSGLAFGGNKTRKLEYLIPEAIEQGCDTLVSIGGIQSNQTRQVAAVAAHLGMKCVLVQENWVNYSDAVYDRVGNIEMSRIMGADVRLDAAGFDIGIRPSWEKAMSDVVEQGGKPFPIPAGCSEHPYGGLGFVGFAEEVRQQEKELGFKFDYIVVCSVTGSTQAGMVVGFAADGRSKNVIGIDASAKPEQTKAQILRIARHTAELVELGREITEEDVVLDTRFAYPEYGLPNEGTLEAIRLCGSLEGVLTDPVYEGKSMHGMIEMVRRGEFPEGSKVLYAHLGGAPALNAYSFLFRNG"
chat.append_protein(protein_sequence)

question = "What is the protein family of this protein?"
print("Human: ", question)
chat.ask(question)
# Only the first element of the returned value is shown.
reply = chat.answer()[0]
print("Assistant: ", reply)

Human:  What is the protein family of this protein?
Assistant:  Belongs to the ACC deaminase/D-cysteine desulfhydrase family.


In [6]:
# Reset the conversation, then ask a plain-text question with no molecule or
# protein attached (presumably reset drops the previous turns -- verify in
# `Conversation.reset`).
chat.reset()

question = "Please describe the EGFR pathway."
print("Human: ", question)
chat.ask(question)
# Only the first element of the returned value is shown.
reply = chat.answer()[0]
print("Assistant: ", reply)

Human:  Please describe the EGFR pathway.
Assistant:  1. The EGFR pathway is a signaling pathway that is activated by the binding of the epidermal growth factor (EGF) to its receptor. 2. The EGFR pathway is involved in a variety of cellular processes, including cell proliferation, differentiation, and survival. 3. The EGFR pathway is often dysregulated in cancer, leading to uncontrolled cell growth and tumor formation. 4. The EGFR pathway is a complex network of interactions between multiple proteins, including EGFR, ErbB2, ErbB3, and ErbB4. 5. The EGFR pathway is regulated by a variety of molecules, including growth factors, cytokines, and hormones. 6. The EGFR pathway is involved in a variety of diseases, including cancer, inflammation, and neurodegenerative disorders. 7. The EGFR pathway is a promising target for the development of new cancer therapies. 8. The EGFR pathway is also involved in other diseases, such as inflammation
