## BioGPT: Biomedical Text vs Code Generation

In [11]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Resolve the on-disk BioGPT snapshot directory inside the local model cache.
model_path = "/mnt/liuzq/modelscope_cache/models/xw-download/AIModel/microsoft/biogpt"
snapshots_path = os.path.join(model_path, "snapshots")

# os.listdir() returns entries in arbitrary order and also lists plain files,
# so keep only sub-directories and sort them: the same snapshot is then chosen
# on every run instead of whichever entry the filesystem happens to list first.
snapshot_dirs = sorted(
    entry
    for entry in os.listdir(snapshots_path)
    if os.path.isdir(os.path.join(snapshots_path, entry))
)
if not snapshot_dirs:
    # Fail with a message that names the directory rather than a bare IndexError.
    raise FileNotFoundError(f"No snapshot directory found under {snapshots_path}")

snapshot_dir = snapshot_dirs[0]
real_model_path = os.path.join(snapshots_path, snapshot_dir)

In [3]:
# Instantiate the BioGPT tokenizer and causal-LM weights from the snapshot
# directory resolved in the previous cell (real_model_path).
tokenizer = AutoTokenizer.from_pretrained(
    real_model_path,
)
model = AutoModelForCausalLM.from_pretrained(
    real_model_path,
)

In [4]:
# Test biomedical question: let BioGPT continue a short biomedical prompt.
prompt1 = "COVID-19 is"
inputs1 = tokenizer(prompt1, return_tensors="pt")

# max_length counts prompt tokens plus generated tokens. The previous fixed
# budget of 10000 can exceed the number of positions the model is able to
# embed, so a generation that never emits EOS would run past the position
# table. Cap the budget at the context window the loaded config reports.
# NOTE(review): the published BioGPT config uses 1024 positions — confirm
# against the local snapshot.
max_length1 = min(10000, model.config.max_position_embeddings)

outputs1 = model.generate(
    **inputs1,
    max_length=max_length1,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,  # reuse the EOS id as the padding id
)
result1 = tokenizer.decode(outputs1[0], skip_special_tokens=True)
print(f"Prompt: {prompt1}")
print(f"Output: {result1}\n")

Prompt: COVID-19 is
Output: COVID-19 is a global pandemic.



In [8]:
# Test code generation: ask BioGPT for R code and sample a continuation.
prompt2 = "Write R code for differential expression analysis using limma package"
inputs2 = tokenizer(prompt2, return_tensors="pt")

# max_length counts prompt tokens plus generated tokens. The previous fixed
# budget of 10000 can exceed the number of positions the model is able to
# embed, so cap it at the context window the loaded config reports.
# NOTE(review): the published BioGPT config uses 1024 positions — confirm
# against the local snapshot.
max_length2 = min(10000, model.config.max_position_embeddings)

# do_sample=True draws tokens at random; fix the RNG seed immediately before
# generating so that re-running this cell reproduces the same text.
torch.manual_seed(42)
outputs2 = model.generate(
    **inputs2,
    max_length=max_length2,
    num_return_sequences=1,
    do_sample=True,
    top_p=0.95,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,  # reuse the EOS id as the padding id
)
result2 = tokenizer.decode(outputs2[0], skip_special_tokens=True)
print(f"Prompt: {prompt2}")
print(f"Output: {result2}\n")

Prompt: Write R code for differential expression analysis using limma package
Output: Write R code for differential expression analysis using limma package.



## BioMistral: Biomedical Text vs Code Generation

In [12]:
# Local snapshot directory of the BioMistral-7B checkpoint.
# Adjacent string literals are concatenated by the parser, so this is the
# same single path value, just wrapped for readability.
model_path = (
    "/mnt/liuzq/modelscope_cache/models/xw-download/AIModel/"
    "BioMistral/BioMistral-7B/snapshots/"
    "9a11e1ffa817c211cbb52ee1fb312dc6b61b40a5/"
)

In [13]:
# Instantiate the BioMistral tokenizer and causal-LM weights from model_path.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",          # let the loader decide device placement
    torch_dtype=torch.float16,  # load the weights in half precision
)

In [15]:
# Biomedical prompt: let BioMistral continue a short statement.
# No sampling arguments are passed, so decoding follows the model's default
# generation settings.
prompt1 = "COVID-19 is"

# Tokenize to PyTorch tensors and move them to the device the model reports.
inputs1 = tokenizer(prompt1, return_tensors="pt").to(model.device)

outputs1 = model.generate(
    **inputs1,
    max_length=10000,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,  # EOS id doubles as the padding id
)

# Only one sequence is requested, so decode the first (and only) row.
result1 = tokenizer.decode(outputs1[0], skip_special_tokens=True)
print(f"Prompt: {prompt1}")
print(f"Output: {result1}\n")

Prompt: COVID-19 is
Output: COVID-19 is a novel coronavirus that was first identified in December 2019 in Wuhan, China. It is a highly infectious disease that has spread rapidly around the world. The World Health Organization (WHO) declared COVID-19 a pandemic on 11 March 2020. As of 15 June 2020, there have been 7,800,000 confirmed cases and 430,000 deaths worldwide .



In [16]:
# Test code generation: ask BioMistral for R code and sample a continuation.
prompt2 = "Write R code for differential expression analysis using limma package"

# Tokenize to PyTorch tensors and move them to the device the model reports.
inputs2 = tokenizer(prompt2, return_tensors="pt").to(model.device)

# do_sample=True draws tokens at random; fix the RNG seed immediately before
# generating so that re-running this cell reproduces the same text.
torch.manual_seed(42)
outputs2 = model.generate(
    **inputs2,
    max_length=10000,
    num_return_sequences=1,
    do_sample=True,
    top_p=0.95,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,  # reuse the EOS id as the padding id
)
result2 = tokenizer.decode(outputs2[0], skip_special_tokens=True)
print(f"Prompt: {prompt2}")
print(f"Output: {result2}\n")

Prompt: Write R code for differential expression analysis using limma package
Output: Write R code for differential expression analysis using limma package. Note the difference between the limma design matrix in this example and the design matrixes in examples 2 and 3. In example 2 and 3, a column of the design matrix was used to indicate the factor of interest for each sample, while in this example, a row of the design matrix is used to indicate the factor of interest for each gene.



## BioMedLM: Biomedical Text vs Code Generation

In [17]:
# Local snapshot directory of the BioMedLM checkpoint.
# NOTE(review): the "-BioMedLM" directory name (leading hyphen) is kept
# exactly as in the original path — confirm it matches the cache layout.
model_path = (
    "/mnt/liuzq/modelscope_cache/models/xw-download/AIModel/"
    "stanford-crfm/-BioMedLM/snapshots/"
    "3e1a0abb814b8398bc34b4b6680ecf2c26d6a66f/"
)

In [18]:
# Instantiate the BioMedLM tokenizer and causal-LM weights from model_path.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
)

In [20]:
# Biomedical prompt: let BioMedLM continue a short statement.
# No sampling arguments are passed, so decoding follows the model's default
# generation settings.
prompt1 = "COVID-19 is"
inputs1 = tokenizer(prompt1, return_tensors="pt")

outputs1 = model.generate(
    **inputs1,
    max_length=100,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,  # EOS id doubles as the padding id
)

# Only one sequence is requested, so decode the first (and only) row.
result1 = tokenizer.decode(outputs1[0], skip_special_tokens=True)
print(f"Prompt: {prompt1}")
print(f"Output: {result1}\n")

Prompt: COVID-19 is
Output: COVID-19 is a novel coronavirus, the virus is a major cause of the disease.

The authors declare no competing interests.

Author contributions {#s0010}



In [21]:
# Test code generation: ask BioMedLM for R code and sample a continuation.
prompt2 = "Write R code for differential expression analysis using limma package"
inputs2 = tokenizer(prompt2, return_tensors="pt")

# do_sample=True draws tokens at random; fix the RNG seed immediately before
# generating so that re-running this cell reproduces the same text.
torch.manual_seed(42)
outputs2 = model.generate(
    **inputs2,
    max_length=100,
    num_return_sequences=1,
    do_sample=True,
    top_p=0.95,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,  # reuse the EOS id as the padding id
)
result2 = tokenizer.decode(outputs2[0], skip_special_tokens=True)
print(f"Prompt: {prompt2}")
print(f"Output: {result2}\n")

Prompt: Write R code for differential expression analysis using limma package
Output: Write R code for differential expression analysis using limma package (see the [Supplementary Materials and methods. A total of 25,432 of the 25,432 (16.9%) of the 144 total genes were shared between the two datasets (see [Figure 4---figure supplement 4](#SD1){ref-type="supplementary-material"}.

              

