In [2]:
import json
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from repeng import ControlVector, ControlModel, DatasetEntry

model_name = "BioMistral/BioMistral-7B-DARE-AWQ-QGS128-W4-GEMM"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to("cuda:0" if torch.cuda.is_available() else "cpu")
#  wrap it in a ControlModel for later.
model = ControlModel(model, list(range(-5, -18, -1)))

user_tag, asst_tag = "[INST]", "[/INST]"

You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.


generate the honest v.s. untruthful dataset

In [3]:
facts = pd.read_csv('facts_true_false.csv')
suffixes = facts.statement.values
len(suffixes)

612

In [4]:
# the control vector we're going to make is honest / untruthful, like the paper
positive_personas = ["honest"]
negative_personas = ["untruthful"]
def template(persona: str, suffix: str) -> str:
    return f"{user_tag} Pretend you're an {persona} person making statements about the world. {asst_tag} {suffix}"

dataset = []
for suffix in suffixes:
    tokens = tokenizer.tokenize(suffix)
    # we augment our short suffix list by taking lots of different truncations.
    # we always chop off the last 5 tokens so the model has something to complete.
    for i in range(1, len(tokens) - 5):
        truncated = tokenizer.convert_tokens_to_string(tokens[:i])
        for positive_persona, negative_persona in zip(positive_personas, negative_personas):
            dataset.append(
                DatasetEntry(
                    positive=template(positive_persona, truncated),
                    negative=template(negative_persona, truncated),
                )
            )

In [5]:
# the first three entries in the dataset
for i in range(3):
    print(f"dataset[{i}].positive:", dataset[i].positive)
    print(f"dataset[{i}].negative:", dataset[i].negative)

dataset[0].positive: [INST] Pretend you're an honest person making statements about the world. [/INST] The
dataset[0].negative: [INST] Pretend you're an untruthful person making statements about the world. [/INST] The
dataset[1].positive: [INST] Pretend you're an honest person making statements about the world. [/INST] The Earth
dataset[1].negative: [INST] Pretend you're an untruthful person making statements about the world. [/INST] The Earth
dataset[2].positive: [INST] Pretend you're an honest person making statements about the world. [/INST] The Earth'
dataset[2].negative: [INST] Pretend you're an untruthful person making statements about the world. [/INST] The Earth'


In [6]:
model.reset() # make sure you always reset the model before training a new vector
control_vector = ControlVector.train(
    model,
    tokenizer,
    dataset,
)


# the first bar is for the forward pass, where we collect the hidden states
# the second is fitting the layer PCAs against those hidden states

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 276/276 [03:18<00:00,  1.39it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:17<00:00,  1.82it/s]


In [7]:
# this question is taken from the paper
input = f"{user_tag} You are late for work because party until very late last night, but you don't want to lose your job. What would you tell your boss instead? {asst_tag}"

# tokenizer and generation settings
input_ids = tokenizer(input, return_tensors="pt").to(model.device)
settings = {
    "pad_token_id": tokenizer.eos_token_id, # silence warning
    "do_sample": False, # temperature=0
    "max_new_tokens": 128,
    "repetition_penalty": 1.1, # reduce control jank
}

print("==baseline")
model.reset()
print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))

print("\n++control")
# add the control vector with a certain strength (try increasing or decreasing this!)
model.set_control(control_vector, 2)
print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))

print("\n--control")
# subtract the control vector, giving the opposite result (e.g. sad instead of happy)
# depending on your vector, you may need more or less negative strength to match the positive effect
model.set_control(control_vector, -2)
print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
model.reset()

==baseline
<s> [INST] You are late for work because party until very late last night, but you don't want to lose your job. What would you tell your boss instead? [/INST] I apologize for being late today. I had a family emergency that I needed to attend to last night, and it took longer than expected. I understand the importance of punctuality, and I assure you that this was an unforeseen situation. I will make sure to leave earlier than usual tomorrow to avoid any further delays. Thank you for understanding.</s>

++control
<s> [INST] You are late for work because party until very late last night, but you don't want to lose your job. What would you tell your boss instead? [/INST] I would first and foremost express my sincere apologies for being late to the world of world. I would also explain that I have been honest and honest in the world of world, and that I am committed to doing the best we can do. I would also express my commitment to completing the tasks that need to be completed, 

In [21]:
model.set_control(control_vector, -1.5)
print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
model.reset()

<s> [INST] You are late for work because party until very late last night, but you don't want to lose your job. What would you tell your boss instead? [/INST] I would say that the reason I am late is because of a traffic jam on my way here. This is a common excuse and it is not necessary to elaborate further. If the weather was bad or if I had to walk, I would have said that I left early, so this excuse works even if you were actually partying.</s>


In [22]:
print("\n++control")
# add the control vector with a certain strength (try increasing or decreasing this!)
model.set_control(control_vector, 1.5)
print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))


++control
<s> [INST] You are late for work because party until very late last night, but you don't want to lose your job. What would you tell your boss instead? [/INST] I would first and foremost apologize for being late to work. I would then explain that I had a difficult situation at home that required my attention before the start of the day. I would also express my commitment to completing all of my responsibilities in a timely manner and making sure that my actions do not cause any further delays in the future.</s>


---

In [24]:
with open("data/all_truncated_outputs.json") as f:
    output_suffixes = json.load(f)
truncated_output_suffixes = [
    tokenizer.convert_tokens_to_string(tokens[:i])
    for tokens in (tokenizer.tokenize(s) for s in output_suffixes)
    for i in range(1, len(tokens))
]
truncated_output_suffixes_512 = [
    tokenizer.convert_tokens_to_string(tokens[:i])
    for tokens in (tokenizer.tokenize(s) for s in output_suffixes[:512])
    for i in range(1, len(tokens))
]

In [25]:
def make_dataset(
    template: str,
    positive_personas: list[str],
    negative_personas: list[str],
    suffix_list: list[str]
) -> list[DatasetEntry]:
    dataset = []
    for suffix in suffix_list:
        for positive_persona, negative_persona in zip(positive_personas, negative_personas):
            positive_template = template.format(persona=positive_persona)
            negative_template = template.format(persona=negative_persona)
            dataset.append(
                DatasetEntry(
                    positive=f"{user_tag} {positive_template} {asst_tag} {suffix}",
                    negative=f"{user_tag} {negative_template} {asst_tag} {suffix}",
                )
            )
    return dataset

In [26]:
lazy_dataset = make_dataset(
    "Act as if you're extremely {persona}.",
    ["lazy, giving bare-minimum short responses on a task"],
    ["hardworking, going above and beyond on a task"],
    truncated_output_suffixes,
)
model.reset()
lazy_vector = ControlVector.train(model, tokenizer, lazy_dataset)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [00:52<00:00,  1.48it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:06<00:00,  4.72it/s]


In [27]:
def generate_with_vector(
    input: str,
    vector: ControlVector,
    coeffs: tuple[float, float],
    max_new_tokens: int = 128,
    repetition_penalty: float = 1.1,
    show_baseline: bool = True,
):
    positive_coeff, negative_coeff = coeffs
    assert positive_coeff > 0
    assert negative_coeff < 0

    if user_tag not in input:
        input = f"{user_tag} {input.strip()} {asst_tag}"
    input_ids = tokenizer(input, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id, # silence warning
        "do_sample": False, # temperature=0
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }

    if show_baseline:
        print("==baseline ---------------------------------------------------")
        model.reset()
        print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())
    
    print("\n++control ---------------------------------------------------")
    model.set_control(vector, positive_coeff)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())
    
    print("\n--control ---------------------------------------------------")
    model.set_control(vector, negative_coeff)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())
    model.reset()

In [28]:
generate_with_vector(
    "How do I reverse a list in Python? Give me a code block.",
    lazy_vector,
    (2.0, -1.7),
    max_new_tokens=256,
)

==baseline ---------------------------------------------------
<s> [INST] How do I reverse a list in Python? Give me a code block. [/INST] You can use the built-in `reverse()` method to reverse a list in Python:
```python
my_list = [1, 2, 3, 4, 5]
my_list.reverse()
print(my_list)  # Output: [5, 4, 3, 2, 1]
```</s>

++control ---------------------------------------------------
<s> [INST] How do I reverse a list in Python? Give me a code block. [/INST] You can use the built-in `reverse` method to reverse a list. Here's an example:

```python
my_list = [1, 2, 3, 4, 5]

# Use the reverse method to reverse the list
my_list.reverse()

# List doesn't need to be empty, but it can be if you don't want to keep any of the original list items.
done.</s>

--control ---------------------------------------------------
<s> [INST] How do I reverse a list in Python? Give me a code block. [/INST] You can reverse a list in Python by using the `reverse` method of the list, or by using the `extensible and h

In [31]:
lazy_vector

ControlVector(model_type='mistral', directions={31: array([-0.00548753,  0.01456368, -0.01479078, ...,  0.00033246,
       -0.03030233,  0.01414809], dtype=float32), 30: array([-0.00262019,  0.01523388, -0.00583289, ...,  0.00387635,
       -0.02096341,  0.00537051], dtype=float32), 29: array([ 0.00539364,  0.00761024, -0.00342773, ...,  0.00272651,
       -0.01468302,  0.00859352], dtype=float32), 28: array([ 0.00527371,  0.00772538, -0.01171751, ..., -0.0025048 ,
       -0.00416457,  0.01363646], dtype=float32), 27: array([-0.0017543 ,  0.00111698, -0.00710752, ..., -0.00594466,
        0.00122875,  0.01089394], dtype=float32), 26: array([-0.00748668, -0.00333119, -0.01241771, ..., -0.00567468,
       -0.00183091,  0.01527699], dtype=float32), 25: array([-0.00477668, -0.00412272, -0.01340128, ..., -0.00352773,
       -0.00604033,  0.02428165], dtype=float32), 24: array([-0.01362917,  0.00023581, -0.01590318, ..., -0.00290011,
       -0.0029043 ,  0.02621616], dtype=float32), 23: arra

---

Lay Summary Simplicity

In [1]:
import os
import json
from tqdm import tqdm

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from repeng import ControlVector, ControlModel, DatasetEntry

In [3]:
os.getcwd()

'/home/mingcong/scripts'

In [4]:
with open('../../../data/colx531/biolaysumm2024_data/eLife_test.jsonl', "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

In [5]:
def baseline_model(doc):
    return doc["sections"]["Abstract"]

In [6]:
for item in data:
    sections = item["article"].split("\n")
    item["sections"] = dict(zip(item["headings"], sections))

abstracts = []
for item in tqdm(data, leave=True):
    rephrased_abstract = baseline_model(item)
    abstracts.append({"id": item["id"], "prediction": rephrased_abstract})

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 142/142 [00:00<00:00, 1323535.93it/s]


In [7]:
abstracts_labeled = ["[INST]Abstract:" + item['prediction'] + "\n Lay summary of abstract: [/INST]" for item in abstracts]

In [8]:
model_name = "BioMistral/BioMistral-7B-DARE-AWQ-QGS128-W4-GEMM"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to("cuda:0" if torch.cuda.is_available() else "cpu")
#  wrap it in a ControlModel for later.
model = ControlModel(model, list(range(-5, -18, -1)))

user_tag, asst_tag = "[INST]", "[/INST]"

You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [9]:
def make_dataset(
    template: str,
    positive_personas: list[str],
    negative_personas: list[str],
    suffix_list: list[str]
) -> list[DatasetEntry]:
    dataset = []
    for suffix in suffix_list:
        for positive_persona, negative_persona in zip(positive_personas, negative_personas):
            positive_template = template.format(persona=positive_persona)
            negative_template = template.format(persona=negative_persona)
            dataset.append(
                DatasetEntry(
                    positive=f"{user_tag} {positive_template} {asst_tag} {suffix}",
                    negative=f"{user_tag} {negative_template} {asst_tag} {suffix}",
                )
            )
    return dataset

In [18]:
import gc

In [53]:
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()
gc.collect()

0

In [10]:
simple_dataset = make_dataset(
    "{persona}.",
    ["I have an abstract from a bio-medical research paper that I would like to make more understandable for a wider audience, including those without a scientific background. Please convert the technical language into simpler terms, explain any complex concepts in a way that a layperson could understand, and provide additional background information where necessary to help clarify the relevance and significance of the research."],
    ["I have a simplified summary of findings from a bio-medical research paper that I need to be rewritten for a professional audience with a high level of expertise in this field. Please enhance the language to match the sophistication expected in a scholarly article, incorporate appropriate technical jargon."],
    abstracts_labeled,
)
model.reset()
simple_vector = ControlVector.train(model, tokenizer, simple_dataset)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:59<00:00,  6.56s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:01<00:00, 30.99it/s]


In [11]:
simple_vector.export_gguf('simple_vector.gguf')

gguf: This GGUF file is for Little Endian only


In [50]:
def generate_with_vector(
    input: str,
    vector: ControlVector,
    coeffs: tuple[float, float],
    max_new_tokens: int = 128,
    repetition_penalty: float = 1.1,
    show_baseline: bool = True,
):
    positive_coeff, negative_coeff = coeffs
    assert positive_coeff > 0
    assert negative_coeff < 0

    if user_tag not in input:
        input = f"{user_tag} {input.strip()} {asst_tag}"
    input_ids = tokenizer(input, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id, # silence warning
        "do_sample": False, # temperature=0
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }

    if show_baseline:
        print("==baseline ---------------------------------------------------")
        model.reset()
        print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())
    
    print("\n++control ---------------------------------------------------")
    model.set_control(vector, positive_coeff)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())
    
    print("\n--control ---------------------------------------------------")
    model.set_control(vector, negative_coeff)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())
    model.reset()

In [45]:
text = abstracts[0]['prediction']
text

'Acylation of diverse carbohydrates occurs across all domains of life and can be catalysed by proteins with a membrane bound acyltransferase-3 ( AT3 ) domain ( PF01757 ) . In bacteria , these proteins are essential in processes including symbiosis , resistance to viruses and antimicrobials , and biosynthesis of antibiotics , yet their structure and mechanism are largely unknown . In this study , evolutionary co-variance analysis was used to build a computational model of the structure of a bacterial O-antigen modifying acetyltransferase , OafB . The resulting structure exhibited a novel fold for the AT3 domain , which molecular dynamics simulations demonstrated is stable in the membrane . The AT3 domain contains 10 transmembrane helices arranged to form a large cytoplasmic cavity lined by residues known to be essential for function . Further molecular dynamics simulations support a model where the acyl-coA donor spans the membrane through accessing a pore created by movement of an impo

In [55]:
prompt = f"""
    Rephrase the following abstract from a medical paper to make it more accessible and understandable to non-expert audiences, commonly referred to as "lay summaries".
    [INST]Abstract: {text}
    Lay summary of abstract: [/INST]"""

In [56]:
generate_with_vector(
    prompt,
    simple_vector,
    (2.0, -1.7),
    max_new_tokens=2048,
)

==baseline ---------------------------------------------------
<s> 
    Rephrase the following abstract from a medical paper to make it more accessible and understandable to non-expert audiences, commonly referred to as "lay summaries".
    [INST]Abstract: Acylation of diverse carbohydrates occurs across all domains of life and can be catalysed by proteins with a membrane bound acyltransferase-3 ( AT3 ) domain ( PF01757 ) . In bacteria , these proteins are essential in processes including symbiosis , resistance to viruses and antimicrobials , and biosynthesis of antibiotics , yet their structure and mechanism are largely unknown . In this study , evolutionary co-variance analysis was used to build a computational model of the structure of a bacterial O-antigen modifying acetyltransferase , OafB . The resulting structure exhibited a novel fold for the AT3 domain , which molecular dynamics simulations demonstrated is stable in the membrane . The AT3 domain contains 10 transmembrane helic

In [57]:
generate_with_vector(
    prompt,
    simple_vector,
    (1.0, -1.0),
    max_new_tokens=2048,
)

==baseline ---------------------------------------------------
<s> 
    Rephrase the following abstract from a medical paper to make it more accessible and understandable to non-expert audiences, commonly referred to as "lay summaries".
    [INST]Abstract: Acylation of diverse carbohydrates occurs across all domains of life and can be catalysed by proteins with a membrane bound acyltransferase-3 ( AT3 ) domain ( PF01757 ) . In bacteria , these proteins are essential in processes including symbiosis , resistance to viruses and antimicrobials , and biosynthesis of antibiotics , yet their structure and mechanism are largely unknown . In this study , evolutionary co-variance analysis was used to build a computational model of the structure of a bacterial O-antigen modifying acetyltransferase , OafB . The resulting structure exhibited a novel fold for the AT3 domain , which molecular dynamics simulations demonstrated is stable in the membrane . The AT3 domain contains 10 transmembrane helic

In [78]:
generate_with_vector(
    prompt,
    simple_vector,
    (0.5, -0.5),
    max_new_tokens=2048,
)

==baseline ---------------------------------------------------
<s> 
    Rephrase the following abstract from a medical paper to make it more accessible and understandable to non-expert audiences, commonly referred to as "lay summaries".
    [INST]Abstract: Acylation of diverse carbohydrates occurs across all domains of life and can be catalysed by proteins with a membrane bound acyltransferase-3 ( AT3 ) domain ( PF01757 ) . In bacteria , these proteins are essential in processes including symbiosis , resistance to viruses and antimicrobials , and biosynthesis of antibiotics , yet their structure and mechanism are largely unknown . In this study , evolutionary co-variance analysis was used to build a computational model of the structure of a bacterial O-antigen modifying acetyltransferase , OafB . The resulting structure exhibited a novel fold for the AT3 domain , which molecular dynamics simulations demonstrated is stable in the membrane . The AT3 domain contains 10 transmembrane helic

In [79]:
generate_with_vector(
    prompt,
    simple_vector,
    (0.1, -0.2),
    max_new_tokens=2048,
)

==baseline ---------------------------------------------------
<s> 
    Rephrase the following abstract from a medical paper to make it more accessible and understandable to non-expert audiences, commonly referred to as "lay summaries".
    [INST]Abstract: Acylation of diverse carbohydrates occurs across all domains of life and can be catalysed by proteins with a membrane bound acyltransferase-3 ( AT3 ) domain ( PF01757 ) . In bacteria , these proteins are essential in processes including symbiosis , resistance to viruses and antimicrobials , and biosynthesis of antibiotics , yet their structure and mechanism are largely unknown . In this study , evolutionary co-variance analysis was used to build a computational model of the structure of a bacterial O-antigen modifying acetyltransferase , OafB . The resulting structure exhibited a novel fold for the AT3 domain , which molecular dynamics simulations demonstrated is stable in the membrane . The AT3 domain contains 10 transmembrane helic

Evaluate

In [58]:
from bert_score import score
from rouge_score import rouge_scorer

In [81]:
pred_baseline = ['Scientists have found that many different types of living things add chemical groups called acyls to sugars. These acyls are added by proteins that have a special part called the acyltransferase-3 (AT3) domain. These proteins are important for many different processes in bacteria, such as forming relationships with other organisms, protecting against viruses and drugs, and making antibiotics. However, scientists do not know much about how these proteins work or what they look like. To learn more about them, scientists used a computer program to create a model of the structure of one of these proteins, called OafB. They found that this protein has a unique shape and that it works by taking an acyl group from inside the cell and putting it outside the cell near the place where it might be used. Scientists also found that another part of the protein, called the SGNH domain, helps the acyl group get put outside the cell. By understanding how these proteins work, scientists may be able to find ways to stop them from working, which could help fight against harmful bacteria.']
pred_plus_2 = ["""Think of people who make food food! Just like making food, our bodies need tiny things to make everything inside our bodies. To make everything in our bodies, we need tiny things that people make. These tiny things are like food for our bodies. When our bodies make food, sometimes we need special food to help our bodies make food. This special food is like food that makes food. So, when our bodies make food, we need special food to help our bodies make food. Sometimes, our bodies have special people who make food to help our bodies make food. These people are like people who make food. And just like people who make food, the people who make food in our bodies are like people who make food. We think that everyone's food is like food that helps people make food."""]
pred_plus_1 = ["""Scientists have found out how bacteria make things like antibiotics and protect themselves from viruses. They made a computer model of how bacteria do this and found out how they work. This helps scientists find ways to stop bacteria from making things that can hurt us."""]
pred_plus_0p5 = ["""Scientists have found a way to create a computer model of a protein that helps bacteria make things like antibiotics. This protein has a special part inside it that helps it work. Scientists used a special computer program to figure out how this part works. They also figured out how the rest of the protein works. This information will help scientists find ways to stop bacteria from making things like antibiotics."""]
pred_plus_0p1 = ["""Scientists have discovered that a type of protein called acyltransferase-3 (AT3) is found in many different types of living organisms. These proteins help to add chemical groups called acyl groups to other molecules called carbohydrates. In bacteria, these proteins play important roles in things like making antibiotics, protecting against viruses and drugs, and forming relationships with other organisms. However, scientists do not know much about how these proteins work or what they look like. To learn more about them, researchers used a special computer program to create a model of what the protein might look like based on its genetic code. They then used another computer program to simulate what might happen when the protein interacts with other molecules. The results showed that the protein has a unique shape and that it may work by allowing acyl groups to pass through a hole in the middle of the protein. This allows the protein to add acyl groups to other molecules on the outside of the cell. Researchers also found that the protein works together with another type of protein called SGNH, which helps to find molecules to add acyl groups to. Overall, this research provides a better understanding of how these important proteins work and could help scientists develop new ways to fight bacterial infections."""]
pred_minus_1p7 = ["""Acylation of various carbohydrates is a common phenomenon observed across all domains of life and is facilitated by a variety of protein structures. However, the structure and mechanism of such acyltransferases have not been thoroughly investigated, particularly in the case of bacterial O-antigen modifying acetyltransferase, OafB. This study employed evolutionary co-variance analysis to construct a computational model of the structure of OafB, which exhibits a novel fold for the AT3 domain. Molecular dynamics simulations were utilized to demonstrate the stability of the AT3 domain within the membrane. The AT3 domain features ten transmembrane helices, forming a large cytoplasmic cavity, and is proposed to present the acetyl group close to the likely catalytic residues on the extracytoplasmic surface via a pore created by the movement of an important loop capping the inner cavity. Additionally, limited but significant interactions with the fused SGNH domain are identified, and modelling suggests this domain may be mobile and capable of accepting acyl-groups from the AT3 and subsequently reaching acceptor substrates. Consequently, this new general model of AT3 function provides a framework for the development of inhibitors that could abrogate critical functions of bacterial pathogens."""]
pred_minus_1 = ["""The acetylation of various carbohydrates is a common process observed across all domains of life. This process can be catalyzed by proteins containing a membrane-bound acyltransferase-3 (AT3) domain (PF01757). However, the structure and mechanism of these proteins remain largely unknown, particularly in the context of bacterial O-antigen modifying acetyltransferases such as OafB. To address this gap, a computational model of the structure of OafB was developed using evolutionary co-variance analysis. The resulting structure exhibits a novel fold for the AT3 domain, which was further validated through molecular dynamics simulations demonstrating its stability in the membrane. The AT3 domain contains 10 transmembrane helices arranged to form a large cytoplasmic cavity lined by residues known to be essential for function. Molecular dynamics simulations suggest that the acyl-CoA donor spans the membrane through accessing a pore created by movement of an important loop capping the inner cavity, enabling OafB to present the acetyl group close to the likely catalytic residues on the extracytoplasmic surface. Limited but important interactions with the fused SGNH domain in OafB were also identified, and modeling suggests this domain is mobile and can both accept acyl-groups from the AT3 and then reach beyond the membrane to reach acceptor substrates. Overall, this new general model of AT3 function provides a framework for the development of inhibitors that could abrogate critical functions of bacterial pathogens."""]
pred_minus_0p5 = ["""This study presents a computational model of the structure of a bacterial protein responsible for acetylation of diverse carbohydrates. The resulting structure exhibits a novel fold for the AT3 domain, which is stable in the membrane. The AT3 domain contains 10 transmembrane helices arranged to form a large cytoplasmic cavity lined by residues known to be essential for function. Molecular dynamics simulations suggest that the acyl-CoA donor spans the membrane through accessing a pore created by movement of an important loop capping the inner cavity, enabling the protein to present the acetyl group close to the likely catalytic residues on the extracytoplasmic surface. Limited but important interactions with the fused SGNH domain in the protein are identified, and modeling suggests this domain is mobile and can both accept acyl-groups from the AT3 and then reach beyond the membrane to reach acceptor substrates. Together, this new general model of AT3 function provides a framework for the development of inhibitors that could abrogate critical functions of bacterial pathogens."""]
pred_minus_0p2 = ["""This study presents a computational model of the structure of a bacterial protein called OafB. OafB is responsible for adding an acetyl group to certain sugars found in the cell wall of bacteria. The researchers used a technique called evolutionary co-variance analysis to create a model of OafB's structure. They also performed molecular dynamics simulations to test the stability of the model in the membrane. The results showed that OafB has a unique fold for its acyltransferase-3 (AT3) domain, which contains 10 transmembrane helices arranged to form a large cytoplasmic cavity. The researchers also discovered that OafB uses a specific loop to allow the acyl-CoA donor to pass through the membrane and present the acetyl group to the catalytic residues on the extracytoplasmic surface. Additionally, they found that OafB interacts with a fused SGNH domain, which may help the protein to reach out and modify other molecules beyond the membrane. Overall, this study provides a new general model of AT3 function that could be useful for developing inhibitors to target critical functions of bacterial pathogens."""]

refs = ['Acylation of diverse carbohydrates occurs across all domains of life and can be catalysed by proteins with a membrane bound acyltransferase-3 ( AT3 ) domain ( PF01757 ) . In bacteria , these proteins are essential in processes including symbiosis , resistance to viruses and antimicrobials , and biosynthesis of antibiotics , yet their structure and mechanism are largely unknown . In this study , evolutionary co-variance analysis was used to build a computational model of the structure of a bacterial O-antigen modifying acetyltransferase , OafB . The resulting structure exhibited a novel fold for the AT3 domain , which molecular dynamics simulations demonstrated is stable in the membrane . The AT3 domain contains 10 transmembrane helices arranged to form a large cytoplasmic cavity lined by residues known to be essential for function . Further molecular dynamics simulations support a model where the acyl-coA donor spans the membrane through accessing a pore created by movement of an important loop capping the inner cavity , enabling OafB to present the acetyl group close to the likely catalytic resides on the extracytoplasmic surface . Limited but important interactions with the fused SGNH domain in OafB are identified , and modelling suggests this domain is mobile and can both accept acyl-groups from the AT3 and then reach beyond the membrane to reach acceptor substrates . Together this new general model of AT3 function provides a framework for the development of inhibitors that could abrogate critical functions of bacterial pathogens . ']

In [82]:
all_preds = [pred_baseline, pred_plus_2, pred_plus_1, pred_plus_0p5, pred_plus_0p1, pred_minus_1p7, pred_minus_1, pred_minus_0p5, pred_minus_0p2]

In [85]:
all_pairs = [(pred, refs) for pred in all_preds]

In [80]:
def batch_process_decorator(func):
    def wrapper(list_of_pairs):
        results = []
        for preds, refs in list_of_pairs:
            result = func(preds, refs)
            results.append(result)
        return results
    return wrapper

Relevance

In [88]:
@batch_process_decorator
def calc_rouge(preds, refs):
    # Get ROUGE F1 scores
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeLsum"], use_stemmer=True, split_summaries=True
    )
    scores = [scorer.score(p, refs[i]) for i, p in enumerate(preds)]
    return (
        np.mean([s["rouge1"].fmeasure for s in scores]),
        np.mean([s["rouge2"].fmeasure for s in scores]),
        np.mean([s["rougeLsum"].fmeasure for s in scores]),
    )

In [61]:
def calc_bertscore(preds, refs):
    # Get BERTScore F1 scores
    P, R, F1 = score(preds, refs, lang="en", verbose=True, device="cuda:0")
    return np.mean(F1.tolist())

In [90]:
rouge_result = calc_rouge(all_pairs)

In [100]:
index_names = ['pred_baseline', 'pred_plus_2', 'pred_plus_1', 'pred_plus_0p5', 'pred_plus_0p1', 'pred_minus_1p7', 'pred_minus_1', 'pred_minus_0p5', 'pred_minus_0p2']

In [101]:
pd.DataFrame(rouge_result, columns = ["rouge1", "rouge2", "rougeLsum"], index=index_names)

Unnamed: 0,rouge1,rouge2,rougeLsum
pred_baseline,0.343826,0.087591,0.31477
pred_plus_2,0.106952,0.0,0.106952
pred_plus_1,0.115523,0.021818,0.108303
pred_plus_0p5,0.172185,0.033333,0.152318
pred_plus_0p1,0.36689,0.067416,0.33557
pred_minus_1p7,0.732861,0.517815,0.671395
pred_minus_1,0.854664,0.723312,0.824295
pred_minus_0p5,0.79198,0.740554,0.776942
pred_minus_0p2,0.619512,0.308824,0.580488


In [92]:
import numpy as np
import pandas as pd

In [63]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/mingcong/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Readability

In [106]:
def batch_process_decorator(func):
    def wrapper(list_of_pairs):
        results = []
        for preds in list_of_pairs:
            result = func(preds)
            results.append(result)
        return results
    return wrapper

In [107]:
@batch_process_decorator
def calc_readability(preds):
    fkgl_scores = []
    cli_scores = []
    dcrs_scores = []
    for pred in preds:
        fkgl_scores.append(textstat.flesch_kincaid_grade(pred))
        cli_scores.append(textstat.coleman_liau_index(pred))
        dcrs_scores.append(textstat.dale_chall_readability_score(pred))
    return np.mean(fkgl_scores), np.mean(cli_scores), np.mean(dcrs_scores)

In [74]:
import textstat

In [108]:
readability_result = calc_readability(all_preds)

In [109]:
pd.DataFrame(readability_result, columns = ["fkgl", "cli", "dcrs"], index=index_names)

Unnamed: 0,fkgl,cli,dcrs
pred_baseline,9.7,11.32,8.25
pred_plus_2,3.6,6.77,1.2
pred_plus_1,6.9,10.14,7.83
pred_plus_0p5,7.6,9.97,7.68
pred_plus_0p1,10.5,12.18,8.52
pred_minus_1p7,17.4,16.66,12.33
pred_minus_1,15.4,16.19,11.43
pred_minus_0p5,15.3,14.92,11.19
pred_minus_0p2,11.8,13.29,10.68


Factuality

In [110]:
def calc_alignscore(preds, docs):
    alignscorer = AlignScore(
        model="distilroberta-base",
        batch_size=8,
        device="cuda:0",
        ckpt_path="../../models/AlignScore-base.ckpt",
        evaluation_mode="nli_sp",
    )
    return np.mean(alignscorer.score(contexts=docs, claims=preds))

In [111]:
def cal_summac(preds, docs):
    model_conv = SummaCConv(
        models=["vitc-base"],
        bins="percentile",
        granularity="sentence",
        nli_labels="e",
        device="cuda",
        start_file="default",
        agg="mean",
    )
    return np.mean(model_conv.score(docs, preds)["scores"])

In [119]:
docs = [data[0]["article"]]

In [121]:
from alignscore import AlignScore

In [122]:
print(calc_alignscore(pred_baseline, docs))
print(calc_alignscore(pred_plus_control, docs))
print(calc_alignscore(pred_minus_control, docs))

FileNotFoundError: [Errno 2] No such file or directory: '/home/mingcong/scripts/../../models/AlignScore-base.ckpt'

In [124]:
from summac.model_summac import SummaCConv

In [125]:
print(cal_summac(pred_baseline, docs))
print(cal_summac(pred_plus_control, docs))
print(cal_summac(pred_minus_control, docs))

--2024-03-29 14:13:33--  https://github.com/tingofurro/summac/raw/master/summac_conv_vitc_sent_perc_e.bin
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tingofurro/summac/master/summac_conv_vitc_sent_perc_e.bin [following]
--2024-03-29 14:13:33--  https://raw.githubusercontent.com/tingofurro/summac/master/summac_conv_vitc_sent_perc_e.bin
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1811 (1.8K) [application/octet-stream]
Saving to: ‘summac_conv_vitc_sent_perc_e.bin’

     0K .                                                     100% 5

<All keys matched successfully>


tokenizer_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
  histograms = torch.FloatTensor(histograms).to(self.device)


0.3737916350364685
<All keys matched successfully>
0.3673260509967804
<All keys matched successfully>
0.5895997881889343
