In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, pipeline, PreTrainedTokenizer, AutoModelForCausalLM
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers.util import semantic_search

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prompt
# Preamble placed at the very top of every prompt built below.
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

# To be added as special tokens
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
RESPONSE_KEY_NL = RESPONSE_KEY + "\n"


# Prompt template without any context. The only placeholder left for a later
# ``.format(...)`` call is ``{instruction}``. The template ends with the response key,
# so the model's job is to provide the completion that follows it (i.e. the response).
PROMPT_FOR_GENERATION_FORMAT = (
    f"{INTRO_BLURB}\n"
    "\n"
    f"{INSTRUCTION_KEY}\n"
    "{instruction}\n"
    "\n"
    f"{RESPONSE_KEY}\n"
)


# Same as above, with an additional input section between the instruction and the
# response key. Placeholders left for ``.format(...)``: ``{instruction}`` and ``{context}``.
PROMPT_FOR_GENERATION_FORMAT_WITH_INPUT = (
    f"{INTRO_BLURB}\n"
    "\n"
    f"{INSTRUCTION_KEY}\n"
    "{instruction}\n"
    "\n"
    f"{INPUT_KEY}\n"
    "{context}\n"
    "\n"
    f"{RESPONSE_KEY}\n"
)

In [3]:
def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
    """Return the ID of the first token that ``key`` encodes to.

    The key is expected to be a string that the tokenizer treats as a single special token
    (e.g. "### Instruction:" or "### End"), so that encoding it yields exactly one ID.
    If the key encodes to more than one ID, a warning is printed and the first ID is
    returned anyway; this function does not raise in that case.

    Args:
        tokenizer (PreTrainedTokenizer): the tokenizer
        key (str): the key to convert to a single token
    Raises:
        ValueError: if the tokenizer produced no token IDs at all for ``key``
    Returns:
        int: the first token ID produced for the given key
    """
    token_ids = tokenizer.encode(key)
    if not token_ids:
        # Fail with a clear message instead of an opaque IndexError on token_ids[0].
        raise ValueError(f"Tokenizer produced no token IDs for '{key}'")
    if len(token_ids) > 1:
        print(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]

def preprocess(tokenizer, instruction_text, context_text=None):
    """Build the generation prompt for a question and tokenize it.

    A fixed guard sentence (answer only from the provided input) is prepended to
    ``instruction_text``. When ``context_text`` is truthy the prompt template with an
    input section is used, otherwise the instruction-only template.

    Args:
        tokenizer: callable invoked as ``tokenizer(prompt, return_tensors="pt")``.
        instruction_text (str): the user's question.
        context_text (str, optional): supporting text placed in the prompt's input section.
    Returns:
        The tokenizer's output, extended with the keys ``"prompt_text"``,
        ``"instruction_text"`` and ``"context_text"``.
    """
    guard_sentence = "Answer the following question only with the provided input. If no answer is found tell that you cannot answer based on this context. "
    full_instruction = guard_sentence + instruction_text

    if not context_text:
        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=full_instruction)
    else:
        prompt_text = PROMPT_FOR_GENERATION_FORMAT_WITH_INPUT.format(
            instruction=full_instruction,
            context=context_text,
        )

    encoded = tokenizer(prompt_text, return_tensors="pt")
    # Carry the raw strings along with the tensors.
    for extra_key, extra_value in (
        ("prompt_text", prompt_text),
        ("instruction_text", instruction_text),
        ("context_text", context_text),
    ):
        encoded[extra_key] = extra_value
    return encoded

def forward(model, tokenizer, model_inputs, max_length=256):
    """Run ``model.generate`` on tokenized inputs and group the output per input row.

    Args:
        model: object exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: object whose ``pad_token_id`` is forwarded to ``generate``.
        model_inputs: mapping with an ``"input_ids"`` tensor and, optionally, an
            ``"attention_mask"`` tensor. The mapping is not modified.
        max_length (int): forwarded unchanged to ``generate`` as ``max_length``.
            NOTE(review): in Hugging Face ``generate`` this limit presumably includes the
            prompt tokens, so long prompts leave little room for the answer - confirm.
    Returns:
        dict: ``"generated_sequence"`` is the generated tensor reshaped to
        ``(in_b, out_b // in_b, ...)`` where ``in_b`` is the input batch size and ``out_b``
        the number of generated rows; ``"input_ids"`` is the prompt tensor, or ``None``
        when the prompt had zero tokens.
    """
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs.get("attention_mask", None)

    if input_ids.shape[1] == 0:
        # Zero-length prompt: let generate() start without any input tensors.
        input_ids = None
        attention_mask = None
        in_b = 1
    else:
        in_b = input_ids.shape[0]

    # Only move tensors that actually exist; calling .to() on None would raise.
    generated_sequence = model.generate(
        input_ids=None if input_ids is None else input_ids.to(model.device),
        attention_mask=None if attention_mask is None else attention_mask.to(model.device),
        pad_token_id=tokenizer.pad_token_id,
        max_length=max_length,
    )

    out_b = generated_sequence.shape[0]
    generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])

    return {
        "generated_sequence": generated_sequence,
        "input_ids": input_ids,
    }


def postprocess(tokenizer, model_outputs):
    """Extract the response text from generated token sequences.

    Only the first entry of ``model_outputs["generated_sequence"]`` is processed. For each
    sequence in it, the tokens strictly between the response key token and the first end
    key token that follows it are decoded.

    Args:
        tokenizer: tokenizer used to look up the key token IDs and to decode the response.
        model_outputs: mapping with a ``"generated_sequence"`` tensor, as returned by
            ``forward``.
    Returns:
        list[dict]: one ``{"generated_text": ...}`` record per sequence. The value is the
        decoded, stripped response; a fixed apology string when no end key follows the
        response key; or ``None`` when the response key is not present at all.
    """
    response_key_token_id = get_special_token_id(tokenizer, RESPONSE_KEY_NL)
    end_key_token_id = get_special_token_id(tokenizer, END_KEY)

    # Move to the CPU and convert to plain Python lists so list.index() can be used.
    sequences = model_outputs["generated_sequence"][0].cpu().numpy().tolist()

    records = []
    for sequence in sequences:
        decoded = None

        try:
            response_pos = sequence.index(response_key_token_id)
        except ValueError:
            print(f"Could not find response key {response_key_token_id} in: {sequence}")
            response_pos = None

        # Compare against None explicitly: position 0 is a valid match but is falsy.
        if response_pos is not None:
            try:
                # Only an end key that comes after the response key can terminate it.
                end_pos = sequence.index(end_key_token_id, response_pos + 1)
            except ValueError:
                print("Could not find end key, the output is truncated!")
                end_pos = None

            if end_pos is not None:
                decoded = tokenizer.decode(sequence[response_pos + 1 : end_pos], skip_special_tokens=True).strip()
            else:
                decoded = "Sorry i cannot answer this question"

        records.append({"generated_text": decoded})
    return records

def get_model_tokenizer(pretrained_model_name_or_path):
    """Load a causal language model and its tokenizer from the same location.

    The model is loaded with ``torch.bfloat16`` weights and its token embeddings are
    resized to the tokenizer's length.

    Args:
        pretrained_model_name_or_path: identifier or path forwarded to both
            ``from_pretrained`` calls.
    Returns:
        tuple: ``(model, tokenizer)``
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=torch.bfloat16,
    )
    # Keep the embedding matrix in step with the tokenizer's vocabulary size.
    vocabulary_size = len(tokenizer)
    model.resize_token_embeddings(vocabulary_size)
    return model, tokenizer

In [4]:
# Load the LLM Dolly v2 3b model with its tokenizer
LLM_model, LLM_tokenizer = get_model_tokenizer(pretrained_model_name_or_path="./FineTunedDollyV2")
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
LLM_model = LLM_model.to("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Load the similarity model and its tokenizer from one shared checkpoint name
SIMILARITY_CHECKPOINT = 'sentence-transformers/all-MiniLM-L12-v2'
similarity_tokenizer = AutoTokenizer.from_pretrained(SIMILARITY_CHECKPOINT)
similarity_model = AutoModel.from_pretrained(SIMILARITY_CHECKPOINT)

In [6]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, counting only unmasked positions.

    Args:
        model_output: indexable whose first element holds the token embeddings
            (presumably shaped (batch, tokens, hidden) - confirm against the model).
        attention_mask: mask tensor; it is given a trailing axis and expanded to the
            embeddings' size, so positions with value 0 contribute nothing to the sum.
    Returns:
        Tensor of masked sums divided by the per-row mask totals. The divisor is clamped
        to at least 1e-9 so fully masked rows do not divide by zero.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_sum = (embeddings * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / mask_total

In [7]:
# Read the passages CSV and embed every passage once, up front
df = pd.read_csv('./new_passages.csv')
passages = df["passages"].to_list()
encoded_input = similarity_tokenizer(passages, padding=True, truncation=True, return_tensors='pt')
# Inference only, so run the model without tracking gradients
with torch.no_grad():
    model_output = similarity_model(**encoded_input)
# Pool the token embeddings into one vector per passage and hand them over as a NumPy array
pooled_passages = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings = pooled_passages.detach().numpy()
print("INFO:     Loaded knowledge base!")

INFO:     Loaded knowledge base!


In [42]:
query = 'Can we polish marble with a material?'

# Embed the query with the same tokenizer, model and pooling used for the passages.
# This is inference only, so skip gradient tracking just like the passage embedding cell does.
tokenized_query = similarity_tokenizer(query, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    embedded_query = similarity_model(**tokenized_query)
    question_embeddings = mean_pooling(embedded_query, tokenized_query['attention_mask'])
question_embeddings = question_embeddings.numpy()

# Look up the two best-matching passages and join them into a single context string.
retrieved = semantic_search(question_embeddings, sentence_embeddings, top_k=2)
context = '\n'.join(passages[row['corpus_id']] for row in retrieved[0])

# Build the prompt, generate with the LLM and extract the response text.
pre_process_result = preprocess(LLM_tokenizer, query, context)
model_result = forward(LLM_model, LLM_tokenizer, pre_process_result)
final_output = postprocess(LLM_tokenizer, model_result)
response = final_output[0]['generated_text']
print(response)

Construction marble is a stone composed of calcite, dolomite or serpentine. It is harder, more glossy and stain resistant than the original surface.
