In [34]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, pipeline

# Checkpoint identifiers: `model_name` backs the summarization pipeline,
# `model_name2` backs the text-generation ("aggregator") pipeline.
model_name = "facebook/bart-large-cnn"
model_name2 = "openai-community/gpt2"

# Load the seq2seq checkpoint and its tokenizer, then wrap them in a
# summarization pipeline (framework="pt" requests the PyTorch backend).
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
summarizer_pipeline = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    framework="pt"
)

# Load the causal-LM checkpoint and its tokenizer. `tokenizer2` is also read
# directly by aggregate_response (for its eos_token_id).
model2 = AutoModelForCausalLM.from_pretrained(model_name2)
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)

# Create a text-generation pipeline using the causal-LM model loaded above
aggregator_pipeline = pipeline(
    "text-generation",
    model= model2,
    tokenizer=tokenizer2,
)

Device set to use cpu
Device set to use cpu


In [42]:
def generate_summary(text, max_length=130, min_length=30, temperature=1.0):
    """Summarize one or more texts with the module-level ``summarizer_pipeline``.

    Args:
        text: Either a single string or an iterable of strings. Multiple
            strings are joined with newlines before being summarized.
        max_length: Upper bound forwarded to the pipeline as ``max_length``.
            It is lowered for short inputs (see the heuristic below).
        min_length: Lower bound forwarded to the pipeline as ``min_length``.
        temperature: Accepted for backward compatibility only. The summary is
            decoded with ``do_sample=False``, where a sampling temperature has
            no effect, so it is not forwarded to the pipeline.

    Returns:
        str: The ``summary_text`` of the first pipeline result, or ``""`` when
        the input holds no non-whitespace text.
    """
    # A bare string is itself iterable, so "\n".join(text) would insert a
    # newline between every character; treat it as one document instead.
    if isinstance(text, str):
        text = [text]
    cleaned_text = "\n".join(text).strip()

    # Nothing to summarize: return early instead of invoking the model.
    if not cleaned_text:
        return ""

    # Heuristic: for inputs shorter than max_length, shrink the bound so the
    # summary is not allowed to be longer than its source, while keeping it
    # strictly above min_length. NOTE: this compares a *character* count with
    # a bound the pipeline interprets in tokens, so it is only a rough proxy.
    text_len = len(cleaned_text)
    if text_len < max_length:
        new_max_length = max(text_len - 10, min_length + 5)
    else:
        new_max_length = max_length

    # Generate summary. truncation=True clips inputs that exceed the model's
    # maximum input length instead of letting the forward pass fail.
    summary = summarizer_pipeline(
        cleaned_text,
        max_length=new_max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

In [44]:
def aggregate_response(user_prompt: str, summarized_text: str, max_length: int = 130, min_length: int=30, temperature: float = 0.7) -> str:
    """Generate an answer to ``user_prompt`` from a summary of expert responses.

    Builds an instruction prompt ending in an ``[ANSWER_START]`` marker, samples
    a continuation from the module-level ``aggregator_pipeline`` and returns the
    cleaned-up continuation.

    Args:
        user_prompt: The original user query, embedded verbatim in the prompt.
        summarized_text: Summary of the expert responses, embedded verbatim.
        max_length: Maximum number of *newly generated* tokens.
        min_length: Minimum number of *newly generated* tokens.
        temperature: Sampling temperature (sampling is always enabled).

    Returns:
        str: The generated answer with surrounding whitespace removed. If the
        model emits the ``[ANSWER_START]`` marker again, the answer is cut off
        just before it.
    """
    # Construct the aggregator prompt
    aggregator_prompt = f"""
You are given the user's prompt, along with a summarization of expert responses to this prompt. Your task is to create a
helpful response to the user's prompt based on the summary of the expert responses.


User Query:
{user_prompt}

Summarization of Expert Responses:
{summarized_text}

[ANSWER_START]
"""

    # Generate the aggregated answer.
    # - max_new_tokens / min_new_tokens bound only the continuation. The total
    #   `max_length` / `min_length` limits count prompt tokens as well, and the
    #   prompt's *character* length is not a token count, so deriving a total
    #   limit from len(aggregator_prompt) yields a far larger budget than asked.
    # - return_full_text=False makes the pipeline return just the continuation,
    #   so the answer does not have to be located inside the echoed prompt
    #   (which fails when the inputs themselves contain the marker).
    # - pad_token_id reuses the tokenizer's EOS id as the padding id.
    generated_output = aggregator_pipeline(
        aggregator_prompt,
        max_new_tokens=max_length,
        min_new_tokens=min_length,
        temperature=temperature,
        pad_token_id=tokenizer2.eos_token_id,
        do_sample=True,
        return_full_text=False,
    )

    # Extract the generated text; stop at a repeated marker, if any.
    continuation = generated_output[0]['generated_text']
    aggregated_answer = continuation.split("[ANSWER_START]")[0].strip()

    return aggregated_answer

In [None]:
def final_answer(user_prompt: str, expert_outputs: list[str], max_length: int = 130, min_length: int=30, temperature: float = 1.0) -> str:
    """
    Turn several expert responses into one answer for the user.

    Runs two stages: the expert responses are first condensed by
    ``generate_summary``, then ``aggregate_response`` writes the final answer
    from the user prompt and that summary.

    Args:
        user_prompt (str): The original user prompt.
        expert_outputs (list[str]): A list of responses from expert models.
        max_length (int): Length limit forwarded to both stages.
        min_length (int): Minimum length forwarded to both stages.
        temperature (float): Forwarded to the summarization stage only; the
            aggregation stage is always called with a temperature of 0.7.

    Returns:
        str: The answer produced by the aggregation stage.
    """
    # Stage 1: condense the expert responses.
    condensed = generate_summary(
        expert_outputs,
        max_length,
        min_length,
        temperature,
    )

    # Stage 2: answer the user from the condensed text (fixed temperature 0.7).
    return aggregate_response(user_prompt, condensed, max_length, min_length, 0.7)

# Example usage:
if __name__ == "__main__":
    user_query = "How can I optimize the performance of my deep learning models in production?"
    expert_responses = [
        "Optimizing data pipelines and leveraging hardware accelerators like GPUs or TPUs will further improve throughput.",
        "Consider using model quantization and pruning techniques to reduce model size and inference latency.",
        "Implementing efficient serving architectures such as TensorRT or ONNX Runtime can boost performance."
    ]

    # Store the result under its own name. Assigning it to `final_answer`
    # would rebind the function to a string, so any later call to
    # final_answer(...) in the same kernel would raise
    # "TypeError: 'str' object is not callable".
    aggregated_answer = final_answer(user_query, expert_responses)
    print("Aggregated Answer:\n", aggregated_answer)