In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_name = "openai-community/gpt2"  # GPT-2 checkpoint identifier; this is the LLM used for the aggregator

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)  # converts raw text into tokens that the model can understand
model = AutoModelForCausalLM.from_pretrained(model_name)  # transformer-based model loaded with a causal language modeling head

# Create a text-generation pipeline using the loaded model.
# The pipeline object is shared at module level and is called by aggregate_response.
aggregator_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Temperatures at or below this value are treated as a request for greedy
# (deterministic) decoding instead of being used to scale the logits.
_MIN_SAMPLING_TEMPERATURE = 1e-6


def _generate_completion(prompt: str, max_length: int, temperature: float) -> str:
    """Run the module-level ``aggregator_pipeline`` on *prompt* and return only the new text."""
    generation_kwargs = {
        "max_length": max_length,
        # Ask the pipeline for the continuation only, instead of stripping the
        # prompt afterwards with a substring match that may silently fail.
        "return_full_text": False,
    }
    if temperature > _MIN_SAMPLING_TEMPERATURE:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
    else:
        # A near-zero temperature is a request for deterministic output; greedy
        # decoding gives that without dividing the logits by a tiny number.
        generation_kwargs["do_sample"] = False

    outputs = aggregator_pipeline(prompt, **generation_kwargs)
    # The pipeline returns a list of dictionaries; take the first candidate.
    return outputs[0]["generated_text"].strip()


def aggregate_response(user_prompt: str, expert_outputs: list[str], max_length: int = 512, temperature: float = 1e-10) -> str:
    """
    Aggregate the responses of several expert models into one answer.

    The work is done in two passes over the module-level ``aggregator_pipeline``:
    first the expert responses are summarized, then the summary and the
    original user query are combined into a final aggregated answer.

    Args:
        user_prompt (str): The original user prompt.
        expert_outputs (list[str]): A list of responses from expert models.
        max_length (int): Forwarded to the pipeline as ``max_length`` for both
            generation passes. In Hugging Face generation this limit counts
            the prompt tokens as well as the newly generated ones.
        temperature (float): Sampling temperature for generation. Values at or
            below ``_MIN_SAMPLING_TEMPERATURE`` (including the default) select
            greedy, deterministic decoding instead of sampling.

    Returns:
        str: The generated aggregated answer, without the prompt text and with
        surrounding whitespace removed.
    """
    # Combine expert responses into a single context string; every response
    # (including the last one) is followed by the NEWPROMPT separator line.
    combined_expert_context = "".join(
        expert_output + "\nNEWPROMPT\n" for expert_output in expert_outputs
    )

    # First pass: summarize the expert responses.
    summarization_prompt = f"""
You are an expert text summarization LLM that condenses long text into a well-structured, detailed summary, while preserving key insights.
Each prompt is seperated by an indentation, then a line that just says 'NEWPROMPT', and then another indentation.
Analyze the given text, extract the most important information, and structure them in a clear and logical manner.

Text to Summarize:
{combined_expert_context}

Provide a clear and concise summary of the given text:
"""
    summarized_text = _generate_completion(summarization_prompt, max_length, temperature)

    # Second pass: combine the user query with the summary of expert responses.
    aggregator_prompt = f"""
You are an aggregator LLM designed to analyze the given user query, and the summarization of responses of experts to the same query. The inputs you
are given, are intended to make sure you produce a meaningful and helpful response.Below is the original user query and responses from several experts.
Synthesize the information to produce a single, clear aggregated answer.

User Query:
{user_prompt}

Summarization of Expert Responses:
{summarized_text}

Provide a clear and concise aggregated answer:
"""
    return _generate_completion(aggregator_prompt, max_length, temperature)

# Example usage: runs one aggregation end to end when the file is executed as a script.
if __name__ == "__main__":
    # Original user prompt
    user_query = "How can I optimize the performance of my deep learning models in production?"

    # Sample responses from expert models. The router should input these responses into the aggregator at a later step
    expert_responses = [
        "Consider using model quantization and pruning techniques to reduce model size and inference latency.",
        "Implementing efficient serving architectures such as TensorRT or ONNX Runtime can boost performance.",
        "Optimizing data pipelines and leveraging hardware accelerators like GPUs or TPUs will further improve throughput."
    ]

    # Get the aggregated answer from aggregate_response, using its default max_length and temperature.
    final_answer = aggregate_response(user_query, expert_responses)
    print("Aggregated Answer:\n", final_answer)


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Aggregated Answer:
 New PROMPT

New PROMPT

New PROMPT

New PROMPT

New PROMPT

New PROMPT

New PROMPT

New PROMPT

New PROMPT

New PROMPT

New
