In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, pipeline

model_name = "facebook/bart-large-cnn"
model_name2 = "openai-community/gpt2"

# Create a text-generation pipeline using the loaded model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
summarizer_pipeline = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    framework="pt"
)

model2 = AutoModelForCausalLM.from_pretrained(model_name2)
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)

# Create a text-generation pipeline using the loaded model
aggregator_pipeline = pipeline(
    "text-generation",
    model= model2,
    tokenizer=tokenizer2,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [2]:
def generate_summary(text, max_length=130, min_length=30, temperature=1.0):
    # Clean and prepare input text
    joined_responses = "\n".join(text)
    cleaned_text = joined_responses.strip()

    new_max_length = min(max_length, len(cleaned_text))
    if len(cleaned_text) < max_length:
      new_max_length = max(len(cleaned_text) - 10, min_length + 5)

    # Generate summary
    summary = summarizer_pipeline(
        cleaned_text,
        max_length=new_max_length,
        min_length=min_length,
        temperature=temperature,
        do_sample=False
    )
    print(summary)
    return summary[0]['summary_text']

In [7]:
def aggregate_response(user_prompt: str, summarized_text: str, max_length: int = 130, min_length: int=30, temperature: float = 0.7) -> str:
  # Construct the aggregator prompt
    aggregator_prompt = f"""
You are given the user's prompt, along with a summarization of expert responses to this prompt. Your task is to create a
helpful response to the user's prompt, based 0n the summary of the expert responses.


User Query:
{user_prompt}

Summarization of Expert Responses:
{summarized_text}

[ANSWER_START]
"""

    # Generate the aggregated answer
    generated_output = aggregator_pipeline(
        aggregator_prompt,
        max_length=len(aggregator_prompt) + max_length,
        min_length=min_length,
        temperature=temperature,
        pad_token_id=tokenizer2.eos_token_id,
        do_sample=True
    )

    # Extract the generated text
    aggregated_answer = generated_output[0]['generated_text']
    if "[ANSWER_START]" in aggregated_answer:
        aggregated_answer = aggregated_answer.split("[ANSWER_START]")[1].strip()
    else:
        aggregated_answer = aggregated_answer[len(aggregator_prompt):].strip()

    return aggregated_answer

In [8]:
def final_answer(user_prompt: str, expert_outputs: list[str], max_length1: int = 50, max_length2: int = 130, min_length: int=30, temperature: float = 1.0) -> str:
    """
    Aggregates the responses from multiple expert models into a coherent answer using a Llama-based model.

    Args:
        user_prompt (str): The original user prompt.
        expert_outputs (list[str]): A list of responses from expert models.
        max_length (int): The maximum length of the generated output.
        temperature (float): Sampling temperature for generation.

    Returns:
        str: A coherent aggregated answer.
    """
    # get summary text
    summarized_text = generate_summary(expert_outputs, max_length1, min_length, temperature)

    aggregated_answer = aggregate_response(user_prompt, summarized_text, max_length2, min_length, 0.7)

    return aggregated_answer

# Example usage:
if __name__ == "__main__":
    user_query = "How can I optimize the performance of my deep learning models in production?"
    expert_responses = [
        "Optimizing data pipelines and leveraging hardware accelerators like GPUs or TPUs will further improve throughput.",
        "Consider using model quantization and pruning techniques to reduce model size and inference latency.",
        "Implementing efficient serving architectures such as TensorRT or ONNX Runtime can boost performance."
    ]

    final_answer = final_answer(user_query, expert_responses)
    print("Aggregated Answer:\n", final_answer)

[{'summary_text': 'Consider using model quantization and pruning techniques to reduce model size and inference latency. Optimizing data pipelines and leveraging hardware accelerators like GPUs or TPUs will further improve throughput.'}]
Aggregated Answer:
 You can also use the following techniques to optimize your model pipeline:

Create a sample pipeline, and use the following parameters:

{ "outputs": [ { "time": 1, "outputs": [ { "name": "Model " }, { "name": "Model " } ], "path": "", "value": "", "size": 1 } ], "outputs": [ { "time": 2, "outputs": [ { "name": "Model " }, { "name": "Model " } ], "path": "", "value": "", "size": 2 } ], "outputs": [ { "time": 3, "outputs": [ { "name": "Model " }, { "name": "Model " } ], "path": "", "value": "", "size": 2 } ], "outputs": [ { "time": 4, "outputs": [ { "name": "Model " }, { "name": "Model " } ], "path": "", "value": "", "size": 2 } ] }

To increase efficiency, visualize the outputs in an excel spreadsheet. Compare the results to the prev