### Inferring on the deployed RAG system

In [2]:
from google.cloud import aiplatform
from google.auth import credentials, load_credentials_from_dict
import json
from google.protobuf.json_format import MessageToDict
from datetime import datetime

# Load credentials and initialize client.
# The key file is read inside a `with` block so its handle is closed as soon as
# the JSON has been parsed, instead of staying open until garbage collection.
# NOTE: `credentials` rebinds the `credentials` module name imported from
# google.auth above; the name is kept because it is notebook-level state.
with open('research-paper-rag-0a8819b735b9.json') as key_file:
    credentials, project_id = load_credentials_from_dict(json.load(key_file))

# Regional API endpoint; same region (us-central1) as `location` below.
client_options = {"api_endpoint": "us-central1-aiplatform.googleapis.com"}
client = aiplatform.gapic.PredictionServiceClient(
    client_options=client_options,
    credentials=credentials
)

# Endpoint configuration: the pieces of the fully-qualified endpoint resource name.
project = "research-paper-rag"
location = "us-central1"
endpoint_id = "3729166308927864832"
endpoint = f"projects/{project}/locations/{location}/endpoints/{endpoint_id}"

def format_references(references):
    """Format references into a numbered, human-readable string.

    Args:
        references: Iterable of mapping-like reference records (anything
            ``dict()`` accepts). The keys read are ``title``, ``authors``,
            ``categories``, ``relevance_score`` and ``citation``. A missing
            key falls back to an empty value (``0.0`` for the score).

    Returns:
        One entry per reference, numbered from 1, each starting with a newline
        and ending with a blank line. An empty string when ``references`` is
        empty.
    """
    entries = []
    for i, ref in enumerate(references, 1):
        ref_dict = dict(ref)

        # Missing or None authors are rendered as an empty list of names.
        authors = ref_dict.get('authors') or []
        # A bare string is itself iterable, so joining it would split the name
        # into single characters ("A, d, a"); treat it as one author instead.
        if isinstance(authors, str):
            authors = [authors]

        # An explicit None score would break the ':.2f' format; show it the
        # same way a missing score is shown.
        score = ref_dict.get('relevance_score', 0.0)
        if score is None:
            score = 0.0

        entries.append(
            f"\n[{i}] Title: {ref_dict.get('title', '')}\n"
            f"    Authors: {', '.join(authors)}\n"
            f"    Categories: {ref_dict.get('categories', '')}\n"
            f"    Relevance Score: {score:.2f}\n"
            f"    Citation: {ref_dict.get('citation', '')}\n\n"
        )
    # Join once at the end instead of growing a string inside the loop.
    return "".join(entries)

def predict_query(query, max_tokens, num_papers=2):
    """Send a query to the deployed endpoint and return its answer and references.

    Relies on the notebook-level ``client`` and ``endpoint`` defined earlier
    in this cell, and on ``format_references`` for rendering.

    Args:
        query: Value sent as the instance's ``query`` field.
        max_tokens: Value sent as the instance's ``max_tokens`` field.
        num_papers: Value sent as the instance's ``num_papers`` field.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        A ``(main_response, references)`` tuple: the first prediction's
        ``response`` value and its ``references`` rendered by
        ``format_references``. On any failure the function does not raise;
        it returns ``("Error: <message>", "Error retrieving references")``.
    """
    try:
        # Prepare input data: a single instance per request.
        instance = {
            "query": query,
            'max_tokens': max_tokens,
            'num_papers': num_papers
        }

        # Get prediction
        response = client.predict(endpoint=endpoint, instances=[instance])

        # Report an empty result explicitly rather than as an opaque
        # index error from predictions[0].
        if len(response.predictions) == 0:
            raise ValueError("endpoint returned no predictions")
        prediction = dict(response.predictions[0])

        # Format response
        main_response = prediction['response']
        references = format_references(prediction['references'])

        return main_response, references

    except Exception as e:
        # Deliberate best-effort: the caller prints both values, so failures
        # are surfaced as text instead of interrupting the notebook run.
        return f"Error: {str(e)}", "Error retrieving references"
    
# Example question and generation budget for the deployed endpoint.
Query = "What is the Transformer model?"
max_tokens = 300
response, references = predict_query(Query, max_tokens)

# Emit the whole report with one call; sep="\n" reproduces the line breaks
# that separate print() calls would produce.
print(
    "\n=== Prediction Results ===\n",
    f"Query: {Query}\n",
    f"Response: {response}\n",
    "References:",
    references,
    sep="\n",
)


=== Prediction Results ===

Query: What is the Transformer model?

Response: The transformer is a neural network component that can be used to learn useful representations of sequences or sets of data-points. The transformer has driven recent advances in natural language processing, computer vision, and spatio-temporal modelling.

References:

[1] Title: An Introduction to Transformers
    Authors: Richard E. Turner
    Categories: cs.LG cs.AI
    Relevance Score: 0.52
    Citation: Richard E. Turner. "An Introduction to Transformers". cs.LG cs.AI.


[2] Title: A Survey of Techniques for Optimizing Transformer Inference
    Authors: Krishna Teja Chitty-Venkata, Sparsh Mittal, Murali Emani, Venkatram
  Vishwanath, Arun K. Somani
    Categories: cs.LG cs.AR cs.CL cs.CV
    Relevance Score: 0.48
    Citation: Krishna Teja Chitty-Venkata, Sparsh Mittal, Murali Emani, Venkatram
  Vishwanath, Arun K. Somani. "A Survey of Techniques for Optimizing Transformer Inference". cs.LG cs.AR cs.CL cs

### Inferring on the original model and comparing

In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text2text-generation pipeline around the google/flan-t5-base checkpoint.
pipe = pipeline(task="text2text-generation", model="google/flan-t5-base")

# Ask the model the question directly and print the raw pipeline output.
output = pipe("what is Transformer model?")
print(output)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


[{'generated_text': 'a samurai'}]
