In [44]:
# Load: fetch the source paper from arXiv as LangChain documents.
from langchain_community.document_loaders import ArxivLoader

# arXiv identifier of the paper this notebook indexes.
ARXIV_PAPER_ID = "2405.17147"

loader = ArxivLoader(query=ARXIV_PAPER_ID)
docs = loader.load()


In [45]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters passed to the splitter (chunk_size / chunk_overlap).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunk_data = text_splitter.split_documents(docs)

# Last expression of the cell: displays how many chunks were produced.
len(chunk_data)

32

In [46]:
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
load_dotenv()
!pip install langchain_cohere
from langchain_cohere import CohereEmbeddings

openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Initialize Cohere embeddings
cohere_embeddings = CohereEmbeddings(model="embed-english-v2.0", cohere_api_key="pGdW2vRcRYnh8PKvJoZ5Slm8j0p1ZNHHw8bYVQTo")







In [48]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key read from the environment earlier.
pc = Pinecone(api_key=pinecone_api_key)
index_name = "rag2"

# Create the index only when no index with this name exists yet.
existing_index_names = pc.list_indexes().names()
index_is_missing = index_name not in existing_index_names

if index_is_missing:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    # NOTE(review): dimension=4096 presumably matches the vector size produced by
    # `cohere_embeddings` — confirm against the embedding model's documentation.
    pc.create_index(
        name=index_name,
        dimension=4096,
        metric="cosine",
        spec=serverless_spec,
    )

In [49]:
# Embed and store: embed every chunk with Cohere and write it to the Pinecone index.
from langchain_pinecone import PineconeVectorStore

# NOTE(review): re-running this cell presumably inserts the same chunks again
# (no explicit ids are passed) — verify before executing it more than once.
vector_store = PineconeVectorStore.from_documents(
    documents=chunk_data,
    embedding=cohere_embeddings,
    index_name=index_name,
)


In [50]:
query = "What metrics are used to evaluate the quality of experience (QoE) for users of large language model (LLM) service"

# Retriever over the vector store that returns the 3 best-matching chunks per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# `invoke` replaces the deprecated `get_relevant_documents` and matches the
# `retriever.invoke(query)` call used when the context is formatted afterwards.
# Last expression of the cell: displays the retrieved documents.
retriever.invoke(query)


[Document(metadata={'Authors': 'Haiwei Dong, Shuang Xie', 'Published': '2024-05-27', 'Summary': "The rapid advancement of Large Language Models (LLMs) has significantly\nimpacted human-computer interaction, epitomized by the release of GPT-4o, which\nintroduced comprehensive multi-modality capabilities. In this paper, we first\nexplored the deployment strategies, economic considerations, and sustainability\nchallenges associated with the state-of-the-art LLMs. More specifically, we\ndiscussed the deployment debate between Retrieval-Augmented Generation (RAG)\nand fine-tuning, highlighting their respective advantages and limitations.\nAfter that, we quantitatively analyzed the requirement of xPUs in training and\ninference. Additionally, for the tokenomics of LLM services, we examined the\nbalance between performance and cost from the quality of experience (QoE)'s\nperspective of end users. Lastly, we envisioned the future hybrid architecture\nof LLM processing and its corresponding sus

In [51]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in input order, separated by one blank
        line (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

# Fetch the chunks that best match the query; `retrieved_docs` is reused when
# the prompt is rendered.
retrieved_docs = retriever.invoke(query)

# Show the joined chunk text that will serve as the prompt context.
formatted_context = format_docs(retrieved_docs)
print(formatted_context)

CTSOC NEWS ON CONSUMER TECHNOLOGY
1
Large Language Models (LLMs):
Deployment, Tokenomics and Sustainability
Haiwei Dong Senior Member, IEEE, Shuang Xie Member, IEEE
Abstract—The rapid advancement of Large Language Models
(LLMs) has significantly impacted human-computer interaction,
epitomized by the release of GPT-4o, which introduced com-
prehensive multi-modality capabilities. In this paper, we first
explored the deployment strategies, economic considerations,
and sustainability challenges associated with the state-of-the-art
LLMs. More specifically, we discussed the deployment debate
between Retrieval-Augmented Generation (RAG) and fine-tuning,
highlighting their respective advantages and limitations. After
that, we quantitatively analyzed the requirement of xPUs in
training and inference. Additionally, for the tokenomics of LLM
services, we examined the balance between performance and cost
from the quality of experience (QoE)’s perspective of end users.

CTSOC NEWS ON CONSUMER TECH

In [52]:
# Prompt template for the question-answering step. It contains two named
# placeholders, {question} and {context}, which are filled in with
# `template.format(question=..., context=...)`.
template = """You are an expert LLM assistant specialized in answering questions related to large language models (LLMs). Use the provided information and your knowledge to respond accurately and clearly to each question. 

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. Use examples where applicable to illustrate your answers.
4. Maintain a professional and helpful tone.

Question: {question}

Context:{context}

Answer:
"""

In [53]:
# Fill the template's {question} and {context} placeholders with the user
# query and the joined text of the retrieved chunks.
context_text = format_docs(retrieved_docs)
prompt = template.format(context=context_text, question=query)

# Print the rendered prompt so it can be inspected.
print(prompt)

You are an expert LLM assistant specialized in answering questions related to large language models (LLMs). Use the provided information and your knowledge to respond accurately and clearly to each question. 

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. Use examples where applicable to illustrate your answers.
4. Maintain a professional and helpful tone.

Question: What metrics are used to evaluate the quality of experience (QoE) for users of large language model (LLM) service

Context:CTSOC NEWS ON CONSUMER TECHNOLOGY
1
Large Language Models (LLMs):
Deployment, Tokenomics and Sustainability
Haiwei Dong Senior Member, IEEE, Shuang Xie Member, IEEE
Abstract—The rapid advancement of Large Language Models
(LLMs) has significantly impacted human-computer interaction,
epitomized by the release of GPT-4o, which introduced com-
prehensive multi-modality capabilities. In 

In [54]:
import os

import requests

# NOTE(review): `url` is not assigned in this cell or in any cell visible above.
# It must hold the chat endpoint on the host named in `headers` before this cell
# runs; otherwise a fresh-kernel run fails with NameError.

# NOTE(review): only the bare `query` is sent below. The context-augmented
# `prompt` rendered earlier is not used — confirm whether this is an intentional
# no-retrieval baseline.

# Read the RapidAPI credential from the environment; never hardcode it.
rapidapi_key = os.getenv("RAPIDAPI_KEY")
if not rapidapi_key:
    raise RuntimeError(
        "RAPIDAPI_KEY is not set; add it to your environment or .env file."
    )

# SECURITY: an earlier revision of this cell embedded a RapidAPI key literally.
# That key must be treated as leaked and revoked/rotated.

# Construct the payload with the variable
payload = {
    "messages": [
        {
            "role": "user",
            "content": query
        }
    ],
    "system_prompt": "",
    "temperature": 0.9,
    "top_k": 5,
    "top_p": 0.9,
    "max_tokens": 256,
    "web_access": False
}

headers = {
    "x-rapidapi-key": rapidapi_key,
    "x-rapidapi-host": "chatgpt-42.p.rapidapi.com",
    "Content-Type": "application/json"
}

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.post(url, json=payload, headers=headers, timeout=60)
# Surface HTTP errors (401, 429, 5xx, ...) instead of parsing an error body as a result.
response.raise_for_status()

print(response.json())


{'result': "Evaluating Quality of Experience (QoE) in Large Language Model (LLM) services is a complex task as it depends on various factors that influence user satisfaction. Here's an overview of some commonly considered aspects:\n\n1. **Response Accuracy**: The most fundamental metric, response accuracy measures how well the LLM provides correct and relevant information based on the input query or prompt provided by the user. This can be assessed using automated methods like F-score, BLEU score, ROUGE, etc., which compare generated responses with ground truth data from human experts. However, these scores may not always correlate perfectly with perceived QoE due to their limitations.\n2. **Latency/Speed**: Latency refers to the time taken between submitting a request and receiving a complete answer. Users generally expect quick results when interacting with AI systems. Low latencies contribute positively towards better UX while high ones result in frustration and dissatisfaction amon