In [1]:
# Optional debug logging for this notebook. To enable it, uncomment the two
# imports below together with the two logging lines further down.
# import logging
# import sys
import os
from dotenv import load_dotenv
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Read a .env file into the process environment before llama_index is imported
# (presumably this is where the API credentials come from; confirm against the .env file).
load_dotenv()

import llama_index
# Bare last expression: the notebook displays the installed llama_index version.
llama_index.__version__

'0.9.42.post1'

In [2]:
import tiktoken
from llama_index import ServiceContext, set_global_service_context
from llama_index.callbacks import CallbackManager, TokenCountingHandler

# Token counts are computed with tiktoken (the tokenizer library published by OpenAI),
# using the encoding registered for the "text-embedding-ada-002" model.
ada_encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
token_counter = TokenCountingHandler(tokenizer=ada_encoding.encode, verbose=True)

# Attach the counter to a callback manager and wrap it in a service context.
callback_manager = CallbackManager([token_counter])
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

# Register the context globally: token counting then also applies to an index that is
# loaded from disk, not only to objects that are handed this context explicitly.
set_global_service_context(service_context)

In [3]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage

# Reuse the persisted index when its cache directory exists; rebuild only when it is missing.
#
# An explicit existence check replaces the previous bare `except:`. That handler caught
# every failure (including KeyboardInterrupt) and answered all of them by re-embedding
# the whole document set, which costs embedding tokens each time. With this check, a
# cache directory that exists but cannot be loaded raises the real error instead of
# silently triggering a rebuild.
PERSIST_DIR = './storage/cache/papers/llama2/'

if os.path.isdir(PERSIST_DIR):
    # Cache hit: restore the index from the persisted storage files.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    print('loading from disk')
else:
    # Cache miss: read the source documents from the local 'assets' directory.
    documents = SimpleDirectoryReader('assets').load_data()
    # Node parsing and embedding happen here. No service_context argument is passed
    # because the notebook registers a global service context in an earlier cell.
    index = VectorStoreIndex.from_documents(documents=documents)
    # Persist the index to disk so later runs can take the cache-hit branch.
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print('persisting from disk')

# Embedding tokens counted so far by the global token counter.
print(token_counter.total_embedding_token_count)

Embedding Token Usage: 77885
Embedding Token Usage: 53020
persisting from disk
130905


In [4]:
# Zero the counters so the figures below cover this query only.
token_counter.reset_counts()

query_engine = index.as_query_engine()
response = query_engine.query('what is llama2?')

# The '\n' entries are separate print() arguments, so print()'s default space separator
# is emitted around each newline; the tuple keeps that exact output layout.
usage_report = (
    'embedding tokens :', token_counter.total_embedding_token_count, '\n',
    'LLM prompts :', token_counter.prompt_llm_token_count, '\n',
    'LLM completions :', token_counter.completion_llm_token_count, '\n',
    'Total LLM token count :', token_counter.total_llm_token_count, '\n',
)
print(*usage_report)

# NOTE(review): the original comment wondered whether a loaded index makes the query use
# 0 tokens; the recorded output of this cell shows non-zero embedding and LLM counts.
print(response)

Embedding Token Usage: 5
LLM Prompt Token Usage: 1179
LLM Completion Token Usage: 124
embedding tokens : 5 
 LLM prompts : 1179 
 LLM completions : 124 
 Total LLM token count : 1303 

Llama 2 is a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. These models, specifically the Llama 2-Chat models, are optimized for dialogue use cases. They have been developed and released with the aim of outperforming open-source chat models on various benchmarks and potentially serving as a substitute for closed-source models. The approach to fine-tuning and safety improvements of Llama 2-Chat is described in detail to encourage the community to build on this work and contribute to the responsible development of LLMs.
