In [1]:
# Fetch the Paul Graham essay sample into data/paul_graham/ (mkdir -p creates the directory if needed; wget -O writes the file there).
# NOTE(review): the only visible reader of this directory is the commented-out SimpleDirectoryReader line in the next cell — confirm the download is still needed.
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'


--2024-06-06 16:40:41--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/paul_graham/paul_graham_essay.txt’


2024-06-06 16:40:42 (718 KB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]



In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
from llama_index.core.memory import ChatMemoryBuffer
import pandas as pd
from llama_index.core import Document

# Register BAAI/bge-base-en-v1.5 (wrapped by HuggingFaceEmbedding) as the embedding model on llama_index Settings.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
# NOTE(review): this spot was labelled "ollama", but nothing Ollama-related is imported or configured in the visible cells — the LLM is set up with LlamaCPP in a later cell.

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# data = SimpleDirectoryReader(input_dir="./data/paul_graham/").load_data()

def create_docs(dataset_csv):
    """Build one llama_index ``Document`` per usable row of a papers CSV.

    Args:
        dataset_csv: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV with the columns ``title``, ``link``, ``authors``,
            ``year`` and ``full_text``.

    Returns:
        list: ``Document`` objects in row order. ``full_text`` becomes the
        document text; the other four columns become its metadata. Rows whose
        ``full_text`` is missing or blank are skipped, and metadata fields
        whose cell is missing are left out of that document's metadata.

    Raises:
        KeyError: if any of the five required columns is absent from the CSV.
    """
    metadata_columns = ['title', 'link', 'authors', 'year']
    df = pd.read_csv(dataset_csv)
    docs = []
    rows = df[metadata_columns + ['full_text']].values.tolist()
    for *metadata_values, full_text in rows:
        # pandas reads empty cells as NaN (a float), which is not usable as
        # document text; blank text carries nothing to embed either.
        if pd.isna(full_text) or not str(full_text).strip():
            continue
        # Drop missing cells so metadata never carries a literal "nan" value.
        metadata = {
            key: value
            for key, value in zip(metadata_columns, metadata_values)
            if not pd.isna(value)
        }
        docs.append(Document(text=str(full_text), metadata=metadata))
    return docs

# CSV that create_docs reads the papers from.
DATASET_CSV = 'data/historiography1_full.csv'

# Turn the CSV rows into Documents, then build the vector index over them.
docs = create_docs(dataset_csv=DATASET_CSV)
index = VectorStoreIndex.from_documents(documents=docs)

2024-06-06 16:50:49.355679: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from llama_index.core.memory import ChatMemoryBuffer

# Q4_K_M-quantised GGUF build of Llama-2-13B-chat, hosted on Hugging Face.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_K_M.gguf"

llm = LlamaCPP(
    # Model source: the model is fetched from model_url; model_path is the
    # alternative for a pre-downloaded file and is left unset here.
    model_path=None,
    model_url=model_url,
    # Prompt formatting: convert chat messages / plain completions into the Llama 2 format.
    completion_to_prompt=completion_to_prompt,
    messages_to_prompt=messages_to_prompt,
    # Generation settings.
    max_new_tokens=256,
    temperature=0.2,
    # Llama 2 has a 4096-token context window; stay below it to keep some headroom.
    context_window=3500,
    # kwargs passed to __init__(); n_gpu_layers must be at least 1 to use the GPU.
    model_kwargs={"n_gpu_layers": 999},
    # kwargs passed to __call__() (none needed).
    generate_kwargs={},
    verbose=True,
)
# Register the model on llama_index Settings.
Settings.llm = llm



llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /tmp/llama_index/models/llama-2-13b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_

In [10]:
# Chat engine in "context" mode, built on top of the index.
# No explicit memory is supplied; to cap the chat history, build
# ChatMemoryBuffer.from_defaults(token_limit=512) and pass it as memory=.
# NOTE(review): the prompt text has grammatical slips ("a historic papers",
# "Please response"); it is left untouched because it is runtime input to the model.
chat_engine = index.as_chat_engine(
    system_prompt=(
        "You are a chatbot, able to have normal interactions, as well as talk"
        " about a historic papers on \"nogai horde\". Please response in an academic manner."
    ),
    chat_mode="context",
)

In [27]:
# Ask the chat engine to define the term; `response` is reused by the prints at the end of this cell.
query = "What's nogai horde? Give a short definition."
response = chat_engine.chat(query)

print(f"Prompt  : {query}")
def get_reference(source_content):
    """Extract the citation lines from a source's text content.

    Splits ``source_content`` on newlines and keeps only the lines that begin
    (at column 0) with ``title:``, ``link:``, ``authors:`` or ``year:``.

    Args:
        source_content: Multi-line string to scan.

    Returns:
        str: The kept lines, each stripped of surrounding whitespace, joined by
        a newline followed by a tab. Empty string when no line matches.
    """
    citation_prefixes = ("title:", "link:", "authors:", "year:")
    citation_lines = []
    for raw_line in source_content.split('\n'):
        # The prefix test runs on the raw line, so indented lines do not match.
        if raw_line.startswith(citation_prefixes):
            citation_lines.append(raw_line.strip())
    return '\n\t'.join(citation_lines)
# Show the citation lines pulled from the first source's content, followed by the generated answer.
print(f"Source:\n\t{get_reference(response.sources[0].content)}")
print(f"Response: {response.response}")


Llama.generate: prefix-match hit

llama_print_timings:        load time =     274.54 ms
llama_print_timings:      sample time =      50.87 ms /    93 runs   (    0.55 ms per token,  1828.33 tokens per second)
llama_print_timings: prompt eval time =     114.40 ms /   115 tokens (    0.99 ms per token,  1005.28 tokens per second)
llama_print_timings:        eval time =    1456.38 ms /    92 runs   (   15.83 ms per token,    63.17 tokens per second)
llama_print_timings:       total time =    1818.91 ms /   207 tokens


Prompt  : What's nogai horde? Give a short definition.
Source:
	title: Ногайская Орда в системе международных отношений рубежа XV-XVI вв
	link: https://cyberleninka.ru/article/n/nogayskaya-orda-v-sisteme-mezhdunarodnyh-otnosheniy-rubezha-xv-xvi-vv
	authors: Моисеев Максим Владимирович
	year: 2016
Response:   The Nogai Horde was a medieval Turkic state that existed from the 14th to 16th centuries in the western part of the Eurasian steppes, primarily in present-day Russia and Kazakhstan. It was formed by the Nogai people, a branch of the Golden Horde, and was known for its military prowess and ability to maintain independence despite encroachment from neighboring powers.


In [28]:
# Ask the chat engine to define the term, then show the prompt, the citation
# lines of the first source, and the generated answer.
query = "What's Sarai? Give a short definition."
response = chat_engine.chat(query)

print(f"Prompt  : {query}")
# get_reference is defined once, in the earlier "nogai horde" query cell; the
# copy that used to be re-declared here only shadowed that identical definition.
print(f"Source:\n\t{get_reference(response.sources[0].content)}")
print(f"Response: {response.response}")

Llama.generate: prefix-match hit

llama_print_timings:        load time =     274.54 ms
llama_print_timings:      sample time =      75.75 ms /   140 runs   (    0.54 ms per token,  1848.16 tokens per second)
llama_print_timings: prompt eval time =    1440.88 ms /  2769 tokens (    0.52 ms per token,  1921.75 tokens per second)
llama_print_timings:        eval time =    2343.63 ms /   139 runs   (   16.86 ms per token,    59.31 tokens per second)
llama_print_timings:       total time =    4166.48 ms /  2908 tokens


Prompt  : What's Sarai? Give a short definition.
Source:
	title: О книге Б. Г. Аягана "Абулхаир Шейбанид - последний правитель Дашти-Кыпчака"
	link: https://cyberleninka.ru/article/n/o-knige-b-g-ayagana-abulhair-sheybanid-posledniy-pravitel-dashti-kypchaka
	authors: Алпысбес Махсат Алпысбесулы
	year: 2019
	title: Специфика вотчинных прав башкир-семиродцев
	link: https://cyberleninka.ru/article/n/spetsifika-votchinnyh-prav-bashkir-semirodtsev
	authors: Азнабаев Б. А.
	year: 2012
Response:   Sarai (also spelled Saray or Sarai-Jük) is a medieval city located in present-day Kazakhstan, near the Caspian Sea. It was the capital of the Golden Horde, a Mongol khanate that ruled over much of Eastern Europe and Central Asia during the 13th to 14th centuries. Sarai was an important center of trade, culture, and religion, and was known for its impressive architecture, including mosques, palaces, and bathhouses. Today, the site of Sarai is a UNESCO World Heritage Site and is being excavated and stu

In [None]:
# Template query cell: shows the prompt, the citation lines of the first
# source, and the generated answer.
# TODO: fill in the term to define before running — the subject after "What's" is currently blank.
query = "What's ? Give a short definition."
response = chat_engine.chat(query)

print(f"Prompt  : {query}")
# get_reference is defined once, in the earlier "nogai horde" query cell; the
# copy that used to be re-declared here only shadowed that identical definition.
print(f"Source:\n\t{get_reference(response.sources[0].content)}")
print(f"Response: {response.response}")