In [12]:
%pip install -q -r requirements.txt
%pip install openai tiktoken

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os

from qdrant_client import QdrantClient

# Docs: https://github.com/qdrant/qdrant_client
# The Qdrant endpoint defaults to the values this notebook has always used
# ('qdrant', 6333). Set QDRANT_HOST / QDRANT_PORT in the environment to point
# the notebook at a server that is reachable under a different name or port.
QDRANT_HOST = os.environ.get('QDRANT_HOST', 'qdrant')
QDRANT_PORT = int(os.environ.get('QDRANT_PORT', '6333'))
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

In [5]:
from llama_index import download_loader

# Resolve the reader class by name via download_loader, then build the single
# `loader` instance that the scraping helpers below call into.
READER_NAME = 'ReadabilityWebPageReader'
ReadabilityWebPageReader = download_loader(READER_NAME)
loader = ReadabilityWebPageReader()

In [6]:
# Shell escape: ask the Playwright CLI to install its Chromium browser build.
# NOTE(review): presumably needed by ReadabilityWebPageReader to render pages — confirm.
!playwright install chromium

Downloading Chromium 110.0.5481.38 (playwright build v1045)[2m from https://playwright.azureedge.net/builds/chromium/1045/chromium-linux.zip[22m
Chromium 110.0.5481.38 (playwright build v1045) downloaded to /home/codespace/.cache/ms-playwright/chromium-1045
Downloading FFMPEG playwright build v1008[2m from https://playwright.azureedge.net/builds/ffmpeg/1008/ffmpeg-linux.zip[22m
FFMPEG playwright build v1008 downloaded to /home/codespace/.cache/ms-playwright/ffmpeg-1008
Downloading Firefox 108.0.2 (playwright build v1372)[2m from https://playwright.azureedge.net/builds/firefox/1372/firefox-ubuntu-20.04.zip[22m
Firefox 108.0.2 (playwright build v1372) downloaded to /home/codespace/.cache/ms-playwright/firefox-1372
Downloading Webkit 16.4 (playwright build v1767)[2m from https://playwright.azureedge.net/builds/webkit/1767/webkit-ubuntu-20.04.zip[22m
Webkit 16.4 (playwright build v1767) downloaded to /home/codespace/.cache/ms-playwright/webkit-1767


In [113]:
import asyncio

async def load_data(url):
    """Run the module-level `loader.load_data(url)` on a worker thread.

    The synchronous loader call is handed to `asyncio.to_thread`, so the
    coroutine can be awaited from the notebook; the loader's return value
    is passed through unchanged.
    """
    fetched = await asyncio.to_thread(loader.load_data, url)
    return fetched

async def load_langchain_documents(url):
    """Run `loader.load_langchain_documents(url=url)` on a worker thread.

    Mirrors `load_data`, except that the URL is forwarded to the loader as a
    keyword argument. Returns whatever the loader method returns.
    """
    worker_call = asyncio.to_thread(loader.load_langchain_documents, url=url)
    return await worker_call

documents = await load_data('https://lethain.com/forty-year-career/')
print(len(documents))
print(documents[0].extra_info_str)

scraped: https://lethain.com/forty-year-career/
1
title: A forty-year career.
length: 15196
excerpt: The Silicon Valley narrative centers on entrepreneurial protagonists who are poised one predestined step away from changing the world. A decade ago they were heroes, and more recently they’ve become villains, but either way they are absolutely the protagonists. Working within the industry, I’ve worked with quite a few non-protagonists who experience their time in technology differently: a period of obligatory toil required to pry open the gate to the American Dream.
byline: None
dir: None
lang: en-us
siteName: None


In [115]:
from llama_index import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Token-based splitter: chunk sizes are measured with the "cl100k_base"
# tiktoken encoding, using a chunk size of 128 and no overlap.
splitter_options = dict(encoding_name="cl100k_base", chunk_size=128, chunk_overlap=0)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)

def to_langchain_format(document):
    """Convert `document` to LangChain format, carrying over its text only.

    A fresh `Document` is built from `document.text` alone, so nothing else
    from the input (e.g. its extra info) is forwarded to the converted result.
    """
    text_only = Document(text=document.text)
    return text_only.to_langchain_format()

# Convert every scraped document to LangChain format, then split into chunks.
lc_source_documents = list(map(to_langchain_format, documents))
split_lcdocuments = text_splitter.split_documents(lc_source_documents)

# Quick sanity check: chunk count and the first chunk.
print(len(split_lcdocuments))
print(split_lcdocuments[0])


47
page_content='The Silicon Valley narrative centers on entrepreneurial protagonists who are poised one predestined step away from changing the world. A decade ago they were heroes, and more recently they’ve become villains, but either way they are absolutely the protagonists. (Although, perhaps the role of protagonist is expanding a bit.)' metadata={}


In [121]:
from IPython.display import Markdown, display

# Inspect a single chunk: render its text as Markdown, then print its metadata.
# The rendering goes through IPython's display()/Markdown explicitly so the
# cell does not rely on a helper that only exists in leftover kernel state.
lcdoc = split_lcdocuments[4]
display(Markdown(lcdoc.page_content))
print(lcdoc.metadata)

but forty years.It’s strange to realize that I lost sight of the forty-year career model, because for a long time it was the only model I knew. Growing up, a white collar career was my only reference point for participating in America’s dwindling middle class. It never occurred to me that alternatives existed,

{}


In [117]:
# Delete the 'test' collection (the whole collection, not just its points).
# The call's return value is shown as the cell output.
client.delete_collection('test')

False

In [122]:
from llama_index import Document, GPTQdrantIndex, LLMPredictor, ServiceContext, PromptHelper
from llama_index.embeddings.openai import OpenAIEmbedding, OpenAIEmbeddingModelType
from langchain import OpenAI

# Embedding model and completion LLM, each with explicit parameters.
embed_model = OpenAIEmbedding(model=OpenAIEmbeddingModelType.TEXT_EMBED_ADA_002)
completion_llm = OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=1024)
llm_predictor = LLMPredictor(llm=completion_llm)

# Prompt sizing used when prompts are assembled.
# NOTE(review): num_output (256) differs from the LLM's max_tokens (1024) —
# confirm the mismatch is intentional.
prompt_helper = PromptHelper(max_input_size=768, num_output=256, max_chunk_overlap=0)

# Bundle the three pieces so the index can be handed a single context object.
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
)

# Construct an initially empty Qdrant-backed index on the 'test' collection.
index = GPTQdrantIndex.from_documents([], client=client, collection_name='test', service_context=service_context)

# Add the chunks one by one, converting each LangChain document back into a
# llama_index Document before inserting it.
for lcdocument in split_lcdocuments:
    index.insert(Document.from_langchain_format(lcdocument))

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [insert] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [insert] Total embedding token usage: 63 tokens
INFO:llama_index.token_counter.token_counter:> [insert] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [insert] Total embedding token usage: 74 tokens
INFO:llama_index.token_counter.token_counter:> [insert] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [insert] Total embedding token usage: 64 tokens
INFO:llama_index.token_counter.token_counter:> [insert] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [insert] Total embedding token usage: 69 tokens
INFO:llama_index.token_counter.token_counter:> [insert

In [129]:
import numpy as np

collection_info = client.get_collection("test")

# Smoke-test the collection with a random probe vector whose length matches the
# collection's configured vector size. The generator is seeded so the probe is
# the same on every run of this cell.
rng = np.random.default_rng(42)
query_vector = rng.random(collection_info.config.params.vectors.size)
hits = client.search(
    collection_name="test",
    query_vector=query_vector,
    query_filter=None,  # Don't use any filters for now, search across all indexed points
    # NOTE(review): newer qdrant-client releases appear to call this `with_payload` — confirm against the installed version
    append_payload=True,  # Also return a stored payload for found points
    limit=5  # Return 5 closest points
)
print(len(hits))
hits[0].payload

5


{'doc_id': '1b5c7f91-e474-48e9-9a20-7213a7c27451',
 'extra_info': {},
 'text': '\n\nWorking within the industry, I’ve worked with quite a few non-protagonists who experience their time in technology differently: a period of obligatory toil required to pry open the gate to the American Dream.For some, this perspectives builds from the industry’s persistent, casual discrimination and indifference to their lived experiences, but I’ve'}

In [132]:
from IPython.display import Markdown, display

# Embedding-mode query with similarity_top_k=5.
response = index.query("What is the article about?", mode="embedding", similarity_top_k=5)

# Alternative kept for comparison (default query mode):
# response = index.query("What is the article about?")

# Token accounting for the query, followed by the answer and its sources.
print(llm_predictor.last_token_usage)
print(embed_model.last_token_usage)
# Render through display()/Markdown explicitly so the cell does not depend on a
# helper that only exists in leftover kernel state.
display(Markdown(str(response)))
print(response.get_formatted_sources())

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1490 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 6 tokens


1490
6




The article is about the deliberate efforts of the author to increase their visibility in the software engineering industry over the past decade, such as appearing on a podcast, speaking at Velocity, connecting to venture capitalists through the Margins newsletter, writing blog posts on infrastructure, creating zines, and publishing The Making of. The author also reflects on their journey and sets professional goals for 2019, such as getting An Elegant Puzzle published, speaking at three public conferences, reading books on a handful of specific topics, and starting a small business to learn more. They also discuss the importance of financial security in order to own their pace and learning, and the value of spending four hours a week thinking about the future in order to see further into the future. They look forward to what the next two or three decades may bring. Published on October 8, 2019.

> Source (Doc id: 5e096d2b-1ab1-4751-b570-2a5b60d20050): 



Abstractions ConUma’s appearance on Software Engineering Daily podcast and speaking at Veloci...

> Source (Doc id: 4ce92588-2027-4058-948f-3f996b43d2ca): 



The Margins newsletter has connected them to venture capitalists, journalists and each email ...

> Source (Doc id: 30b022ad-60b1-4c9d-85b7-f836ff92e535): 



a decade in, and I can only imagine what it will look like two or three decades in.I’m excite...

> Source (Doc id: 3ab6120f-b296-4f48-9668-a6f44c01a470): 



both kinds of learning. For 2019, I set myself the professional goals of getting An Elegant P...

> Source (Doc id: 113494e8-df38-40dd-bf70-b99eb4eca128): 



career.Financial security is a prerequisite to own your pace and learning.LearningOne section...


In [72]:
# Leftover debugging lines (disabled):
# print(Document.from_langchain_format(split_lcdocuments[0]))
# display_markdown(Document.from_langchain_format(split_lcdocuments[0]).text)

# Show the index serialized as a dict; being the last expression, the result is
# rendered as the cell output.
index.save_to_dict()

{'index_struct': {'__type__': <IndexStructType.QDRANT: 'qdrant'>,
  '__data__': {'index_id': 'e9d3181e-3a5d-4cba-b7bc-7b91d2749078',
   'summary': None,
   'nodes_dict': {},
   'doc_id_dict': {},
   'embeddings_dict': {}}},
 'docstore': {'docs': {'ce258026-7eed-4680-bdd2-aae7054bf413': {'text': 'The Silicon Valley narrative centers on entrepreneurial protagonists who are poised one predestined step away from changing the world. A decade ago they were heroes, and more recently they’ve become villains, but either way they are absolutely the protagonists. (Although, perhaps the role of protagonist is expanding a bit.)',
    'doc_id': 'ce258026-7eed-4680-bdd2-aae7054bf413',
    'embedding': None,
    'doc_hash': '56004d25b138b7f44b4635e5d341bd61114877ab6f8a0d1f563e8f6a6ed03826',
    'extra_info': {'title': 'A forty-year career.',
     'length': 15196,
     'excerpt': 'The Silicon Valley narrative centers on entrepreneurial protagonists who are poised one predestined step away from changing

In [1]:
import os

from llama_index.readers.qdrant import QdrantReader
from llama_index.indices import GPTListIndex

# The host defaults to 'qdrant' as before; set QDRANT_HOST in the environment
# when the Qdrant server is not resolvable under that name.
QDRANT_HOST = os.environ.get('QDRANT_HOST', 'qdrant')
reader = QdrantReader(host=QDRANT_HOST)

In [2]:
# Placeholder query vector passed to reader.load_data.
# NOTE(review): an empty vector presumably cannot match the collection's vector
# size — confirm what the reader expects here.
query_vector = []

In [3]:
# Load up to 5 documents from the 'immigration' collection using query_vector.
# NOTE(review): the recorded run of this cell raised ResponseHandlingException
# ("Name or service not known"), which looks like the Qdrant host name failing
# to resolve — confirm connectivity before relying on this cell.
documents = reader.load_data(collection_name='immigration', query_vector=query_vector, limit=5)

ResponseHandlingException: [Errno -2] Name or service not known