In [12]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance, VectorParams
from dotenv import load_dotenv
from tqdm import tqdm
import cohere
import uuid
import os

In [2]:
DESKTOP_PATH = "/Users/saschametzger/Desktop/"
FILE_PATH = DESKTOP_PATH + "phi3.pdf"

COLLECTION_NAME = "documents"

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

In [13]:
META_DATA = {
    "title": "phi3",
}

load_dotenv()

True

In [4]:
loader = PyPDFLoader(FILE_PATH)
pages = loader.load_and_split()
len(pages)

12

In [14]:
qdrant = QdrantClient(url="http://localhost:6333")
co = cohere.Client(os.getenv("COHERE_API_KEY"))

In [6]:
qdrant.delete_collection(COLLECTION_NAME)

qdrant.recreate_collection(
    COLLECTION_NAME,
    vectors_config=VectorParams(
        size=1024,
        distance=Distance.COSINE,
    ),
)

True

In [7]:
# text_splitter = SemanticChunker(
#     FastEmbedEmbeddings(
#         model_name=EMBEDDING_MODEL,
#     ),
#     breakpoint_threshold_type="percentile",
# )

text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

In [8]:
content = []
metadata = []
uuids = []

for i, page in enumerate(pages):
    for chunk in text_splitter.split_text(page.page_content):
        content.append(chunk)
        metadata.append(
            {
                **META_DATA,
                "page": str(i + 1),
                "content": chunk,
            }
        )
        uuids.append(str(uuid.uuid4()))

len(content)

12

In [15]:
embeddings = co.embed(
    texts=content,
    model="embed-multilingual-v3.0",
    input_type="search_document",
).embeddings

In [23]:
assert len(content) == len(metadata) == len(uuids) == len(embeddings)
assert len(content) > 0

In [24]:
len(embeddings.embeddings[0])

1024

In [25]:
for i, _ in enumerate(tqdm(range(len(embeddings)))):
    qdrant.upsert(
        collection_name=COLLECTION_NAME,
        points=[
            models.PointStruct(
                id=uuids[i],
                payload={
                    **metadata[i],
                },
                vector=embeddings[i],
            ),
        ],
    )

100%|██████████| 12/12 [00:00<00:00, 212.13it/s]


In [None]:
query = "What is the context length of phi-3?"

query_vector = co.embed(
    texts=content,
    model="embed-multilingual-v3.0",
    input_type="search_query",
).embeddings[0]

In [30]:
qdrant.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_vector,
    limit=5,
)

[ScoredPoint(id='f4d362eb-66bb-4289-8a73-18fc5a65eafb', version=0, score=0.9058182, payload={'content': 'Phi-3 Technical Report:\nA Highly Capable Language Model Locally on Your Phone\nMicrosoft\nAbstract\nWe introduce phi-3-mini , a 3.8 billion parameter language model trained on 3.3 trillion tokens,\nwhose overall performance, as measured by both academic benchmarks and internal testing, rivals\nthat of models such as Mixtral 8x7B and GPT-3.5 (e.g., phi-3-mini achieves 69% on MMLU and 8.38\non MT-bench), despite being small enough to be deployed on a phone. The innovation lies entirely in\nour dataset for training, a scaled-up version of the one used for phi-2 , composed of heavily filtered\nweb data and synthetic data. The model is also further aligned for robustness, safety, and chat format.\nWe also provide some initial parameter-scaling results with a 7B and 14B models trained for 4.8T\ntokens, called phi-3-small andphi-3-medium , both significantly more capable than phi-3-mini\n