In [3]:
from langchain.embeddings import OpenAIEmbeddings

In [4]:
import os
from getpass import getpass

# Provide the OpenAI key through the process environment without echoing it.
# `%env NAME value` prints the value into the cell output, which is then saved
# with the notebook; reading an existing variable or prompting with getpass
# keeps the secret out of both the source and the stored output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

env: OPENAI_API_KEY=<redacted>


In [5]:
"""
By default uses:
    Model: text-embedding-ada-002
    Max chunk length: 8191
"""
# Built with no arguments, so the defaults listed in the note above apply.
# NOTE(review): the API key is presumably picked up from the OPENAI_API_KEY
# environment variable set in an earlier cell — confirm.
embeddings = OpenAIEmbeddings()

In [30]:
query = "What am I doing here?"
query_embedding = embeddings.embed_query(query)

# The recorded output shows embed_query returning a flat list of floats, so the
# vector's dimension is len(query_embedding). The previously commented-out line
# used len(query_embedding[0]) (len() of a float) and str + int concatenation,
# both of which raise TypeError.
[
    query_embedding,
    f"Dimension is: {len(query_embedding)}",
]

[[-0.009129731348626166,
  -0.031784058884610035,
  0.015866842753592774,
  -0.016420923828229066,
  -0.0346803865254814,
  0.0042720848535337,
  -0.00843713093665341,
  0.013713486108150824,
  0.017012783073288507,
  -0.016521664995142384,
  0.023472853940936966,
  -0.018813543771888628,
  -0.0024980833818757705,
  0.011579017616597836,
  -0.0005387328779946058,
  -0.0038093018001888524,
  0.03410112248742331,
  -0.014733497640898385,
  -0.004829312867275107,
  -0.012284211372926715,
  -0.021206161852902965,
  -0.015325355954635213,
  0.01740315625923087,
  -0.0029356809106616364,
  0.008361574595807119,
  0.016949818214153114,
  0.009891591429267155,
  -0.025261023157826187,
  0.006995263322734486,
  -0.018662431090196042,
  0.01698759638457626,
  0.007549343466048168,
  -0.015161650859909117,
  -0.017906865819087896,
  -0.01997207371065004,
  0.0008673245083874707,
  0.018561688060637505,
  -0.005446356938401571,
  0.019468366013438237,
  -0.011849761481737922,
  0.01052752458560519

In [22]:
from langchain.document_loaders import TextLoader, PyPDFLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [26]:
# Chunking parameters shared by both splitters below.
chunk_size, chunk_overlap = 350, 50
length_function = len  # chunk sizes are measured with the built-in len()

# Splitter using the library's default separators.
recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=length_function
)

# Splitter that uses an explicit blank-line separator.
character_text_splitter = CharacterTextSplitter(
    separator="\n\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=length_function
)

## Creating text chunks
Text chunks will be vectorised and put into storage.

To be maximally useful, the text must be split into its contextual parts. Those contextual parts must meet the following requirements:
- Be as small as possible;
- Be as focused on the subject as possible;

LLMs can be used to achieve this goal automatically:
1. Derive contextual parts from the text (ask LLM);
2. Split the overall document into chunks by their context (ask LLM);
3. Split the resulting text by a delimiter, usually "\n\n" (can be done with a langchain text splitter or a custom one);
4. Repeat the operation to achieve the desired granularity (the simplest check might be by length).

Example of a prompt for a generative OpenAI model to break down a text into small contextual pieces:
```
Define the main components of the text below based on the context and semantics. Mandatory use of all of the above rules to perform the task:
1. Breakdown the original text into independent blocks by using the defined components;
2. Separate text blocks by an empty line between each other;
3. Define a general title, in a few words, for the text below based on its context;
4. Name each text block according to the general title of the original text and the main component that each block describes;
5. For block naming use only - the general title of the original text and the name of the component based on its context;
6. Do not use numbers in the block's name;
7. After the block's name use a new line;

Text:
{text}
```

The expected output of the command is:
```
General Title: {general_title_name}

{general_title_name} - {block_name}
{block_content}
```

Content preprocessed with the above method is in the `docs/preprocessed_with_gpt` directory.

In [27]:
"""
Working with raw content produces unsatisfactory results
"""

# Build a loader for the raw PDF and chunk it with each splitter.
loaded_pdf = PyPDFLoader(file_path="./docs/what_is_microservice_architecture.pdf")
recursively_split_pages = loaded_pdf.load_and_split(
    text_splitter=recursive_text_splitter,
)
character_split_pages = loaded_pdf.load_and_split(
    text_splitter=character_text_splitter,
)

# Cell output: the first three chunks from each splitter. "+++" separates
# chunks; "============" separates the recursive results from the character ones.
[
    recursively_split_pages[0], "+++", recursively_split_pages[1], "+++", recursively_split_pages[2],
    "============",
    character_split_pages[0], "+++", character_split_pages[1], "+++", character_split_pages[2],
]

[Document(page_content='30/04/2023, 16:12 Microservice architecture style - Azure Architecture Center | Microsoft Learn\nhttps://learn.microsoft.com/en-us/azure/architecture/guide/architecture-styles/microservices 1/5Microservice architecture style\nAzure\nA microservices architecture consists of a collection of small, autonomous services. Each', metadata={'source': './docs/what_is_microservice_architecture.pdf', 'page': 0}),
 '+++',
 Document(page_content='service is self-contained and should implement a single business capability within a\nbounded context. A bounded context is a natural division within a business and provides\nan explicit boundary within which a domain model exists.\nMicroservices are small, independent, and loosely coupled. A single small team of', metadata={'source': './docs/what_is_microservice_architecture.pdf', 'page': 0}),
 '+++',
 Document(page_content='developers can write and maintain a service.\nEach service is a separate codebase, which can be managed by a

In [28]:
"""
Working with context-grouped content produces the required chunking result!

Requirements:
    - Contextual blocks are separated by a well identifiable delimiter (\n\n for example);

Actual chunking of context grouped content can be conveniently done via CharacterTextSplitter.
"""

# Build a loader for the GPT-preprocessed text file and chunk it with each
# splitter. The variable keeps the name `loaded_pdf` even though it holds a
# TextLoader here.
loaded_pdf = TextLoader(file_path="./docs/preprocessed_with_gpt/what_is_microservice_architecture.txt")
recursively_split_pages = loaded_pdf.load_and_split(
    text_splitter=recursive_text_splitter,
)
character_split_pages = loaded_pdf.load_and_split(
    text_splitter=character_text_splitter,
)

# Cell output: the first three chunks from each splitter. "+++" separates
# chunks; "============" separates the recursive results from the character ones.
[
    recursively_split_pages[0], "+++", recursively_split_pages[1], "+++", recursively_split_pages[2],
    "============",
    character_split_pages[0], "+++", character_split_pages[1], "+++", character_split_pages[2],
]

Created a chunk of size 627, which is longer than the specified 350
Created a chunk of size 571, which is longer than the specified 350
Created a chunk of size 365, which is longer than the specified 350
Created a chunk of size 2223, which is longer than the specified 350
Created a chunk of size 2272, which is longer than the specified 350
Created a chunk of size 1494, which is longer than the specified 350


[Document(page_content='1. What are microservices?', metadata={'source': './docs/preprocessed_with_gpt/what_is_microservice_architecture.txt'}),
 '+++',
 Document(page_content='Microservices are small, independent, and loosely coupled. A single small team of developers can write and maintain a service. Each service is a separate codebase, which can be managed by a small development team. Services can be deployed independently. A team can update an existing service without rebuilding and redeploying the entire application.', metadata={'source': './docs/preprocessed_with_gpt/what_is_microservice_architecture.txt'}),
 '+++',
 Document(page_content='rebuilding and redeploying the entire application. Services are responsible for persisting their own data or external state. Services communicate with each other by using well-defined APIs. Internal implementation details of each service are hidden from other services. Supports polyglot programming.', metadata={'source': './docs/preprocessed_wi