In [None]:
import os
import re

import openai
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain_openai import OpenAIEmbeddings  # You can use other embedding models as well
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http import models
from tqdm import tqdm


In [None]:
# Where the Word document to ingest lives, relative to the working directory.
input_dir = "data"
inserted_doc_file_path = "Introduction-to-Machine-Learning.docx"
inserted_doc_path = os.path.join(input_dir, inserted_doc_file_path)

# LangChain embedding wrapper around the OpenAI embedding model.
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Display the resolved document path as the cell output.
inserted_doc_path



In [None]:

def load_doc_with_langchain(file_path):
    """Load a Word document with ``UnstructuredWordDocumentLoader``.

    The loader is created in ``"elements"`` mode and its ``load()`` result is
    returned unchanged.

    Args:
        file_path: Path of the Word document to load.

    Returns:
        The value returned by ``loader.load()``. Presumably a list holding one
        LangChain document per extracted element (TODO confirm); later cells
        index it and read ``page_content`` and ``metadata`` from each entry.
    """
    return UnstructuredWordDocumentLoader(file_path, mode="elements").load()

# Usage example
doc_content = load_doc_with_langchain(inserted_doc_path)

# Show how many elements were extracted plus a short preview; printing the
# whole list would flood the cell output with the entire document.
print(len(doc_content))
print(doc_content[:5])

In [None]:
# Convert the loaded elements to plain dicts, dropping everything that comes
# before the first element whose text is exactly "6". That element itself is
# kept. Each kept element records its index in `doc_content` as
# metadata["element_id"].
doc_dict_list = []
reached_page_6 = False
for position, document in enumerate(doc_content):
    element_dict = document.dict()
    if not reached_page_6:
        reached_page_6 = "6" == element_dict["page_content"]
    if reached_page_6:
        element_dict["metadata"]["element_id"] = position
        doc_dict_list.append(element_dict)

print(doc_dict_list[:4])

In [None]:
# Tag every element with the most recent page number seen so far.
# An element whose text consists only of digits is treated as a page-number
# marker: it updates the running page and has its text blanked so that it is
# dropped by the filter below.
current_page = 0
for item in doc_dict_list:
    text = item["page_content"]
    if re.match(r"^\d+$", text):
        current_page = int(text)
        item["page_content"] = ""
    # No page_number is written until the first marker has been seen.
    if current_page != 0:
        item["metadata"]["page_number"] = current_page

# Drop elements with empty text (including the blanked page-number markers).
doc_dict_list = list(filter(lambda item: item["page_content"] != "", doc_dict_list))
print(doc_dict_list[:41])


In [None]:
# Split the "Title" elements into three heading levels based on their text:
# parts (text contains "Part"), chapters (text contains "Chapter") and
# sub-titles (every other title).
titles = [element for element in doc_dict_list if element["metadata"]["category"] == "Title"]
part_titles = [element for element in titles if "Part" in element["page_content"]]
chapter_titles = [element for element in titles if "Chapter" in element["page_content"]]

# element_id is unique per element (it is the element's index in the loaded
# document), so id sets give constant-time membership tests instead of
# comparing whole dicts against every entry of a list.
part_title_ids = {element["metadata"]["element_id"] for element in part_titles}
chapter_title_ids = {element["metadata"]["element_id"] for element in chapter_titles}
title_ids = {element["metadata"]["element_id"] for element in titles}

sub_titles = [
    element for element in titles
    if element["metadata"]["element_id"] not in part_title_ids
    and element["metadata"]["element_id"] not in chapter_title_ids
]

# These are the same dict objects as in doc_dict_list, so the metadata written
# by the loop below is visible through non_title_elements as well.
non_title_elements = [
    element for element in doc_dict_list
    if element["metadata"]["element_id"] not in title_ids
]

# Seed each level with its first heading (elements that precede the first
# part/chapter are attributed to it). None means the document has no heading
# of that level, in which case an empty string is stored instead of crashing.
previous_part = part_titles[0] if part_titles else None
previous_chapter = chapter_titles[0] if chapter_titles else None
previous_sub_title = sub_titles[0] if sub_titles else None

for element in doc_dict_list:
    element_id = element["metadata"]["element_id"]
    if element_id in part_title_ids:
        previous_part = element
    elif element_id in chapter_title_ids:
        previous_chapter = element
    elif element_id in title_ids:
        # Any remaining title is a sub-title.
        previous_sub_title = element
    else:
        metadata = element["metadata"]
        metadata["part"] = previous_part["page_content"] if previous_part else ""
        metadata["chapter"] = previous_chapter["page_content"] if previous_chapter else ""

        # A sub-title only applies when it precedes this element and belongs
        # to the current chapter, i.e. it appears after the chapter heading.
        sub_title_applies = (
            previous_sub_title is not None
            and element_id > previous_sub_title["metadata"]["element_id"]
            and (
                previous_chapter is None
                or previous_sub_title["metadata"]["element_id"]
                > previous_chapter["metadata"]["element_id"]
            )
        )
        metadata["sub_title"] = previous_sub_title["page_content"] if sub_title_applies else ""


# Preview the annotated body elements.
non_title_elements[:5]

In [None]:

# Merge consecutive body elements into larger chunks.
# An element is appended to the text of the most recent chunk when both share
# the same part, chapter and sub_title and that chunk's text is still shorter
# than 1500 characters; otherwise the element starts a new chunk.
# A new chunk is a shallow copy of its first element, so it keeps that
# element's metadata while the source element's text is left untouched.
edited_elements = []
for element in non_title_elements:
    last_chunk = edited_elements[-1] if edited_elements else None
    same_section = last_chunk is not None and all(
        element["metadata"][level] == last_chunk["metadata"][level]
        for level in ("part", "chapter", "sub_title")
    )
    if same_section and len(last_chunk["page_content"]) < 1500:
        last_chunk["page_content"] += f" {element['page_content']}"
    else:
        edited_elements.append(element.copy())

edited_elements[:5]


In [None]:
# Report how many chunks were produced and show the final one.
chunk_count = len(edited_elements)
final_chunk = edited_elements[-1]
print(chunk_count)
print(final_chunk)

In [None]:
def _content_length(chunk):
    """Return the number of characters in a chunk's text."""
    return len(chunk["page_content"])


# Find the chunks with the longest and the shortest text.
longest_element = max(edited_elements, key=_content_length)
shortest_element = min(edited_elements, key=_content_length)

# Print each one's text followed by its length, longest first.
for extreme_element in (longest_element, shortest_element):
    print(extreme_element["page_content"])
    print(_content_length(extreme_element))

In [None]:
# Connect to Qdrant using the URL and API key from the environment.
# A missing variable raises KeyError here.
qdrant_client = QdrantClient(url=os.environ["QDRANT_URL"], api_key=os.environ["QDRANT_API_KEY"])

# List the collections currently known to the server.
existing_collections = qdrant_client.get_collections()
print(existing_collections)

In [None]:

# Name of the OpenAI embedding model used for ingestion and retrieval.
embedding_model_name = "text-embedding-3-small"

# OpenAI API client; the key is read from the environment (KeyError if unset).
openai_client = openai.Client(api_key=os.environ["OPENAI_API_KEY"])


In [None]:

openai.api_key = os.environ["OPENAI_API_KEY"]

collection_name = "machine_learning_course_doc"

# Start from an empty collection. Any existing collection with this name is
# DELETED first, so re-running this cell discards previously stored points.
# This is what the deprecated `recreate_collection` did; the explicit calls
# make the destructive step visible.
if qdrant_client.collection_exists(collection_name):
    qdrant_client.delete_collection(collection_name)

# The vector size must match the embedding model's output dimension
# (1536 is the default for text-embedding-3-small).
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
)

In [None]:
# Insert data into Qdrant.
# Chunks are embedded and upserted in batches: the embeddings endpoint accepts
# a list of inputs and `upsert` accepts a list of points, so this avoids one
# request to each service per chunk.
batch_size = 64

for start in tqdm(range(0, len(edited_elements), batch_size)):
    batch = edited_elements[start:start + batch_size]

    # Get the embeddings for every text in the batch with a single request
    response = openai_client.embeddings.create(
        input=[item["page_content"] for item in batch],
        model=embedding_model_name,
    )
    # Order the results by their `index` so vectors line up with `batch`
    vectors = [entry.embedding for entry in sorted(response.data, key=lambda entry: entry.index)]
    if len(vectors) != len(batch):
        raise RuntimeError(
            f"Expected {len(batch)} embeddings but received {len(vectors)}"
        )

    # Build one point per chunk; the point id is the chunk's position in
    # edited_elements, exactly as with one-at-a-time insertion.
    points = [
        models.PointStruct(
            id=start + offset,
            vector=vector,
            payload={
                "page_content": item["page_content"],
                "metadata": {
                    "page_number": item["metadata"]["page_number"],
                    "part": item["metadata"]["part"],
                    "chapter": item["metadata"]["chapter"],
                    "sub_title": item["metadata"]["sub_title"],
                    "filename": item["metadata"]["filename"],
                    "element_id": item["metadata"]["element_id"],
                },
            },
        )
        for offset, (item, vector) in enumerate(zip(batch, vectors))
    ]

    # Insert the whole batch into the collection
    qdrant_client.upsert(
        collection_name=collection_name,
        points=points,
    )

print("Data inserted successfully!")

In [None]:
# Wrap the populated collection in a LangChain vector store and expose it as a
# retriever returning the 5 closest chunks per query.
# The payload keys are spelled out because they must match the payload layout
# written at ingestion time ("page_content" for the text, "metadata" for the
# rest); these are also the wrapper's defaults.
retriever = Qdrant(
    client=qdrant_client,
    collection_name=collection_name,
    embeddings=OpenAIEmbeddings(model=embedding_model_name),
    content_payload_key="page_content",
    metadata_payload_key="metadata",
).as_retriever(search_kwargs={"k": 5})

In [None]:
# Run a sample query. `invoke` replaces the deprecated `get_relevant_documents`.
# The collection is already bound to the retriever, so no collection_name is
# passed here.
retriever.invoke("What are the types of machine learning?")