In [46]:
import os
from langchain_openai import OpenAIEmbeddings  # You can use other embedding models as well
from tqdm import tqdm
import re
from langchain.document_loaders import UnstructuredWordDocumentLoader
from qdrant_client import QdrantClient
import openai
from qdrant_client.http import models


In [47]:
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small") 
input_dir = os.path.join("data") 
inserted_doc_file_path = "Introduction-to-Machine-Learning.docx"
inserted_doc_path = os.path.join(input_dir, inserted_doc_file_path)
inserted_doc_path



'data\\Introduction-to-Machine-Learning.docx'

In [48]:

def load_doc_with_langchain(file_path):
    loader = UnstructuredWordDocumentLoader(file_path,mode="elements")
    documents = loader.load()
    
    # The content of the file is in the 'page_content' attribute of the first (and usually only) document
    return documents

# Usage example
doc_content = load_doc_with_langchain(inserted_doc_path)
print(doc_content)



In [49]:
doc_dict_list = []
# exclude the first 6 pages of the document
is_before_page_6 = True
for i in range(len(doc_content)):
    current_element = doc_content[i].dict()
    if is_before_page_6:
        if "6" == current_element["page_content"]:
            is_before_page_6 = False
        else:
            continue
    current_element["metadata"]["element_id"] = i
    doc_dict_list.append(current_element)

print(doc_dict_list[:4])

[{'page_content': '6', 'metadata': {'source': 'data\\Introduction-to-Machine-Learning.docx', 'category_depth': 0, 'last_modified': '2024-07-15T21:25:45', 'page_number': 4, 'languages': ['eng'], 'file_directory': 'data', 'filename': 'Introduction-to-Machine-Learning.docx', 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'UncategorizedText', 'element_id': 51}, 'type': 'Document'}, {'page_content': 'Part 1: Introduction to Machine Learning', 'metadata': {'source': 'data\\Introduction-to-Machine-Learning.docx', 'category_depth': 0, 'last_modified': '2024-07-15T21:25:45', 'page_number': 4, 'languages': ['eng'], 'file_directory': 'data', 'filename': 'Introduction-to-Machine-Learning.docx', 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 52}, 'type': 'Document'}, {'page_content': 'Chapter 1: What is Machine Learning?', 'metadata': {'source': 'data\\Introduction-to-Machi

In [50]:
# assign page numbers to the elements
last_page = 0
for element in doc_dict_list:
    # check if the element is a number
    if re.match(r"^\d+$", element["page_content"]):
        last_page = int(element["page_content"])
        # delete the page number from the content
        element["page_content"] = ""
    if last_page != 0:
        element["metadata"]["page_number"] = last_page

# remove empty elements
doc_dict_list = [element for element in doc_dict_list if element["page_content"] != ""]
print(doc_dict_list[:41])


[{'page_content': 'Part 1: Introduction to Machine Learning', 'metadata': {'source': 'data\\Introduction-to-Machine-Learning.docx', 'category_depth': 0, 'last_modified': '2024-07-15T21:25:45', 'page_number': 6, 'languages': ['eng'], 'file_directory': 'data', 'filename': 'Introduction-to-Machine-Learning.docx', 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 52}, 'type': 'Document'}, {'page_content': 'Chapter 1: What is Machine Learning?', 'metadata': {'source': 'data\\Introduction-to-Machine-Learning.docx', 'category_depth': 1, 'last_modified': '2024-07-15T21:25:45', 'page_number': 6, 'languages': ['eng'], 'parent_id': 'd1ebddfee7a1485ac447ffe521a987fc', 'file_directory': 'data', 'filename': 'Introduction-to-Machine-Learning.docx', 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 53}, 'type': 'Document'}, {'page_content': 'Machine learning i

In [51]:
titles = [element for element in doc_dict_list if element["metadata"]["category"] == "Title"]
part_titles = [element for element in titles if "Part" in element["page_content"]]
chapter_titles = [element for element in titles if "Chapter" in element["page_content"]]
sub_titles = [element for element in titles if element not in part_titles and element not in chapter_titles]

non_title_elements = [element for element in doc_dict_list if element not in titles]

previous_part = part_titles[0]
previous_chapter = chapter_titles[0]
previous_sub_title = sub_titles[0]

for element in doc_dict_list:
    # assign the previous part to the element
    if element in part_titles:
        previous_part = element
    elif element in chapter_titles:
        previous_chapter = element
    elif element in sub_titles:
        previous_sub_title = element
    else:
        element["metadata"]["part"] = previous_part["page_content"]
        element["metadata"]["chapter"] = previous_chapter["page_content"]
        if element["metadata"]["element_id"] > previous_sub_title["metadata"]["element_id"] and previous_sub_title["metadata"]["element_id"] > previous_chapter["metadata"]["element_id"]:
            element["metadata"]["sub_title"] = previous_sub_title["page_content"]
        else:
            element["metadata"]["sub_title"] = ""


non_title_elements

[{'page_content': 'Machine learning is a branch of artificial intelligence that focuses on creating algorithms that can learn from data and make predictions or decisions based on that data. The goal of machine learning is to enable machines to learn from experience, so that they can improve their performance over time. In this chapter, we will explore the fundamentals of machine learning, including its definition, history, and key concepts.',
  'metadata': {'source': 'data\\Introduction-to-Machine-Learning.docx',
   'category_depth': 0,
   'last_modified': '2024-07-15T21:25:45',
   'page_number': 6,
   'languages': ['eng'],
   'parent_id': 'f6029a4b164053ad7f7b399d9ff7cca1',
   'file_directory': 'data',
   'filename': 'Introduction-to-Machine-Learning.docx',
   'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
   'category': 'NarrativeText',
   'element_id': 54,
   'part': 'Part 1: Introduction to Machine Learning',
   'chapter': 'Chapter 1: What is

In [52]:

edited_elements = []
# if the element has the same part, chapter and sub_title as the previous element, add the page_content to the previous element
previous_element_id = -1
for element in non_title_elements:
    edited_element = element.copy()
    if previous_element_id != -1 and element["metadata"]["part"] == edited_elements[previous_element_id]["metadata"]["part"] \
        and element["metadata"]["chapter"] == edited_elements[previous_element_id]["metadata"]["chapter"] \
        and element["metadata"]["sub_title"] == edited_elements[previous_element_id]["metadata"]["sub_title"] \
        and len(edited_elements[previous_element_id]["page_content"]) < 1500:
        edited_elements[previous_element_id]["page_content"] += f" {element['page_content']}" 
    else:
#         edited_element["page_content"] = f"""{edited_element["metadata"]["part"]}
# {edited_element["metadata"]["chapter"]}
# {edited_element["metadata"]["sub_title"]} 
# {edited_element["page_content"]}
# """
        edited_elements.append(edited_element)
        previous_element_id += 1

edited_elements[:5]


[{'page_content': 'Machine learning is a branch of artificial intelligence that focuses on creating algorithms that can learn from data and make predictions or decisions based on that data. The goal of machine learning is to enable machines to learn from experience, so that they can improve their performance over time. In this chapter, we will explore the fundamentals of machine learning, including its definition, history, and key concepts.',
  'metadata': {'source': 'data\\Introduction-to-Machine-Learning.docx',
   'category_depth': 0,
   'last_modified': '2024-07-15T21:25:45',
   'page_number': 6,
   'languages': ['eng'],
   'parent_id': 'f6029a4b164053ad7f7b399d9ff7cca1',
   'file_directory': 'data',
   'filename': 'Introduction-to-Machine-Learning.docx',
   'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
   'category': 'NarrativeText',
   'element_id': 54,
   'part': 'Part 1: Introduction to Machine Learning',
   'chapter': 'Chapter 1: What is

In [53]:
print(len(edited_elements))
print(edited_elements[-1])

116
{'page_content': 'In conclusion, machine learning is a rapidly growing field that has transformed various industries and domains. From image and speech recognition to natural language processing, machine learning has demonstrated its effectiveness in solving complex problems and providing insights from vast amounts of data. In this book, we covered the basics of machine learning, including its key concepts, types of learning, and popular algorithms. We also explored the practical applications of machine learning, such as predictive maintenance, fraud detection, and recommendation systems. Moreover, we delved into the details of some popular machine learning algorithms, including linear regression, decision trees, and neural networks. We also discussed the importance of model selection and evaluation, as well as common techniques for regularization and hyperparameter tuning. Lastly, we covered some advanced topics such as deep learning and unsupervised learning, which have shown gre

In [54]:
# get the size of the longest element
# print the longest element
longest_element = max(edited_elements, key=lambda x: len(x["page_content"]))
print(longest_element["page_content"])
print(len(longest_element["page_content"]))


# print the shortest element
shortest_element = min(edited_elements, key=lambda x: len(x["page_content"]))
print(shortest_element["page_content"])
print(len(shortest_element["page_content"]))

Deep learning architectures are composed of various types of neural networks, each uniquely designed for specific tasks. The architecture of a deep neural network is often characterized by the number of layers it contains, and the number of nodes within each layer. As the number of layers and nodes increase, the network's capacity to model complex patterns in the data increases as well. One common type of deep learning architecture is the deep feedforward network, also known as multi-layer perceptrons. These networks consist of input, hidden, and output layers, where the hidden layers perform computations on the input data to transform it into a more useful representation. Deep feedforward networks are commonly used for tasks such as regression and classification. Convolutional neural networks, on the other hand, are designed for image and video processing tasks. These networks use convolutional layers and pooling layers to extract features from the input data, allowing them to detect 

In [55]:
qdrant_client = QdrantClient(
    url=os.environ["QDRANT_URL"], 
    api_key=os.environ["QDRANT_API_KEY"],
)

print(qdrant_client.get_collections())

collections=[CollectionDescription(name='cod_fiscal_article_chunks'), CollectionDescription(name='machine_learning_course_doc'), CollectionDescription(name='prietenii_contabilitatii')]


In [56]:

openai_client = openai.Client(
    api_key=os.environ["OPENAI_API_KEY"]
)

embedding_model_name = "text-embedding-3-small"


In [57]:

openai.api_key = os.environ["OPENAI_API_KEY"]

collection_name = "machine_learning_course_doc"

# Create a new collection (if it doesn't exist)
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
)

  qdrant_client.recreate_collection(


True

In [58]:
# Insert data into Qdrant
for idx, item in tqdm(enumerate(edited_elements), total=len(edited_elements)):
    # Get embedding for the text
    embedding = openai_client.embeddings.create(input=item["page_content"], model=embedding_model_name)
    
    # Create a point to insert
    point = models.PointStruct(
        id=idx,
        vector=embedding.data[0].embedding,
        payload={
            "page_content": item["page_content"],
            "metadata": {
                "page_number": item["metadata"]["page_number"],
                "part": item["metadata"]["part"],
                "chapter": item["metadata"]["chapter"],
                "sub_title": item["metadata"]["sub_title"],
                "filename": item["metadata"]["filename"],
                "element_id": item["metadata"]["element_id"],
                "page_number": item["metadata"]["page_number"]
            }
        }
    )
    # Insert the point into the collection
    qdrant_client.upsert(
        collection_name=collection_name,
        points=[point]
    )

print("Data inserted successfully!")

100%|██████████| 116/116 [00:59<00:00,  1.95it/s]

Data inserted successfully!





In [59]:
from langchain_qdrant import Qdrant

retriever = Qdrant(
    client=qdrant_client, 
    collection_name=collection_name,
    embeddings= OpenAIEmbeddings(model=embedding_model_name),
).as_retriever(search_kwargs={"k": 5})

In [60]:
retriever.get_relevant_documents(query="What are the types of machine learning?", collection_name=collection_name)

[Document(page_content='Machine learning is a powerful tool that enables computers to learn from data and make predictions or decisions without being explicitly programmed. It has revolutionized various fields such as finance, healthcare, and marketing, among others. Machine learning can be broadly classified into three main categories: supervised learning, unsupervised learning, and reinforcement learning. Supervised learning trains a machine learning model on labeled data, where inputs and outputs are known. The model learns to map inputs to outputs by minimizing the difference between predicted and actual output. Examples include image classification, speech recognition, and natural language processing. Unsupervised learning trains a machine learning model on unlabeled data, where inputs are provided but outputs are not known. The goal is to find patterns or structure in the data without prior knowledge of the labels. Examples include clustering, anomaly detection, and dimensionalit