In [37]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import os

In [14]:
!ls

README.md               task02.ipynb
task01.ipynb            wardley-map-rag-app.pdf


In [15]:
loader = PyPDFLoader("wardley-map-rag-app.pdf")

In [16]:
pages = loader.load_and_split()

In [17]:
pages

[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'wardley-map-rag-app.pdf', 'total_pages': 18, 'page': 1, 'page_label': '2'}, page_content="About the Author\nSimon Wardley is a researcher for the Leading Edge forum, former\nManager of Software Services at Canonical, and former CEO of\nFotango. He has written this book Creative Commons Attribution\nShare-Alike 4.0 to share his experiences learning the untold lessons of\nbusiness strategy. The license for this book can be found at\nhttps://creativecommons.org/licenses/by-sa/4.0/. You can follow\nSimon on Twitter @swardley.\nThis edition of Simon's book was compiled on December 23, 2020 for\nLearnWardleyMapping.com by Ben Mosior of Hired Thought. The\ncover image is provided by Hired Thought via the Adobe Stock\nStandard License."),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'wardley-map-rag-app.pdf', 'total_pages': 18, 'page': 2, 'page_label': '3'},

In [18]:
len(pages)

17

In [19]:
pages[0].page_content

"About the Author\nSimon Wardley is a researcher for the Leading Edge forum, former\nManager of Software Services at Canonical, and former CEO of\nFotango. He has written this book Creative Commons Attribution\nShare-Alike 4.0 to share his experiences learning the untold lessons of\nbusiness strategy. The license for this book can be found at\nhttps://creativecommons.org/licenses/by-sa/4.0/. You can follow\nSimon on Twitter @swardley.\nThis edition of Simon's book was compiled on December 23, 2020 for\nLearnWardleyMapping.com by Ben Mosior of Hired Thought. The\ncover image is provided by Hired Thought via the Adobe Stock\nStandard License."

In [20]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 200, chunk_overlap = 50)

In [21]:
chunks = text_splitter.split_documents(pages)

In [22]:
chunks

[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'wardley-map-rag-app.pdf', 'total_pages': 18, 'page': 1, 'page_label': '2'}, page_content='About the Author\nSimon Wardley is a researcher for the Leading Edge forum, former\nManager of Software Services at Canonical, and former CEO of'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'wardley-map-rag-app.pdf', 'total_pages': 18, 'page': 1, 'page_label': '2'}, page_content='Fotango. He has written this book Creative Commons Attribution\nShare-Alike 4.0 to share his experiences learning the untold lessons of\nbusiness strategy. The license for this book can be found at'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'wardley-map-rag-app.pdf', 'total_pages': 18, 'page': 1, 'page_label': '2'}, page_content="https://creativecommons.org/licenses/by-sa/4.0/. You can follow\nSimon on Twitter @swardley.\nThis editio

In [23]:
len(chunks)

127

In [24]:
chunks[0]

Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'wardley-map-rag-app.pdf', 'total_pages': 18, 'page': 1, 'page_label': '2'}, page_content='About the Author\nSimon Wardley is a researcher for the Leading Edge forum, former\nManager of Software Services at Canonical, and former CEO of')

In [25]:
chunks[1]

Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'wardley-map-rag-app.pdf', 'total_pages': 18, 'page': 1, 'page_label': '2'}, page_content='Fotango. He has written this book Creative Commons Attribution\nShare-Alike 4.0 to share his experiences learning the untold lessons of\nbusiness strategy. The license for this book can be found at')

In [26]:
chunks[2]

Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'wardley-map-rag-app.pdf', 'total_pages': 18, 'page': 1, 'page_label': '2'}, page_content="https://creativecommons.org/licenses/by-sa/4.0/. You can follow\nSimon on Twitter @swardley.\nThis edition of Simon's book was compiled on December 23, 2020 for")

In [28]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [31]:
texts = [chunk.page_content for chunk in chunks]
embed_docs = embeddings.embed_documents(texts)

In [32]:
len(embed_docs)

127

In [33]:
embed_docs[0]

[-0.03129297494888306,
 0.02223833277821541,
 -0.0054457904770970345,
 -0.036576204001903534,
 -0.01723145693540573,
 0.00030175369465723634,
 0.03043140284717083,
 0.014118415303528309,
 -0.02157183364033699,
 0.029277220368385315,
 0.0014122478896752,
 0.017654115334153175,
 -0.020320113748311996,
 -0.034072767943143845,
 0.0016256091184914112,
 0.04837812855839729,
 -0.010143800638616085,
 0.009566709399223328,
 -0.02930973283946514,
 0.003913654014468193,
 0.013728269375860691,
 -0.04203825071454048,
 -0.01564648747444153,
 0.014744275249540806,
 0.038006741553545,
 0.0022372445091605186,
 0.02275852859020233,
 0.006835686508566141,
 -0.011606848798692226,
 0.03807176649570465,
 -0.008883954025804996,
 -0.003946166019886732,
 0.006494308356195688,
 -0.007766347844153643,
 0.0017769939731806517,
 -0.024855563417077065,
 -0.012175812385976315,
 0.024367880076169968,
 -0.003001280827447772,
 0.03400774300098419,
 -1.9304108718642965e-05,
 0.006677189376205206,
 -0.032951097935438156,


In [34]:
len(embed_docs[0])

3072

In [39]:
vectorstore = Chroma.from_texts(texts=texts, embedding=embeddings)

In [40]:
vectorstore.similarity_search('Situational awareness', 2)

[Document(id='dd72c80e-64d8-427d-9817-39f9f7af3ea3', metadata={}, page_content='in the importance of maps and situational awareness. Through\nmisinformation and miscalculation, 1,700 Union troops were caught in'),
 Document(id='90399ef4-ea8a-4b1f-a957-6b03141ac5b4', metadata={}, page_content='battle at hand”. It is context specific i.e. these techniques are known to\ndepend upon the landscape and your purpose.\nI started to consider strategy in terms of these five factors. I')]

In [41]:
vectorstore.similarity_search('Themistocles SWOT', 2)

[Document(id='7eaa8d56-775b-4434-85c5-885d93c8052f', metadata={}, page_content='could defend around Thebes or Athens itself. However, Themistocles\nunderstood the environment and decided to block off the straits of'),
 Document(id='98b68b0f-19b7-4824-b721-408b9c753eb3', metadata={}, page_content='most famously cited example is the ancient battle of the pass of\nThermopylae. In 480 BC, the Athenian general Themistocles faced a\nsignificant foe in Xerxes and the Persian army. He had choices; he')]