In [1]:
import dotenv
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [2]:
import os
import numpy 
import pandas as pd

In [3]:
import torch

In [4]:
from langchain.chains import RetrievalQA

In [5]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [6]:
# to start ollama, type in a root terminal 'ollama serve'
ollama = Ollama(base_url="http://localhost:11434", model="mistral")
llm = Ollama(model="mistral", temperature=0.5, verbose=True, 
             callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))

## Ingest the Data

In [9]:
# from langchain.document_loaders import WikipediaLoader
from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
import myloadlib
from myloadlib import loadDir, loadFile, loadWiki, loadYoutube, readAPI

In [10]:
# Small file -- three documents
small_dataset = []

docs = myloadlib.loadWiki("Service (motor vehicle)", "en", 3)
small_dataset.extend(docs)

In [11]:
len(small_dataset)

3

In [12]:
## medium file

In [13]:
medium_dataset=[]

docs = myloadlib.loadFile("https://transportation.wv.gov/highways/training/TrainingDocuments/Crawfords_Auto_Repair_Guide.pdf")

In [14]:
medium_dataset.extend(docs)
len(medium_dataset)

84

In [15]:
## Large file

In [16]:
large_dataset=[]
docs = myloadlib.loadFile("https://www.providenceschools.org/cms/lib/RI01900003/Centricity/Domain/723/Automotive%20training.pdf")

In [17]:
large_dataset.extend(docs)
len(large_dataset)

978

In [18]:
docs = myloadlib.loadFile("https://www.missskirtich.com/uploads/2/3/3/7/23374820/how_to_change_your_car_oil.pdf")
large_dataset.extend(docs)

In [19]:
len(large_dataset)

982

In [20]:
docs = myloadlib.loadFile("https://techinfo.honda.com/rjanisis/pubs/OM/IT9393/IT9393O00108A.pdf")
large_dataset.extend(docs)

In [21]:
len(large_dataset)

993

## Split the Ingested Documents into Chunks

In [22]:
# Define chunking strategy
smallSplitter = TokenTextSplitter(chunk_size=100, chunk_overlap=10)
mediumSplitter = TokenTextSplitter(chunk_size=500, chunk_overlap=100)
largeSplitter = TokenTextSplitter(chunk_size=256, chunk_overlap=52)

In [23]:
small_documents = smallSplitter.split_documents(small_dataset)
medium_documents = mediumSplitter.split_documents(medium_dataset)
large_doucments = largeSplitter.split_documents(large_dataset)

In [24]:
print(len(small_documents))
print(len(medium_documents))
print(len(large_doucments))

27
99
1477


## Embed the Chunks
Vectorize them

In [25]:
# Embedding facilities
from langchain import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import OllamaEmbeddings

In [26]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}

In [27]:
embedder = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

## Store in Weaviate Vector Database

### Create Server and Client

In [28]:
# from langchain_community.vectorstores import Weaviate
import weaviate
from langchain_weaviate.vectorstores import WeaviateVectorStore

In [29]:
#Initiate weaviate Connection

In [30]:
weaviate_url = "http://127.0.0.1:8080"

In [31]:
# weaviate_client.close()
weaviate_client = weaviate.connect_to_local()

In [32]:
# Check connection
weaviate_client.is_ready()

True

In [33]:
weaviate_client.get_meta()

{'hostname': 'http://[::]:8080', 'modules': {}, 'version': '1.25.1'}

### Create Databases

In [34]:
#Weaviate

In [35]:
small_w = WeaviateVectorStore.from_documents(small_documents, embedder, client=weaviate_client)

In [36]:
medium_w = WeaviateVectorStore.from_documents(medium_documents, embedder, client=weaviate_client)

In [37]:
large_w = WeaviateVectorStore.from_documents(large_doucments, embedder, client=weaviate_client)

In [38]:
#Chroma

In [45]:
import chromadb
from langchain.vectorstores import Chroma

smmall_client = chromadb.HttpClient(host='localhost', port=8000)
medium_client = chromadb.HttpClient(host='localhost', port=8001)
large_client = chromadb.HttpClient(host='localhost', port=8002)

small_chroma = Chroma.from_documents(small_documents, embedder, client=smmall_client)
medium_chroma= Chroma.from_documents(medium_documents, embedder, client=medium_client)
large_chroma= Chroma.from_documents(large_doucments, embedder, client=large_client)

## Perform Search

### Simple Search

In [46]:
query = "How do i change the oil of my car?"

In [47]:
# weaviate
docs = small_w.similarity_search(query)
print("----- Small ------")
# Print the first 100 characters of each result
for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:100] + "...")

print("")
print("")
print("----- Medium ------")
print("")
docs = medium_w.similarity_search(query)

# Print the first 100 characters of each result
for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:100] + "...")
    
print("")
print("")
print("----- Large ------")
print("")
docs = large_w.similarity_search(query)

# Print the first 100 characters of each result
for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:100] + "...")


----- Small ------

Document 1:
 every 30,000 to 45,000 kilometres (19,000 to 28,000 miles) – or every twelve months, whichever come...

Document 2:
 and full service: a major service is more comprehensive than a full service; although it covers all...

Document 3:
, in many states in the U.S., a car has to pass a safety inspection test every year or two years to ...

Document 4:
 United Kingdom, few parts that are not inspected on the MOT test are inspected and advised upon a S...


----- Medium ------


Document 1:
Crawfords Guide to Beginners Auto Maintenance & Repair  www.CrawfordsAutoService.com 
17 Engine Oil...

Document 2:
Crawfords Guide to Beginners Auto Maintenance & Repair  www.CrawfordsAutoService.com 
40 Wear Bar 
...

Document 3:
Crawfords Guide to Beginners Auto Maintenance & Repair  www.CrawfordsAutoService.com 
43 Chapter 5:...

Document 4:
Crawfords Guide to Beginners Auto Maintenance & Repair  www.CrawfordsAutoService.com 
15 Chapter 2:...


----- Large ------



In [48]:
# Chroma
docs = small_chroma.similarity_search(query)
print("----- Small ------")
# Print the first 100 characters of each result
for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:100] + "...")

print("")
print("")
print("----- Medium ------")
print("")
docs = medium_chroma.similarity_search(query)

# Print the first 100 characters of each result
for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:100] + "...")
    
print("")
print("")
print("----- Large ------")
print("")
docs = large_chroma.similarity_search(query)

# Print the first 100 characters of each result
for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:100] + "...")


----- Small ------

Document 1:
A motor vehicle service or tune-up is a series of maintenance procedures carried out at a set time i...

Document 2:
 every 30,000 to 45,000 kilometres (19,000 to 28,000 miles) – or every twelve months, whichever come...

Document 3:
 than using maintenance to keep the engine running as it should.


== Common tasks involved in maint...

Document 4:
 and full service: a major service is more comprehensive than a full service; although it covers all...


----- Medium ------


Document 1:
Crawfords Guide to Beginners Auto Maintenance & Repair  www.CrawfordsAutoService.com 
17 Engine Oil...

Document 2:
Crawfords Guide to Beginners Auto Maintenance & Repair  www.CrawfordsAutoService.com 
9 Oil Filter ...

Document 3:
Crawfords Guide to Beginners Auto Maintenance & Repair  www.CrawfordsAutoService.com 
45 Engine Oil...

Document 4:
Crawfords Guide to Beginners Auto Maintenance & Repair  www.CrawfordsAutoService.com 
20 Windshield...


----- Large ------



### MMR

__Maximal Marginal Relevance (MMR)__ is a method used to avoid redundancy while retrieving relevant items to a query. <br>
Instead of merely retrieving the most relevant items, which can often be very similar to each other, MMR ensures a balance between relevancy and diversity.

In [53]:
# Retrieve more documents with higher diversity
# Useful if your dataset has many similar documents
retriever_w = large_w.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 6, 'lambda_mult': 0.25}
)

retriever_chroma = large_chroma.as_retriever(
)

In [50]:
query = "How do i change the oil of my car"

In [51]:
response = retriever_w.invoke(query)
response[0]

Document(page_content='HOW TO CHANGE YOUR \nCAR’S OIL  \nUnless your car’s oil filter and/or oil drain plug is impossible to reach, \nyou can save money by changing your oil and oil filter yourself (your \ncar’s manual should tell you how much oil to get and  how often to \nchange your oil ). You don’t need to be a mechanic to learn  how to \nchange your oil  – it simply takes being able to identify certain parts of \nthe car and acquiring the right equipment to do the job. From a top -\nlevel view, the oil change process involves a few procedures \nincluding: draining the existing oil out of your engine, replacing certain \nequipment (i.e. – oil filter), adding oil, ensuring the re are no leaks, and \nquality testing.  \nTo start your DIY oil change, warm up your engine for 2 or 3 minutes \nso the gook gets churned up and can flow out of the engine easily. \nYou don’t want the engine so hot that you burn yourself. When it’s \nslightly warm t o the touch, shut off the', metadata={'page

In [54]:
response = retriever_chroma.invoke(query)
response[0]

Document(page_content='HOW TO CHANGE YOUR \nCAR’S OIL  \nUnless your car’s oil filter and/or oil drain plug is impossible to reach, \nyou can save money by changing your oil and oil filter yourself (your \ncar’s manual should tell you how much oil to get and  how often to \nchange your oil ). You don’t need to be a mechanic to learn  how to \nchange your oil  – it simply takes being able to identify certain parts of \nthe car and acquiring the right equipment to do the job. From a top -\nlevel view, the oil change process involves a few procedures \nincluding: draining the existing oil out of your engine, replacing certain \nequipment (i.e. – oil filter), adding oil, ensuring the re are no leaks, and \nquality testing.  \nTo start your DIY oil change, warm up your engine for 2 or 3 minutes \nso the gook gets churned up and can flow out of the engine easily. \nYou don’t want the engine so hot that you burn yourself. When it’s \nslightly warm t o the touch, shut off the', metadata={'page


# Conversational Chain


In [None]:
# With vectorDB

In [152]:
llm = ChatOllama(model="Mistral", temperature=0)

In [153]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever_chroma
)

In [154]:
response = qa_chain.invoke(
    "How do i change the oil of my car?")
print(response)

{'query': 'How do i change the oil of my car?', 'result': ' To change the oil of your car, follow these steps:\n\n1. Warm up your engine for 2 or 3 minutes to make the oil easier to drain.\n2. Shut off the engine and locate the drain plug under the oil pan at the bottom of the engine.\n3. Place a container under the drain plug that is big enough to catch the oil.\n4. Unscrew the oil drain plug, being careful not to burn yourself as the oil will be hot. The oil will drain out into the container.\n5. Remove the cap from the oil filler hole at the top of the engine and unscrew the oil filter using a wrench if necessary. Empty the oil from the filter into a drain pan and set it aside to take to a recycling center with your old oil.\n6. Check the engine oil level by removing the dipstick and wiping it clean, then re-inserting it all the way down and pulling it out to read the level. If necessary, add oil until it is even with the upper mark on the dipstick.\n7. Replace the engine oil fill c

In [155]:
#Without vectorDB

In [156]:
llm2 = ChatOllama(model="Mistral", temperature=0)

In [161]:
llm2.invoke("How do i change the oil of my car?")

AIMessage(content=" Changing the oil in your car is an essential maintenance task that can help keep your engine running smoothly and efficiently. Here are the general steps to follow when changing the oil in your car:\n\n1. Gather Your Tools and Supplies: You will need a few tools and supplies, including an oil filter wrench, a drain pan, a funnel, a new oil filter, new oil, a socket wrench or adjustable wrench, and possibly a torque wrench (depending on your vehicle).\n\n2. Warm Up the Engine: Start the engine and let it run for about 5-10 minutes to help the oil flow freely. Be sure to park in a safe location with good drainage and away from any open flames or sparks.\n\n3. Drain the Old Oil: Locate the oil drain plug, which is usually found at the bottom of the engine near the front. Place a drain pan underneath the vehicle to catch the old oil. Using a socket wrench or adjustable wrench, carefully remove the drain plug and allow the oil to drain out completely.\n\n4. Remove the Ol

## Conclusion
Seems like adding our own vectorized text did not give us a better answer...

## Building Knowledge Graph

We will be using Diffbot - a set of ML APIs to transform text into knowledge graph. We get access to it through LangChain platfirm. <br>
__DiffbotGraphTransformer__ extracts entities and relationships from unstructured text documents and outputs a structured data object __GraphDocument__, which can be used to populate a graph database. 

In [106]:
raw_documents = []
rd = WikipediaLoader(query="List of automobile sales by model").load()
raw_documents.extend(rd)


In [57]:
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer

In [58]:
import getpass

In [101]:
diffbot_api_token = getpass.getpass("Enter API Token: ")

Enter API Token:  ········


In [102]:
diffbot_nlp = DiffbotGraphTransformer(diffbot_api_token)

In [108]:
graph_document_car_makes_models = diffbot_nlp.convert_to_graph_documents(raw_documents)

### Connect to Neo4j

In [109]:
from langchain_community.graphs import Neo4jGraph

In [110]:
# DB URI and authentication
url = "bolt://localhost:7687"
username = "neo4j"
password = "mysecretpassword"

In [111]:
graph = Neo4jGraph(url=url, username=username, password=password)

Enable tracemalloc to get the object allocation traceback.
  unclosed_resource_warn(self)


The __GraphDocuments__ can be loaded into a knowledge graph using the __add_graph_documents__ method.

In [116]:
graph.add_graph_documents(graph_documents)

In [None]:
graph.schema

'Node properties are the following:\nOrganization {name: STRING, id: STRING, foundingDate: STRING, productType: STRING},Person {id: STRING, name: STRING, positionHeld: STRING, dateOfBirth: STRING, age: STRING, dateOfDeath: STRING, causeOfDeath: STRING, academicDegree: STRING},Location {name: STRING, id: STRING},Skill {name: STRING, id: STRING},Disease {id: STRING, name: STRING},Disaster {id: STRING, name: STRING},Award {id: STRING, name: STRING}\nRelationship properties are the following:\nEMPLOYEE_OR_MEMBER_OF {evidence: STRING, isCurrent: STRING, startTime: STRING, positionHeld: STRING, isNotCurrent: STRING, endTime: STRING},FOUNDED_BY {evidence: STRING},PERSON_LOCATION {evidence: STRING, isNotCurrent: STRING, endTime: STRING, startTime: STRING, isCurrent: STRING},CHIEF_EXECUTIVE_OFFICER {isNotCurrent: STRING, startTime: STRING, evidence: STRING, isCurrent: STRING},INDUSTRY {evidence: STRING},INTERESTED_IN {evidence: STRING},PARENT_ORGANIZATION {isCurrent: STRING, evidence: STRING, i

## Conclusion

We found that between chroma and Weaviate, there was not much difference in performance, and the answers were almost the same, therefore we cannot recommend one over the other.

Another interesting thing is that the mistral model did not get any any better by adding our own data as the answer to the same question gave us a better answer.