<a href="https://colab.research.google.com/github/TirendazAcademy/LangChain-Tutorials/blob/main/Creating-a-Vector-Store.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a Vector Store (Vector Database)

In [1]:
%pip install -q langchain openai chromadb tiktoken

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m396.4/396.4 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l

In [2]:
import openai
import os

# Never hardcode the API key in the notebook: it ends up in version control
# and in shared copies. Use the key from the environment when one is already
# set; otherwise ask for it interactively (getpass does not echo the input).
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Hand the same key to the openai client library
openai.api_key = os.environ["OPENAI_API_KEY"]

## Collect data that we want to use to answer the users’ questions

![](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*ApbpqcZUMF-YaA6DbnVGww.png)

In [3]:
import requests
from bs4 import BeautifulSoup
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

# URL of the Wikipedia page to scrape
url = 'https://en.wikipedia.org/wiki/Prime_Minister_of_the_United_Kingdom'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently saving an error page as the data to be embedded.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract all the text on the page and strip the newlines.
# NOTE: removing newlines without a separator glues neighbouring words
# together (e.g. "WikipediaJump to content" in the preview of this text).
text = soup.get_text()
text = text.replace('\n', '')

# Save the extracted text to 'output.txt' so later cells can load it from disk
with open('output.txt', 'w', encoding='utf-8') as file:
    file.write(text)

In [4]:
# Show the first 100 characters of the scraped text as a quick sanity check
text[0:100]

'Prime Minister of the United Kingdom - WikipediaJump to contentMain menuMain menumove to sidebarhide'

## Load the data and define how you want to split the data into text chunks

![](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*f5mcqjHkiz9QX63dpQnzxg.png)

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the scraped page text back from disk
with open('./output.txt', encoding='utf-8') as saved_page:
    text = saved_page.read()

# Splitter settings: a chunk size of 500 with an overlap of 100 between
# neighbouring chunks, both measured with len (i.e. in characters)
splitter_options = {
    "chunk_size": 500,
    "chunk_overlap": 100,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# create_documents expects a list of strings, so wrap the single text in a list
texts = text_splitter.create_documents([text])

In [6]:
# Print the first three chunks. The loop variable is called `doc` rather than
# `text` so that it does not overwrite the module-level `text` string that
# holds the full document loaded in the previous cell.
for doc in texts[:3]:
    print(doc)

page_content='Prime Minister of the United Kingdom - WikipediaJump to contentMain menuMain menumove to sidebarhide\t\tNavigation\tMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\t\tContribute\tHelpLearn to editCommunity portalRecent changesUpload fileLanguagesLanguage links are at the top of the page across from the title.SearchSearchCreate accountLog inPersonal tools Create account Log inPages for logged out editors learn moreContributionsTalkContentsmove to' metadata={}
page_content="tools Create account Log inPages for logged out editors learn moreContributionsTalkContentsmove to sidebarhide(Top)1History2Authority, powers and constraints3Constitutional background4Modern premiershipToggle Modern premiership subsection4.1Appointment4.2Prime Minister's Office4.3Prime Minister's Questions4.4Security and transport4.5International role4.6Deputy4.6.1Succession4.7Resignation5Precedence, privileges and form of address6Retirement honours7Public Duty Costs Allowanc

## Define the Embeddings Model you want to use to calculate the embeddings for your text chunks and store them in a vector store

![](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*ydbixXRwfgMYVdpctYTdew.png)

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embeddings model used to calculate the vectors for the text chunks
embeddings = OpenAIEmbeddings()

# Fill the vector store from the text chunks, embedding them with the model above
db = Chroma.from_documents(documents=texts, embedding=embeddings)

## Calculate the embeddings for the user’s question, find similar text chunks in our vector store and use them to build our prompt

![](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*r2n4uA-ZlxZatnlhTVwv5Q.png)

In [8]:
from langchain.llms import OpenAI
from langchain import PromptTemplate

users_question = "Who is the current Prime Minister of the UK?"

# Use our vector store to find the text chunks most similar to the question.
# LangChain's similarity_search takes the number of hits as `k`; the
# previously used `n_results` is not one of its parameters, so the requested
# count of 5 never took effect.
results = db.similarity_search(
    query=users_question,
    k=5,
)

# define the prompt template
template = """
You are a chat bot who loves to help people! Given the following context sections, answer the
question using only the given context. If you are unsure and the answer is not
explicitly written in the documentation, say "Sorry, I don't know how to help with that."

Context sections:
{context}

Question:
{users_question}

Answer:
"""

prompt = PromptTemplate(template=template, input_variables=["context", "users_question"])

# Build the context from the text of each retrieved chunk. Formatting the raw
# list of Document objects would put their Python repr (brackets, metadata,
# escaped characters) into the prompt instead of clean text.
context = "\n\n".join(doc.page_content for doc in results)

# fill the prompt template
prompt_text = prompt.format(context=context, users_question=users_question)

# Ask the LLM. temperature=0 keeps the answer as deterministic as possible,
# which suits a factual question that must be answered from the context only.
llm = OpenAI(temperature=0)
llm(prompt_text)

'The current Prime Minister of the UK is Rishi Sunak, since 25 October 2022.'

### Resources

- [All You Need to Know to Build Your First LLM App](https://towardsdatascience.com/all-you-need-to-know-to-build-your-first-llm-app-eb982c78ffac)