In [2]:
import os
# Set a User-Agent for this process before the document loaders below are imported.
os.environ.update(USER_AGENT='myagent')

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

In [4]:
import torch
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings  # other embeddings available

In [5]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter

## Vector database from a list of strings

In [6]:
# Four short sentences used as the sample corpus for the first vector store.
data = [
    "I am Rito",
    "I am a male",
    "I work as a data scientist",
    "My favourite game is cricket",
]

In [7]:
# Uncomment to list the attributes and methods available on the Chroma class:
# dir(Chroma)

In [8]:
# Store the sentences in a Chroma vector store.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA instead of failing when the model is loaded.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}

# create embeddings
embedding = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# database
# Give this store its own collection. Without an explicit name every store
# uses the default collection, and in-memory stores created in the same
# kernel can end up sharing it, so texts from other sections would be mixed in.
vector_database = Chroma.from_texts(
    texts=data,  # takes a list of strings
    embedding=embedding,
    collection_name="from_list_of_strings",
)

  from tqdm.autonotebook import tqdm, trange


In [9]:
# Display the store object to confirm it was created.
vector_database

<langchain_community.vectorstores.chroma.Chroma at 0x20d53a02650>

## Vector database from a large piece of text (string)

In [10]:
# The same four facts as one string; it is split into sentences in the next cell.
data = (
    "I am Rito. I am a male. "
    "I work as a data scientist. My favourite game is cricket."
)

In [11]:
# Split the string on '.'. The splitter logs a warning for every chunk that is
# longer than chunk_size (see the output below).
text_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=10,
    chunk_overlap=0,
)
# NOTE: `data` is overwritten with the list of chunks, so re-run the cell that
# defines the raw string before re-running this one.
data = text_splitter.split_text(data)

Created a chunk of size 12, which is longer than the specified 10
Created a chunk of size 27, which is longer than the specified 10


In [12]:
# Uncomment to list the attributes and methods available on CharacterTextSplitter:
# dir(CharacterTextSplitter)

In [13]:
# Inspect the chunks returned by the splitter.
data

['I am Rito',
 'I am a male',
 'I work as a data scientist',
 'My favourite game is cricket']

In [14]:
# Store the chunks in a Chroma vector store.
# `embedding` (built from `model_name` / `model_kwargs` for the first store
# above) is reused here: the model and its settings are identical, so loading
# all-mpnet-base-v2 a second time would only cost time and memory.

# database
# Give this store its own collection. Without an explicit name it would use
# the same default collection as the first store, and in-memory stores created
# in the same kernel can share that collection, which would leave the earlier
# sentences in this store as duplicates.
vector_database = Chroma.from_texts(
    texts=data,  # takes a list of strings
    embedding=embedding,
    collection_name="from_split_text",
)

In [15]:
# Display the store object to confirm it was created.
vector_database

<langchain_community.vectorstores.chroma.Chroma at 0x20d53a02a40>

## From a text document

In [16]:
# Load the text file, then split it on '.'. The splitter logs a warning for
# every chunk that is longer than chunk_size (see the output below).
loader = TextLoader("data_example.txt")
raw_documents = loader.load()
text_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=10,
    chunk_overlap=0,
)
data = text_splitter.split_documents(raw_documents)

Created a chunk of size 12, which is longer than the specified 10
Created a chunk of size 27, which is longer than the specified 10


In [17]:
# Inspect the Document chunks returned by the splitter.
data

[Document(metadata={'source': 'data_example.txt'}, page_content='I am Rito'),
 Document(metadata={'source': 'data_example.txt'}, page_content='I am a male'),
 Document(metadata={'source': 'data_example.txt'}, page_content='I work as a data scientist'),
 Document(metadata={'source': 'data_example.txt'}, page_content='My favourite game is cricket')]

In [18]:
# Store the Document chunks in a Chroma vector store.
# `embedding` (built from `model_name` / `model_kwargs` for the first store
# above) is reused here: the model and its settings are identical, so loading
# all-mpnet-base-v2 again would only cost time and memory.

# database
# Give this store its own collection. Without an explicit name it would use
# the same default collection as the earlier stores, and in-memory stores
# created in the same kernel can share that collection, which would leave the
# earlier texts in this store as duplicates.
vector_database = Chroma.from_documents(
    documents=data,  # takes a list of text documents
    embedding=embedding,
    collection_name="from_text_document",
)