In [None]:
!pip -q install huggingface chromadb transformers langchain

In [None]:
!pip install pgvector openai

In [2]:
from langchain.embeddings import OpenAIEmbeddings

In [9]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader

In [10]:
# Create the loader for the source text file (hard-coded absolute path).
spaceLoad = TextLoader('/content/linux_play.txt')

In [60]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [12]:
# Run the loader; the result is what is later handed to split_documents().
documents = spaceLoad.load()

In [61]:
# Splitter configuration: chunk length is measured with len(), the target
# chunk size is 100 and consecutive chunks do not overlap.
recurCharSplitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=0,
    length_function=len,
)

In [63]:
# Read the same source file as plain text into lin_text.
# NOTE(review): lin_text is not used by any other cell visible in this
# notebook — confirm whether this read is still needed.
with open('/content/linux_play.txt') as f:
    lin_text = f.read()

# Split the loaded documents into chunks with recurCharSplitter.
char_documents = recurCharSplitter.split_documents(documents)

In [65]:
# Number of chunks produced by the splitter.
len(char_documents)

151

# Next: embed the chunks and store them in vector stores

In [19]:
from langchain.vectorstores import Chroma

In [None]:
# Directory in which Chroma keeps its on-disk data.
persist = 'chroma_db'

# Embed every chunk with OpenAI and store the vectors in a Chroma collection
# backed by the `persist` directory.
# `embedding` (singular) is the parameter name from_documents expects; any
# other spelling does not hand the embedder to the store.
plain_chroma = Chroma.from_documents(documents=char_documents,
                                     embedding=OpenAIEmbeddings(),
                                     persist_directory=persist)

# Write the collection to disk explicitly so that it can be reopened from
# `persist` in a later cell or session.
plain_chroma.persist()

In [21]:
# Reopen the Chroma collection stored under `persist`, using the same
# embedding model for queries as was used when the collection was built.
reload_chroma = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory=persist,
)



In [24]:
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.0/17.0 MB 24.3 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3


In [None]:
# Build a FAISS index from the chunks, then write it to a local folder with
# save_local() so it can be loaded again later.

from langchain.vectorstores import FAISS

# Folder name for the saved index (this is a path string, not the faiss library).
faiss = 'faiss_db'

plain_faiss = FAISS.from_documents(
    documents=char_documents,
    embedding=OpenAIEmbeddings(),
)

plain_faiss.save_local(faiss)

In [27]:
# Load the index back from the folder named by `faiss`, supplying the
# embedding model to use for queries.
reload_faiss = FAISS.load_local(
    faiss,
    embeddings=OpenAIEmbeddings(),
)

In [None]:
# Pinecone stores the vectors in a named index reached through the Pinecone
# API, so there is no local persist directory in this cell.
# NOTE(review): this needs the pinecone client package, which none of the
# install cells in this notebook add — confirm it is available in the runtime.
import pinecone 
from langchain.vectorstores import Pinecone

# Replace the placeholders with real values before running; prefer reading
# them from the environment over pasting them into the notebook.
pinecone.init(
    api_key="YOUR_API_KEY",  # find at app.pinecone.io
    environment="YOUR_ENV"  # next to api key in console
)

# Name of the Pinecone index that receives the vectors.
pindex = 'test-index'

# `embedding` (singular) is the parameter name from_documents expects; with
# any other spelling the embedder is not received by the call.
plain_pine = Pinecone.from_documents(documents=char_documents,
                                     embedding=OpenAIEmbeddings(),
                                     index_name=pindex)

# Attach to the same index again without re-uploading the documents.
reload_pine = Pinecone.from_existing_index(pindex, 
                                           embedding=OpenAIEmbeddings())

In [66]:
import os
from getpass import getpass

# Make the OpenAI key available through the OPENAI_API_KEY environment
# variable. The key is never written into the notebook: an already exported
# value is kept, otherwise it is prompted for without echoing.
# Run this cell before any cell that constructs OpenAIEmbeddings() or OpenAI().
# NOTE(review): any key that was ever committed in this cell must be treated
# as leaked and revoked.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [53]:
from getpass import getpass

from langchain.vectorstores.pgvector import PGVector

# Build the Postgres connection string from PGVECTOR_* environment variables,
# falling back to the defaults below when a variable is not set.
# The password has no hard-coded fallback: when PGVECTOR_PASSWORD is missing
# or empty it is prompted for without echoing, so no credential lives in the
# notebook.
# Relies on `os` having been imported by an earlier cell.
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver=os.environ.get("PGVECTOR_DRIVER", "psycopg2"),
    host=os.environ.get("PGVECTOR_HOST", "ec2-23-20-61-52.compute-1.amazonaws.com"),
    port=int(os.environ.get("PGVECTOR_PORT", "5432")),
    database=os.environ.get("PGVECTOR_DATABASE", "postgres"),
    user=os.environ.get("PGVECTOR_USER", "postgres"),
    password=os.environ.get("PGVECTOR_PASSWORD") or getpass("PGVector password: "),
)

In [68]:
# Embed the chunks with OpenAI and store them in the "test_db" collection of
# the Postgres database addressed by CONNECTION_STRING.
db = PGVector.from_documents(
    documents=char_documents,
    embedding=OpenAIEmbeddings(),
    connection_string=CONNECTION_STRING,
    collection_name="test_db",
)

In [80]:
# Query the PGVector store directly; the result is a list of
# (Document, score) pairs.
# NOTE(review): the scores look like distances (smaller = closer) — confirm
# against the store's distance setting.
query = "For which commands there will be text files?"
docs_with_score = db.similarity_search_with_score(query)

In [81]:
# Display the retrieved chunks together with their scores.
docs_with_score

[(Document(page_content='The fish will be already installed\nThere will be a text file for\n  a) grep commands', metadata={'source': '/content/linux_play.txt'}),
  0.555393186572163),
 (Document(page_content='c) csv files for awk / cut command', metadata={'source': '/content/linux_play.txt'}),
  0.5965337210434689),
 (Document(page_content='using the license file text document, and existing python scripts', metadata={'source': '/content/linux_play.txt'}),
  0.6141131821960744),
 (Document(page_content='Below command will keep the blank lines too, can you think why?', metadata={'source': '/content/linux_play.txt'}),
  0.6274188621032458)]

In [72]:
# Attach to the existing 'test_db' collection instead of re-embedding the
# documents; queries are embedded with the same OpenAI model.
reloadDB = PGVector(
    connection_string=CONNECTION_STRING,
    embedding_function=OpenAIEmbeddings(),
    collection_name='test_db',
)

In [75]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI

# Question-answering chain (despite the name, this is a chain, not a
# retriever): it pulls chunks through reloadDB's retriever and passes them to
# the OpenAI LLM using the "stuff" chain type, with temperature 0.
pgRetriever = RetrievalQAWithSourcesChain.from_chain_type(
    OpenAI(temperature=0),
    chain_type="stuff",
    retriever=reloadDB.as_retriever(),
)

In [82]:
# Ask the chain the same question; only the chain's outputs are returned.
pgRetriever(
    {"question": query},
    return_only_outputs=True,
)

{'answer': ' There will be text files for grep commands, csv files for awk/cut commands, and a license file text document.\n',
 'sources': '/content/linux_play.txt'}