In [None]:
!pip -q install langchain langchain-community tiktoken chromadb pypdf sentence_transformers==2.2.2 InstructorEmbedding google-generativeai langchain-google-genai

In [None]:
# Show the installed langchain version and its package metadata.
!pip show langchain

Name: langchain
Version: 0.2.2
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, async-timeout, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


In [None]:
!wget -q https://www.dropbox.com/s/zoj9rnm7oyeaivb/new_papers.zip
!unzip -q new_papers.zip -d new_papers

# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files - PDFs
- ChromaDB - with more metadata?
- Source info
- gpt-3.5-turbo API
- HuggingFace Embeddings
- Instructor Embeddings


## Setting up LangChain


In [None]:
import os

# Keep a key that is already exported in the environment; only fall back to an
# empty placeholder so the variable exists. Never hard-code a real key here.
os.environ.setdefault("OPENAI_API_KEY", "")

In [1]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
# from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
# from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings

  from tqdm.autonotebook import trange


## Load and process multiple documents

In [3]:
# Load the source document(s) to index.
# Alternative loaders kept for reference:
# loader = TextLoader('single_text_file.txt')
# loader = DirectoryLoader('./new_papers/new_papers/', glob="./*.pdf", loader_cls=PyPDFLoader)
loader = PyPDFLoader('./docs/competitor_data.pdf')

documents = loader.load()

In [4]:
# Number of Document objects returned by the loader.
len(documents)

49

In [5]:
# Split the loaded documents into chunks of up to 1000 characters,
# with 200 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [6]:
# Number of chunks produced by the splitter.
len(texts)

147

In [7]:
# Inspect a single chunk: its text plus the source/page metadata.
texts[20]

Document(page_content="without significant modifications, making the transition easier [ 2][4].  \n• The company's Titan Fuel Forge and Faraday Reactor are designed to be more efficient \nand cost -effective than traditional methods of producing synthetic fuels [ 2][3].  \n• Their process avoids expensive distillation and electrolysis steps by using a novel carbon \nnanotube membrane to separate alcohols from water  \n• They claim to operate at room temperature and atmospheric pressure, potentially \nreducing energy costs . If successful, it could provide a way to produce carbon -neutral \nliquid fuels compatible with existing infrastructure  \n  \nCons:  \n• Missed Targets : Prometheus has missed its initial targets, including selling carbon -neutral \nfuels by 2020. This raises concerns about the company's ability to meet its ambitious \ngoals [ 1].", metadata={'source': './docs/competitor_data.pdf', 'page': 6})

## HF Embeddings

In [None]:
# from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

# Alternative embedding backend: a sentence-transformers model loaded through
# HuggingFaceEmbeddings.
model_name = "sentence-transformers/all-mpnet-base-v2"

# NOTE(review): `hf` is not referenced by the cells visible below, which use
# `instructor_embeddings` instead — confirm whether this cell is still needed.
hf = HuggingFaceEmbeddings(model_name=model_name)

## HF Instructor Embeddings

In [None]:
!pip install huggingface-cli

Collecting huggingface-cli
  Downloading huggingface_cli-0.1-py3-none-any.whl (1.0 kB)
Installing collected packages: huggingface-cli
Successfully installed huggingface-cli-0.1


In [None]:
# Interactive login: prompts for a Hugging Face access token and stores it locally.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Load the INSTRUCTOR-XL embedding model; cache_folder='model' keeps the
# downloaded model files in a local `model` directory.
# To select a GPU, additionally pass model_kwargs={"device": "cuda"}.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    cache_folder='model',
)


load INSTRUCTOR_Transformer
max_seq_length  512


## Create the DB

In [9]:
# Embed every chunk and store the vectors in a Chroma collection.
# Supplying persist_directory makes Chroma keep the embeddings on disk.
persist_directory = 'db'

# Embedding model for the collection: the INSTRUCTOR embeddings loaded earlier.
embedding = instructor_embeddings

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Persist the DB to disk, then drop the in-memory handle so that the next cell
# demonstrates re-opening the collection from `persist_directory`.
# NOTE(review): recent Chroma releases persist automatically when a
# persist_directory is given — confirm whether persist() is still required.
vectordb.persist()
vectordb = None

In [4]:
# Re-open the persisted collection from disk and use it like the original store.
# The same embedding model is supplied so queries are embedded consistently
# with the stored vectors.
persist_directory = 'db'
embedding = instructor_embeddings
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

## Make a retriever

In [5]:
# Retriever over the vector store; k=5 asks for five chunks per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

In [19]:
# docs = retriever.get_relevant_documents("compare HYCO1 with prometheus fuels")
# `invoke` is used in place of the older get_relevant_documents call kept above for reference.
docs = retriever.invoke("compare HYCO1 with prometheus fuels")

In [23]:
# Sanity check: should equal the retriever's k (5).
len(docs)

5

In [24]:
# Inspect the retrieved chunks and their metadata.
docs

[Document(page_content="Prometheus Fuels is a startup that aims to produce carbon -neutral fuels by capturing CO2 from \nthe air and converting it into liquid fuels using renewable energy sources. Here are the pros and \ncons of Prometheus Fuels in regard to  technology, sentiment opinions, processes, and other \nfactors:  \n \nPros:  \n• The company plans to produce 50 billion e -fuels per year and capture 7 billion tons of \nCO2 by 2030, which could have a substantial impact on reducing emissions [ 1]. \n• Prometheus aims to produce fuels at a cost of $3 per gallon, which is competitive with \nfossil fuels. This could make the transition to carbon -neutral fuels more economically \nviable  [1][3].  \n• The fuels produced by Prometheus can be used in existing engines and infrastructure \nwithout significant modifications, making the transition easier [ 2][4].  \n• The company's Titan Fuel Forge and Faraday Reactor are designed to be more efficient", metadata={'page': 6, 'source': './d

In [None]:
# Search strategy the retriever is configured with.
retriever.search_type

'similarity'

In [None]:
# Keyword arguments the retriever was configured with (here: k).
retriever.search_kwargs

{'k': 5}

## Make a chain

In [None]:
# Colab only: read the API key from the notebook's stored user secrets.
# NOTE(review): the next cell reassigns API_KEY from the local `secret`
# module — only one of the two key sources is needed per environment.
from google.colab import userdata

API_KEY=userdata.get('API_KEY')

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI
import secret

# The key is handed over explicitly via the `google_api_key` parameter of
# `ChatGoogleGenerativeAI`; that is only required when no environment
# variable holding the API key is set.
API_KEY = secret.GOOGLE_API_KEY

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-latest",
    temperature=0.7,
    top_p=0.85,
    google_api_key=API_KEY,
)

In [10]:
from langchain import PromptTemplate
from langchain.document_loaders import WebBaseLoader
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [11]:
# Per-document template: renders a Document as just its page_content
# (intended for use with format_document).
doc_prompt = PromptTemplate.from_template("{page_content}")

# Prompt sent to Gemini. It answers from {context}, but explicitly allows the
# model to answer from its own knowledge when the context is missing or unhelpful.
llm_prompt_template = """Answer the question based on the following context:
{context}
If you don't have the context or the context does not provide any useful information, write based on what you think might be the answer for this question.
Question: {question}
"""
llm_prompt = PromptTemplate.from_template(llm_prompt_template)

# Show the parsed template and its input variables.
print(llm_prompt)

input_variables=['context', 'question'] template="Answer the question based on the following context:\n{context}\nIf you don't have the context or the context does not provide any useful information, write based on what you think might be the answer for this question.\nQuestion: {question}\n"


In [12]:
# Feed the same input (the question string) to both branches:
#   context  -> the retriever
#   question -> passed through unchanged
setup_and_retrieval = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
)

In [None]:
# Create the "stuff documents" RAG chain using LCEL.
# It is called a chain because different elements are chained together
# with the LLM.
#
# The chain implements the following pipeline:
# 1. `setup_and_retrieval` fetches the documents for the question and passes
#    the question through unchanged.
# 2. The retrieved Document objects are rendered with `doc_prompt` and joined
#    into one string that replaces the raw list under the `context` key.
#    Without this step the prompt would receive the Python repr of the
#    Document list (metadata included) instead of clean text.
# 3. `llm_prompt` is populated with `context` and `question`.
# 4. The prompt is passed to the LLM (Gemini).
# 5. Output from the LLM is passed through an output parser that reduces the
#    model response to a plain string.

def format_docs(docs):
    """Join the rendered text of the retrieved documents, separated by blank lines."""
    return "\n\n".join(format_document(doc, doc_prompt) for doc in docs)


stuff_chain = (
    setup_and_retrieval
    | RunnablePassthrough.assign(context=lambda inputs: format_docs(inputs["context"]))
    | llm_prompt         # Prompt for Gemini
    | llm                # Gemini model
    | StrOutputParser()  # output parser
)

In [None]:
# Ask the LCEL chain a question; the output parser yields the answer as a string.
stuff_chain.invoke('How does HYCO1 compare to its competitors')

'The provided text does not mention any competitors or how HYCO1 compares to them.'

In [13]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [17]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    The input is split on newline characters, each resulting line is wrapped
    on its own with ``textwrap.fill``, and the wrapped pieces are joined back
    together with newlines.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the wrapped answer followed by the metadata of each source document.

    ``llm_response`` is the mapping returned by the QA chain; its ``'result'``
    and ``'source_documents'`` entries are read.
    """
    answer = llm_response['result']
    print(wrap_text_preserve_newlines(answer))
    print('\n\nSources:')
    for doc in llm_response["source_documents"]:
        print(doc.metadata)

In [18]:
# full example
query = "compare HYCO1 with prometheus fuels"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Based on the provided context, I can offer a limited comparison between HYCO1 and Prometheus Fuels:

**Similarities:**

* **Goal:** Both companies aim to produce synthetic fuels as a more sustainable alternative to fossil fuels.
* **CO2 Utilization:** Both companies utilize captured CO2 as a feedstock for their processes.

**Differences:**

* **Feedstock:** HYCO1's CUBE technology uses CO2 and methane (CH4) as feedstocks, while Prometheus Fuels
relies on CO2 and water.
* **Products:** HYCO1 produces syngas (hydrogen and carbon monoxide), primarily for industrial applications.
Prometheus Fuels focuses on producing liquid fuels like gasoline, diesel, and jet fuel as direct replacements
for existing fuels.
* **Technology Maturity:** The provided text suggests HYCO1 is closer to commercialization, with plans for
mass-manufacturing 1 GW standalone reactors.  Prometheus Fuels, while demonstrating promising technology, is
still working towards scaling up and proving its commercial viability.


In [None]:
# Show the raw documents the retriever returns for the current `query`.
retriever.invoke(query)

[Document(page_content='Monolith is a next -generation chemical and energy company that believes technology will \ncreate the path to environmental transformation. They are the only producer of cost -effective, \ncommercially viable clean hydrogen today through their proprietary methane pyrolys is process.  \nMethane Pyrolysis Process  \nMonolith has innovated methane pyrolysis, which uses 100% renewable electricity to convert \nnatural gas or biogas into hydrogen and carbon black. This process is combustion -free and CO2 -\nfree, producing virtually no scope 1 emissions and significantly reduci ng life -cycle emissions.  \nThe natural gas feedstock is super -heated by electricity, breaking the bonds between hydrogen \nand carbon molecules. The hydrogen atoms and carbon atoms are then separated, with the \nhydrogen directed to end -users and the carbon further processed into carbon  black and other \nuses.  \nCommercial Operations', metadata={'page': 26, 'source': 'cd.pdf'}),
 Document

In [None]:
# break it down
query = "What does IO-aware mean?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# llm_response

 IO-aware means optimizing for reading and writing to fast/slow memory. It has a long history in computer
science and been known by many names, such as the working set model, data locality, the Rooﬂine model of
arithmetic intensity, analyses of scalability, and standard textbook treatments of computer architecture.


Sources:
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf


In [None]:
# Inspect the search type and the vector store behind the chain's retriever.
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x7f0cbf5433d0>)

In [None]:
# Print the prompt template used by the chain's combine-documents step.
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


## Starting again: loading the DB

Restart the runtime before running the cells below.

In [None]:
# Unpack a previously saved Chroma DB directory (db/) from db.zip.
!unzip db.zip

Archive:  db.zip
   creating: db/
  inflating: db/chroma-collections.parquet  
   creating: db/index/
  inflating: db/index/index_metadata_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl  
  inflating: db/index/uuid_to_id_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl  
  inflating: db/index/index_59c51927-205d-4fd7-88d8-c7ba851bd2a5.bin  
  inflating: db/index/id_to_uuid_59c51927-205d-4fd7-88d8-c7ba851bd2a5.pkl  
  inflating: db/chroma-embeddings.parquet  


In [None]:
import os
from getpass import getpass

# The OpenAI embeddings and chat model used below need an API key.
# Reuse a key that is already exported; otherwise prompt for it, so that no
# secret (or empty placeholder) is ever written into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [None]:
persist_directory = 'db'
embedding = OpenAIEmbeddings()

# Open the unzipped DB with OpenAI embeddings.
# NOTE(review): the embedding function should match the one the DB was built
# with — confirm db.zip was not created with the INSTRUCTOR embeddings.
vectordb2 = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

# k=2 asks for two chunks per query.
retriever = vectordb2.as_retriever(search_kwargs={"k": 2})



In [None]:
# Set up the turbo LLM
turbo_llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-3.5-turbo'
)

In [None]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=turbo_llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer, then the ``'source'`` metadata of each source document.

    ``llm_response`` is the mapping returned by the QA chain; its ``'result'``
    and ``'source_documents'`` entries are read.
    """
    answer = llm_response['result']
    print(answer)
    print('\n\nSources:')
    for doc in llm_response["source_documents"]:
        print(doc.metadata['source'])

In [None]:
# full example
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Pando raised $30 million in a Series B round, bringing its total raised to $45 million.


Sources:
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


### Chat prompts

In [None]:
# First message template of the chat prompt: the instructions plus {context}.
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [None]:
# Second message template of the chat prompt: just the {question} placeholder.
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt.template)

{question}
