In [None]:
!pip install langchain pinecone-client openai tiktoken langchain_openai

Collecting langchain_openai
  Downloading langchain_openai-0.0.7-py3-none-any.whl (33 kB)
Installing collected packages: langchain_openai
Successfully installed langchain_openai-0.0.7


In [None]:
import os
from getpass import getpass

# Secrets are requested interactively and only when they are not already
# present in the environment. This keeps keys out of the saved notebook and
# avoids overwriting a key that was exported before the kernel started.
for _key_name in ("OPENAI_API_KEY", "PINECONE_API_KEY"):
  if not os.environ.get(_key_name):
    os.environ[_key_name] = getpass(f"{_key_name}: ")

# Not a secret: the Pinecone environment in which the index is hosted.
os.environ["PINECONE_ENV"] = "gcp-starter"  #@param {type:"string"}

from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import TextLoader
from langchain.schema import SystemMessage
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.pydantic_v1 import BaseModel, Field, validator
import langchain, pinecone

from pinecone import Pinecone as pc
from pinecone import PodSpec

In [None]:
# Report the installed version of each key dependency, one line per package.
for _pkg in (langchain, pinecone):
  print(f"{_pkg.__name__} version: {_pkg.__version__}")

langchain version: 0.1.9
pinecone version: 3.1.0


In [None]:
#+============================= text processing =============================+
# Set up both tools first, then run them: the loader reads the source file and
# the splitter cuts the loaded documents into smaller, overlapping chunks.
loader = TextLoader("./example.txt")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

documents = loader.load()
texts = text_splitter.split_documents(documents)


#+============================== embedding text =============================+
# Completion model used by the QA chain, and the embedding model used to
# vectorise the text chunks.
llm = OpenAI()
embeddings = OpenAIEmbeddings()

pc_ = pc(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = "example-index"
# Vector size the index is created with. It has to equal the length of the
# vectors produced by `embeddings` (presumably 1536 for the default OpenAI
# embedding model -- TODO confirm if the embedding model is changed).
EMBEDDING_DIMENSION = 1536
# Use the environment configured for this notebook instead of a second
# hard-coded copy; fall back to the previous literal if it is unset.
pinecone_env = os.environ.get("PINECONE_ENV", "gcp-starter")

if index_name not in pc_.list_indexes().names():
    # First run: create the index and upload the embedded chunks.
    pc_.create_index(
        name=index_name,
        metric="cosine",
        dimension=EMBEDDING_DIMENSION,
        spec=PodSpec(environment=pinecone_env),
    )
    docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)
else:
    # Index already exists: reuse it as-is. NOTE: `texts` is NOT re-uploaded in
    # this branch, so changes to the source file are not reflected until the
    # index is deleted or refreshed.
    docsearch = Pinecone.from_existing_index(index_name, embeddings)


#+============================== define model ===============================+
# Question-answering chain over retrieved documents; it is run in the QA Bot
# cell with the documents returned by the similarity search.
chain = load_qa_chain(llm, chain_type="stuff")

# Schema of a single generated question/answer pair.
# NOTE(review): documentation is kept in `#` comments rather than docstrings on
# purpose -- pydantic may surface class docstrings in the generated schema,
# which would change the format instructions sent to the model; confirm before
# converting these to docstrings.
class question_answer(BaseModel):
  question: str = Field(..., description = "Question framed.")
  answer: str = Field(..., description = "Answer to the question.")

# Top-level object the chat model is asked to return: a list of
# question/answer pairs under the key `output` (empty list by default).
class output(BaseModel):
  output: list[question_answer] = []

# Supplies the format instructions placed in the prompt and parses the model's
# reply back into an `output` instance.
parser = PydanticOutputParser(pydantic_object=output)

#+================================== prompt ==================================+
# Template for the dataset-generation request. It has two placeholders:
#   {text}                -- the chunk of source text to build questions from
#   {format_instructions} -- output-format description obtained from `parser`
# Both are filled in through a PromptTemplate in the "calling LLM" section.
# The wording is sent to the model verbatim, so edits here change model output.
prompt = '''You are a dataset creation machine. You make dataset from a given data. Create as much question answer set as you can. Make sure you do not repeat questions and you cover every relevant topic to make the dataset.

Data Provided : {text}

{format_instructions}

Output:'''

#+=============================== calling LLM ===============================+
# Ask the chat model for question/answer pairs, one request per text chunk, and
# collect every parsed pair in `dataset`.
from langchain.schema import OutputParserException

dataset = []
chat_llm = ChatOpenAI()

# The template and the format instructions are the same for every chunk, so
# they are built once instead of on each iteration.
_prompt = PromptTemplate(
  template=prompt,
  input_variables=["text"],
  partial_variables={"format_instructions": parser.get_format_instructions()},
)

for chunk_no, chunk in enumerate(texts, start=1):
  # Send the chunk's raw text. Formatting the Document object itself would
  # insert its string representation (including metadata) into the prompt.
  _input = _prompt.format_prompt(text=chunk.page_content)
  message = [SystemMessage(content=_input.to_string())]

  # .invoke() replaces the deprecated direct call `chat_llm(message)`.
  result = chat_llm.invoke(message).content

  try:
    parsed_output = parser.parse(result)
  except OutputParserException as err:
    # A single malformed reply should not discard the pairs gathered so far;
    # report it and move on to the next chunk.
    print(f"chunk {chunk_no}/{len(texts)}: skipped, reply could not be parsed ({err})")
    continue

  dataset.extend(parsed_output.output)
  # Progress line instead of re-printing the whole, ever-growing dataset.
  print(f"chunk {chunk_no}/{len(texts)}: {len(parsed_output.output)} pairs added, {len(dataset)} total")

  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(


[question_answer(question="What is Toolformer in the context of Meta AI's recent approach?", answer="Toolformer is Meta AI's recent approach to fusing large language models (LLM) with external APIs."), question_answer(question='What does Toolformer combine in terms of programming paradigm?', answer='Toolformer combines zero-shot machine-learning methodology with traditional software interfaces, potentially starting a new programming paradigm.'), question_answer(question='What has led to a new wave of interest in large language models according to the content?', answer='The widespread success of ChatGPT has led to a new wave of interest in large language models.'), question_answer(question="How are people reacting to ChatGPT's capabilities on Twitter?", answer='People are impressed by the capabilities of ChatGPT on Twitter and herald it as the start of a new age of AI.'), question_answer(question='What caution does the author mention about looking at new technology?', answer='The author

A sample of the generated question-and-answer dataset is shown below.

In [None]:
# Show at most the first five generated pairs. Slicing (rather than indexing
# 0..4) keeps this cell from raising IndexError when fewer than five exist.
for qa_pair in dataset[:5]:
  print(qa_pair)

question="What is Toolformer in the context of Meta AI's recent approach?" answer="Toolformer is Meta AI's recent approach to fusing large language models (LLM) with external APIs."
question='What does Toolformer combine in terms of programming paradigm?' answer='Toolformer combines zero-shot machine-learning methodology with traditional software interfaces, potentially starting a new programming paradigm.'
question='What has led to a new wave of interest in large language models according to the content?' answer='The widespread success of ChatGPT has led to a new wave of interest in large language models.'
question="How are people reacting to ChatGPT's capabilities on Twitter?" answer='People are impressed by the capabilities of ChatGPT on Twitter and herald it as the start of a new age of AI.'
question='What caution does the author mention about looking at new technology?' answer='The author mentions that it is important to look at new technology without rose-colored glasses and to b

In [None]:
#+================================= QA Bot ==================================+
# Canned questions. Leave the list empty (or shorter than NUM_QUESTIONS) to be
# asked for the remaining questions interactively.
prompts=["who is meta?","what is a toolformer?","what is an llm?"]
NUM_QUESTIONS = 3

for i in range(NUM_QUESTIONS):
  if i < len(prompts):
    question = prompts[i]
    print(f"Question {i+1}:{question}")
  else:
    # No canned question left for this round (previously an IndexError when
    # `prompts` held one or two entries): ask the user instead.
    question = input(f"Question {i+1}> ")

  # Retrieve the chunks most similar to the question and let the QA chain
  # answer from them.
  docs = docsearch.similarity_search(question)
  # .invoke() replaces the deprecated chain.run(); the answer text is returned
  # under the "output_text" key.
  respond = chain.invoke({"input_documents": docs, "question": question})["output_text"]
  print(f"Response {i+1}:{respond}")



Question 1:who is meta?


  warn_deprecated(


Response 1:
Meta is the company that produced the recent paper discussing the solution for LLMs using external tools via simple APIs. They also developed Toolformer, a framework that integrates various tools to solve LLM problems.
Question 2:what is a toolformer?
Response 2:

A toolformer is a framework developed by Meta AI that integrates large language models with external tools through simple APIs. It aims to solve issues with LLMs that often produce false or inaccurate information by incorporating a variety of tools, such as a calculator, Q&A system, search engines, translation system, and calendar.
Question 3:what is an llm?
Response 3:
An LLM is a large language model, a type of artificial intelligence technology that uses machine learning to generate natural-sounding text.
