In [None]:
!pip install -r "requirements.txt"



In [None]:
#Libraries
import os
import time
import warnings
warnings.filterwarnings('ignore')
from google.colab import userdata

In [None]:
#Langchain and it's components
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA

In [None]:
#Data Ingestion
loader = TextLoader("/content/OpenAI.txt")
documents = loader.load()
documents

[Document(metadata={'source': '/content/OpenAI.txt'}, page_content='OpenAI is an artificial intelligence research and deployment company founded in December 2015 by Elon Musk, Sam Altman, Greg Brockman, Ilya Sutskever, Wojciech Zaremba, and others, with the mission to ensure that artificial general intelligence (AGI) benefits all of humanity. Its goal is to develop safe and beneficial AI while making its access widespread, focusing on advancing AGI responsibly and in ways that benefit society. OpenAI is known for developing innovative technologies such as the GPT (Generative Pre-trained Transformer) series, which can generate human-like text for applications in writing, coding, customer support, education, and more. It also created DALL·E for generating images from textual descriptions, Codex for understanding and generating programming code, and Whisper for automatic speech recognition. Initially a nonprofit organization, OpenAI transitioned to a "capped-profit" model to secure fundin

In [None]:
#Splitting the data
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)

In [None]:
text=text_splitter.split_documents(documents)
text

[Document(metadata={'source': '/content/OpenAI.txt'}, page_content='OpenAI is an artificial intelligence research and deployment company founded in December 2015 by Elon Musk, Sam Altman, Greg Brockman, Ilya Sutskever, Wojciech Zaremba, and others, with the mission to ensure that artificial general intelligence (AGI) benefits all of humanity. Its goal is to develop safe and beneficial AI while making its access widespread, focusing on advancing AGI responsibly and in ways that benefit society. OpenAI is known for developing innovative technologies such as the GPT (Generative Pre-trained Transformer) series, which can generate human-like text for applications in writing, coding, customer support, education, and more. It also created DALL·E for generating images from textual descriptions, Codex for understanding and generating programming code, and Whisper for automatic speech recognition. Initially a nonprofit organization, OpenAI transitioned to a "capped-profit" model to secure fundin

In [None]:
#Embedding
embeddings = HuggingFaceEmbeddings()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
#DB Setup
from pinecone import Pinecone, ServerlessSpec

In [None]:
##Building connection with DB
pc = Pinecone(api_key='pcsk_3ViMpX_5Nzyz3WSo9NqdvUdkGrBA8DZQgoYW56Uuyg7jqgLBEsB39aAzgBzC276oeJsTqy', environment="us-east-1")

In [None]:
#setting up server
cloud = 'aws'
region = 'us-east-1'
server = ServerlessSpec(cloud=cloud,region=region)

In [None]:
#Database Creation
name_of_database = 'rag-app-2'

In [None]:
# checking whether there is a existing database with the name
if name_of_database not in pc.list_indexes().names():
  # We can create our database
  pc.create_index(
      name = name_of_database,
      dimension = 768,
      metric = "cosine",
      spec = server,
  )


  # taking a confirmation that our database is created or not
  while not pc.describe_index(name_of_database).status["ready"]:
    time.sleep(1)

print(pc.Index(name_of_database).describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


pc.describe_index(name_of_database): Retrieves details about the specified database.

.status["ready"]: Checks whether the database is ready for use.

while not: This loop ensures that the script waits until the database is fully initialized and ready.

time.sleep(1): Introduces a delay of 1 second between checks to avoid overwhelming the server with frequent requests.

In [None]:
#Adding the data into our vectorStore
from langchain.vectorstores import Pinecone

In [None]:
os.environ["PINECONE_API_KEY"]=userdata.get("Pinecone_Token")

In [None]:
if name_of_database not in pc.list_indexes():
  textsearch = Pinecone.from_documents(text,embeddings,index_name=name_of_database)
else:
  textsearch = Pinecone.from_existing_index(name_of_database,embeddings,pinecone_index=pc.Index(name_of_database))

Model Setup

In [None]:
#https://huggingface.co/openai-community/gpt2

In [None]:
repo = "openai-community/gpt2"

In [None]:
from langchain.llms import HuggingFaceHub

In [None]:
#Model Initialize
model = HuggingFaceHub(repo_id=repo,huggingfacehub_api_token=userdata.get("MyToken1"))

In [None]:
model

HuggingFaceHub(client=<InferenceClient(model='openai-community/gpt2', timeout=None)>, repo_id='openai-community/gpt2', task='text-generation', huggingfacehub_api_token='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')

In [None]:
#Prompting
from langchain.prompts import PromptTemplate

In [None]:
template = """You are a helpful assistant. Answer the following question concisely:
Context: {context}
Questions: {question}
Helpful Answer:"""

In [None]:
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [None]:
#Chaining
chain=RetrievalQA.from_chain_type(llm=model,
                                retriever=textsearch.as_retriever())

In [None]:
chain

RetrievalQA(verbose=False, combine_documents_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=HuggingFaceHub(client=<InferenceClient(model='openai-community/gpt2', timeout=None)>, repo_id='openai-community/gpt2', task='text-generation', huggingfacehub_api_token='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'), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='{page_content}'), document_variable_name='context'), retriever=VectorStoreRetriever(tags=['Pinecone', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores

In [None]:
query = "What is OpenAI?"

In [None]:
results=chain.run(query)
results

'Use the following pieces of context to answer the question at the end. If you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nOpenAI is an artificial intelligence research and deployment company founded in December 2015 by Elon Musk, Sam Altman, Greg Brockman, Ilya Sutskever, Wojciech Zaremba, and others, with the mission to ensure that artificial general intelligence (AGI) benefits all of humanity. Its goal is to develop safe and beneficial AI while making its access widespread, focusing on advancing AGI responsibly and in ways that benefit society. OpenAI is known for developing innovative technologies such as the GPT (Generative Pre-trained Transformer) series, which can generate human-like text for applications in writing, coding, customer support, education, and more. It also created DALL·E for generating images from textual descriptions, Codex for understanding and generating programming code, and Whisper for automatic speech recognitio