#Research Assistant

.

In [None]:
!pip install openai
!pip install Langchain
!pip install tiktoken
!pip install docarray
!pip install pypdf
!pip install faiss-gpu

.

#1. Import required libraries.

In [2]:
import urllib.request
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.chat_models import ChatOpenAI
import os
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain, LLMChain
import warnings
warnings.filterwarnings('ignore')

.

#2. Functions to read flat files & pdf files.

In [3]:
def read_flat_file(file_name):
  with open(file_name) as f:
    file_content = f.read()
  return file_content

def read_pdf_file(file_name, chunk_size=1500, chunk_overlap=150):
  loader = PyPDFLoader(file_path=file_name)
  pages = loader.load()
  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  splits = splitter.split_documents(pages)
  return splits

.

#3. Function to download PDF file.

In [4]:
def download_and_read_pdf_file(file_url):
  file_name = file_url.split("/")[-1]
  data = urllib.request.urlretrieve(file_url, file_name)
  splits = read_pdf_file(file_name, chunk_size=1500, chunk_overlap=150)
  return splits

.

#4. Load OpenAI keys, instantiate client objects & load embeddings.

In [5]:
os.environ["OPENAI_API_KEY"] = read_flat_file("openai_key")

chatgpt3_5_turbo = ChatOpenAI(model="gpt-3.5-turbo")
chatgpt3_5_turbo_instruct = ChatOpenAI(model="gpt-3.5-turbo-instruct")
chatgpt4 = ChatOpenAI(model="gpt-4")

embedding = OpenAIEmbeddings(model="text-embedding-3-large")

.

#5. Function to create Vector Database from document splits.

In [6]:
def create_vectordb_from_document(document_splits, embedding):
  return  FAISS.from_documents(document_splits, embedding=embedding)

.

#6. Function to get Output Parsers.

In [7]:
sizes = {
    "abstract": 50,
    "introduction": 300,
    "methodology": 500,
    "results": 200,
    "conclusion": 100
}

def get_component_output_parser(component):
  summary = ResponseSchema(name=f"{component}", description=f"Very brief summary of the {component} in less than {sizes[component]} words.")
  return StructuredOutputParser.from_response_schemas([summary])

def get_summary_output_parser():
  summary = ResponseSchema(name=f"summary", description=f"Summary of the research paper in 1000 words.")
  return StructuredOutputParser.from_response_schemas([summary])

def get_qa_output_parser():
  answer = ResponseSchema(name=f"answer", description=f"Answer to the question asked in only one sentence.")
  return StructuredOutputParser.from_response_schemas([answer])

.

#7. Function to get prompt templates.

In [8]:
def get_component_prompt_template():
  prompt_template_text = """
  You will be provided with {component} part of a research paper enclosed within {delimiter} delimiter.
  Please provide a summary of this {component}.
  Please make sure that the summary is concise and to the point.

  <Note>
  The {component} part of the research paper can have some information unrealated to the {component}. You must ignore it.

  <Abstract>

  {delimiter}{context}{delimiter}

  {instructions}

  """
  return PromptTemplate(template=prompt_template_text, input_variables=["context", "delimiter", "instructions", "component"])

def get_qa_prompt_template():
  prompt_template_text = """
  You will be provided with a question (delimited with {question_delimiter}) pertaining to a research paper.
  You will also be provided with a relevant context (delimited with {context_delimiter}) extracted from a research paper.

  Please answer the question keeping only the context in mind.
  Answer must be one sentence long.

  <Question>

  {question_delimiter}{question}{question_delimiter}

  <Context>

  {context_delimiter}{context}{context_delimiter}

  {instructions}

  Let me remind again. Answer must be one sentence long.

  """
  return PromptTemplate(template=prompt_template_text, input_variables=["context", "delimiter", "instructions", "component"])

def get_summary_prompt_template():
  prompt_template_text = """
  You will be provided with details of {components_list} from a research paper enclosed within {delimiter} delimiter.
  Please provide a combined summary from it.
  Please make sure that the summary is concise and to the point.

  {component_summary}

  {instructions}

  """
  return PromptTemplate(template=prompt_template_text, input_variables=["component_summary", "delimiter", "instructions", "components_list"])

.

#8. Functions to get QA & LLM chains.

In [9]:
def get_qa_chain(chat_client, prompt):
  return load_qa_chain(llm=chat_client, chain_type="stuff", prompt=prompt)

def get_llm_chain(chat_client, prompt):
  return LLMChain(llm=chat_client, prompt=prompt)

.

#9. Functions to run QA & LLM chains.

In [10]:
queries = {
    "abstract": "abstract",
    "introduction": "Extract the introduction section discussing background information and research objectives.",
    "methodology": "Get the methodology section detailing experimental design, data collection, and analysis techniques.",
    "results": "results",
    "conclusion": "conclusion"
}

def run_component_qa_chain(component, chat_client, vectordb):
  query = "What are the skills and educational qualifications of the candidate?"
  prompt = get_component_prompt_template()
  output_parser = get_component_output_parser(component)
  instructions = output_parser.get_format_instructions()
  context = vectordb.similarity_search(query, k=1)
  chain = get_qa_chain(chat_client, prompt)
  prompt_inputs = {"input_documents": context, "delimiter": "###", "instructions": instructions,
                   "component": component, "words": sizes[component]}
  response = chain(prompt_inputs, return_only_outputs=True)
  response_dict = output_parser.parse(response["output_text"])
  return response_dict

def run_summary_llm_chain(chat_client, component_summary, components_list):
  prompt = get_summary_prompt_template()
  output_parser = get_summary_output_parser()
  instructions = output_parser.get_format_instructions()
  chain = get_llm_chain(chat_client, prompt)
  response = chain({"component_summary": component_summary, "delimiter": "###", "instructions": instructions,
                   "components_list": ", ".join(components_list)}, return_only_outputs=True)
  response_dict = output_parser.parse(response['text'])
  return response_dict

def run_qa_chain(question, chat_client, vectordb):
  query = question
  prompt = get_qa_prompt_template()
  output_parser = get_qa_output_parser()
  instructions = output_parser.get_format_instructions()
  context = vectordb.similarity_search(query, k=1)
  chain = get_qa_chain(chat_client, prompt)
  prompt_inputs = {"input_documents": context, "context_delimiter": "###", "question_delimiter": "$$$",
                   "instructions": instructions, "question": question}
  response = chain(prompt_inputs, return_only_outputs=True)
  response_dict = output_parser.parse(response["output_text"])
  return response_dict

.

#10. Function to get Research Paper Summary.

In [11]:
def get_summary(paper_url):
  paper = download_and_read_pdf_file(paper_url)
  vectordb = create_vectordb_from_document(paper, embedding)
  components = ["abstract", "introduction", "methodology", "results", "conclusion"]
  summary = ""
  for component in components:
    response = run_component_qa_chain(component, chatgpt3_5_turbo, vectordb)
    summary += "\n\n\n\n<" + component + ">\n\n"
    summary += "###" + response[component] + "###"
  response = run_summary_llm_chain(chat_client=chatgpt3_5_turbo, component_summary=summary, components_list=components)
  return response['summary'].replace(".", ".\n")

.

#11. Function to ask questions about research paper.

In [12]:
def get_answer(paper_url, question, llm=chatgpt3_5_turbo):
  paper = download_and_read_pdf_file(paper_url)
  vectordb = create_vectordb_from_document(paper, embedding)
  response = run_qa_chain(question, llm, vectordb)
  return response['answer']

.

#12. Testing the summary application

In [13]:
response = get_summary(paper_url="https://arxiv.org/pdf/1301.3781.pdf")
print("Summary:\n")
print(response)

Summary:

The research paper presents a comprehensive test set consisting of five types of semantic questions and nine types of syntactic questions.
 The test set includes examples such as word pairs related to capital cities, currencies, city-state relationships, and word forms.
 The quality of word vectors is measured using this test set, which contains a total of 8869 semantic questions and 10675 syntactic questions.



In [14]:
response = get_summary(paper_url="https://arxiv.org/pdf/2402.06525.pdf")
print("Summary:\n")
print(response)

Summary:

The research paper discusses the use of flexible infinite-width graph convolutional networks (GCNs) for representation learning and the importance of finding suitable hyperparameters for these networks.
 The authors propose using Gram layers and the ReLU kernel non-linearity instead of linear layers and ReLU non-linearities in GCN architectures.
 They also discuss the use of global kernel norm instead of batch normalization.
 The authors conducted grid searches to determine suitable hyperparameters for the GCN and graph convolutional DKM models.
 They experimented with different settings such as hidden units, dropout, batch normalization, row normalization, regularization, number of layers, inducing points, and Gram matrix initializations.
 The paper highlights the importance of using random initialization for certain datasets to ensure numerical stability.
 Overall, the research paper emphasizes the significance of representation learning in graph convolutional networks.



.

#13. Testing the question answer application

In [15]:
response = get_answer(paper_url="https://arxiv.org/pdf/1301.3781.pdf",
                     question="Who are the authors of this paper?",
                     llm=chatgpt3_5_turbo)
print("Answer:\n")
print(response)

Answer:

Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean


In [16]:
response = get_answer(paper_url="https://arxiv.org/pdf/1301.3781.pdf",
                     question="What did they accomplish in this paper?",
                     llm=chatgpt3_5_turbo)
print("Answer:\n")
print(response)

Answer:

In this paper, they studied the quality of word vectors derived by various models on a collection of syntactic and semantic language tasks.


In [17]:
response = get_answer(paper_url="https://arxiv.org/pdf/2402.06525.pdf",
                     question="Who are the authors of this paper?")
print(response)

LeCun, Y ., Bengio, Y ., and Hinton, G.


In [18]:
response = get_answer(paper_url="https://arxiv.org/pdf/2402.06525.pdf",
                     question="What did they accomplish in this paper?")
print(response)

This paper aims to advance the field of machine learning and understanding of representation learning in graphs.
