In [1]:
import pandas as pd
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from dotenv import dotenv_values
import sys
sys.path.insert(0,'/workspaces/LLM_research_assistant/src')
from utils.utils import load_yaml, init_argument_parser, sanitize_output, fill_default_parameters
from langchain.prompts import (
    ChatPromptTemplate, PromptTemplate
)
from utils.openai_utils import is_openai_model, build_chat_model
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
import random
import numpy as np
from functools import partial
from typing import List
from langchain.embeddings import OpenAIEmbeddings


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders import WebBaseLoader
from utils.custom_grobid_parser import CustomGrobidParser
from langchain.docstore.document import Document
from langchain_core.embeddings import Embeddings
import bs4
from langchain_core.runnables import RunnablePassthrough

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain.vectorstores import VectorStore,Chroma


In [3]:
def format_docs(docs):
    """Join the ``page_content`` of every document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents in iteration order, separated by a blank line
        (``"\\n\\n"``); an empty string when ``docs`` yields nothing.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [4]:
# Fix the seed of both global random generators used in this notebook.
seed = 156
random.seed(seed)      # Python's built-in generator
np.random.seed(seed)   # NumPy's legacy global generator

In [5]:
# Chat model name handed to ChatOpenAI.
model_name = "gpt-3.5-turbo-0613"

# Input locations (relative paths, resolved against the current working directory).
papers_folder = "../data/papers/"
task_file = "../data/tasks/pcir.txt"
template_file = "../data/templates/write_paragraph_template.txt"
rag_template_file = "../data/rag_templates/basic_rag_suffix.txt"

In [6]:
# Settings read through python-dotenv; env['OPENAI_API_KEY'] is looked up from this mapping further down.
env = dotenv_values()

In [7]:
# Base system prompt, shared by the plain chain and the retrieval-augmented one.
with open(template_file) as f:
    template = f.read()
# RAG system prompt = base template, a newline, then the RAG suffix file.
with open(rag_template_file) as f:
    template_rag = template + "\n" + f.read()

# The question to answer; it is substituted for "{input}" in the prompts.
prompt_parameters = dict()
with open(task_file) as f:
    prompt_parameters["input"] = f.read()

# Build the chat model once. The original cell repeated these three statements
# verbatim a second time, constructing an identical client twice.
use_openai_api = is_openai_model(model_name)
openai_key = env['OPENAI_API_KEY']  # raises KeyError if the key is absent from the dotenv mapping
model = ChatOpenAI(temperature=0, openai_api_key=openai_key, model=model_name)

In [18]:
def build_scientific_papers_loader(papers_folder:str)->List[Document]:
    """Load the PDF papers stored in ``papers_folder`` as LangChain documents.

    Files are selected with glob ``"*"`` restricted to the ``.pdf`` suffix and
    parsed with ``CustomGrobidParser(segment_sentences=False)``.

    Note: despite its name, this function returns the loaded documents,
    not the loader object.

    Args:
        papers_folder: Directory that contains the PDF files.

    Returns:
        The documents produced by the loader's ``load()`` call.
    """
    pdf_parser = CustomGrobidParser(segment_sentences=False)
    papers_loader = GenericLoader.from_filesystem(
        papers_folder, glob="*", suffixes=[".pdf"], parser=pdf_parser
    )
    return papers_loader.load()

In [9]:
def build_documents_retriever(docs:List[Document],
                              embeddings:Embeddings,
                              chunk_size:int=3500,
                              chunk_overlap:int=500):
    """Split ``docs`` into chunks, index them in Chroma and return a retriever.

    Args:
        docs: Documents to index.
        embeddings: Embedding model used to build the Chroma store.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``as_retriever()`` on the newly built Chroma store.
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)
    # No persist_directory is passed here, unlike the Chroma(...) call later in the notebook.
    return Chroma.from_documents(documents=chunks, embedding=embeddings).as_retriever()

In [10]:
# Parse the PDFs in papers_folder, then chunk and index them for retrieval.
docs = build_scientific_papers_loader(papers_folder)
# Pass the API key explicitly, as is done for ChatOpenAI: dotenv_values() only
# returns a mapping and does not export anything to os.environ, so a bare
# OpenAIEmbeddings() would depend on the variable being set some other way.
retriever = build_documents_retriever(
    docs,
    embeddings=OpenAIEmbeddings(openai_api_key=openai_key),
)

KeyboardInterrupt: 

In [19]:
# Show the question that was read from task_file.
prompt_parameters["input"]

'What are the main components and they key innovations of Progressive Contextual Instance Refinement (PCIR)?'

In [61]:
# Fetch the 22 most similar chunks for the question.
# `k` has to reach the vector store's search call: the vector-store retriever's
# get_relevant_documents() does not forward extra keyword arguments, so the
# original `k=22` was silently dropped and the default result count was used.
retriever.vectorstore.similarity_search(prompt_parameters["input"], k=22)

[Document(page_content='Combining DCIR with PPSP formulates a progressive contextual instance refinement (PCIR) process.', metadata={'bboxes': "[[{'page': '2', 'x': '321.96', 'y': '334.52', 'h': '241.38', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '346.40', 'h': '200.51', 'w': '8.76'}], [{'page': '2', 'x': '518.06', 'y': '346.40', 'h': '45.03', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '358.40', 'h': '250.99', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '370.28', 'h': '251.31', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '382.28', 'h': '216.93', 'w': '8.76'}], [{'page': '2', 'x': '533.16', 'y': '382.28', 'h': '30.08', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '394.28', 'h': '251.12', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '406.16', 'h': '251.01', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '418.16', 'h': '251.11', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '430.16', 'h': '23.10', 'w': '8.76'}], [{'page': '2', 'x': '338.37', 'y': '430.16', 'h':

In [62]:
# Show the base system prompt that was read from template_file.
template

'The user will provide one or more questions related to a scientific argument. \nYou are a paper writing assistant that writes a paragraph that answers to the questions.\nNotice that the paragraph will be part of a scientific paper, so the writing style must be technical and formal, with the insertion of appropriate citations.\n\n'

In [63]:
# Baseline prompt: the writing instructions as system message, the question as human message.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        ("human", "{input}"),
    ]
)


In [64]:
# Baseline pipeline (no retrieval): prompt -> chat model -> plain string.
chain = (
    prompt
    | model
    | StrOutputParser()
)

In [65]:
# Render the baseline prompt with the question filled in, to inspect the resulting messages.
print(prompt.format(**prompt_parameters))


System: The user will provide one or more questions related to a scientific argument. 
You are a paper writing assistant that writes a paragraph that answers to the questions.
Notice that the paragraph will be part of a scientific paper, so the writing style must be technical and formal, with the insertion of appropriate citations.


Human: What are the main components and they key innovations of Progressive Contextual Instance Refinement (PCIR)?


In [66]:
# Baseline answer: the model only sees the system prompt and the question, no retrieved context.
response = chain.invoke(prompt_parameters)


In [67]:
# Show the baseline (no retrieval) answer.
response

'Progressive Contextual Instance Refinement (PCIR) is a novel approach in the field of computer vision that aims to improve the accuracy of object detection and instance segmentation tasks. The main components of PCIR include a backbone network, a contextual module, and an instance refinement module. The backbone network is responsible for extracting high-level features from the input image. The contextual module incorporates contextual information by aggregating features from different levels of the backbone network. This allows the model to capture both local and global context, leading to more accurate predictions. The instance refinement module further refines the predicted instances by iteratively updating the instance masks based on the contextual information. This iterative refinement process helps to improve the segmentation accuracy by reducing false positives and refining the boundaries of the predicted instances. The key innovation of PCIR lies in its progressive refinement 

In [42]:
#prompt_parameters["context"] = retriever | format_docs

In [43]:
# RAG prompt: same message layout as the baseline, but the system message uses template_rag.
prompt_rag = ChatPromptTemplate.from_messages(
    [
        ("system", template_rag),
        ("human", "{input}"),
    ]
)


In [44]:
# RAG pipeline: the incoming question is passed through as "input" and is also
# sent to the retriever, whose documents are joined into "context".
chain_rag = (
    {"context": retriever | format_docs, "input": RunnablePassthrough()}
    | prompt_rag
    | model
    | StrOutputParser()
)

In [2]:
# Retrieve the chunks for the question and join them into one context string.
# Needs `retriever` from an earlier cell; the NameError recorded below means the
# kernel had not run that cell when this one was executed.
context = (retriever | format_docs).invoke(prompt_parameters['input'])
context

NameError: name 'retriever' is not defined

In [1]:
# Store the retrieved text so prompt_rag.format(**prompt_parameters) has a 'context' value.
# NOTE(review): this mutates the shared prompt_parameters dict in place.
prompt_parameters['context'] = context

NameError: name 'context' is not defined

In [68]:
# Render the RAG prompt for inspection. Requires prompt_parameters['context'];
# the KeyError recorded below shows that key was missing when this cell ran.
print(prompt_rag.format(**prompt_parameters))


KeyError: 'context'

In [47]:
# Retrieval-augmented answer: chain_rag takes the bare question string and
# derives both "context" and "input" from it.
response_rag = chain_rag.invoke(prompt_parameters['input'])


In [48]:
# Show the retrieval-augmented answer.
response_rag

'The main components of Progressive Contextual Instance Refinement (PCIR) are the DCIR module and the PPSP module. The DCIR module is responsible for mining all potential instances existing in the images, while the PPSP module removes noisy negative samples and maintains the diversity of the samples. These two modules work together to formulate the PCIR process, which is a progressive contextual instance refinement approach. The key innovation of PCIR lies in its ability to progressively refine the instance predictions by iteratively updating the instance proposals and removing false positives. This iterative refinement process improves the accuracy and reliability of object detection in remote sensing images. [1]\n\n[1] Progressive Contextual Instance Refinement for Weakly Supervised Object Detection in Remote Sensing Images.'

In [10]:
# Open the Chroma store kept under persist_directory.
# The API key is passed explicitly, as is done for ChatOpenAI: dotenv_values()
# only returns a mapping and does not export anything to os.environ.
# NOTE(review): the absolute path ties this cell to one machine layout; the
# other data paths in this notebook are relative ("../data/...").
db = Chroma(
    persist_directory='/workspaces/LLM_research_assistant/data/db/chroma',
    embedding_function=OpenAIEmbeddings(openai_api_key=openai_key),
)


In [11]:
# Sanity check: query the persisted store directly with the question.
db.similarity_search(prompt_parameters['input'])

[Document(page_content='Combining DCIR with PPSP formulates a progressive contextual instance refinement (PCIR) process.', metadata={'bboxes': "[[{'page': '2', 'x': '321.96', 'y': '334.52', 'h': '241.38', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '346.40', 'h': '200.51', 'w': '8.76'}], [{'page': '2', 'x': '518.06', 'y': '346.40', 'h': '45.03', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '358.40', 'h': '250.99', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '370.28', 'h': '251.31', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '382.28', 'h': '216.93', 'w': '8.76'}], [{'page': '2', 'x': '533.16', 'y': '382.28', 'h': '30.08', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '394.28', 'h': '251.12', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '406.16', 'h': '251.01', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '418.16', 'h': '251.11', 'w': '8.76'}, {'page': '2', 'x': '312.00', 'y': '430.16', 'h': '23.10', 'w': '8.76'}], [{'page': '2', 'x': '338.37', 'y': '430.16', 'h':

In [21]:
# NOTE(review): rebinds `retriever` to the persisted store. A chain_rag built
# before this cell keeps the retriever object it was constructed with and does
# not pick up this one.
retriever = db.as_retriever()


In [22]:
# Retrieve the chunks for the question with the current `retriever` and join them into one string.
context = (retriever | format_docs).invoke(prompt_parameters['input'])


In [23]:
# Show the joined context.
# NOTE(review): the recorded output is an empty string, i.e. no text came back — worth investigating.
context

''