### Objective

In this notebook, we follow the LangChain tutorial on `ConversationalRetrievalChain`.

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyMuPDFLoader
from langchain.vectorstores import FAISS
from langchain.prompts import (
    ChatPromptTemplate, 
    MessagesPlaceholder, 
    SystemMessagePromptTemplate, 
    HumanMessagePromptTemplate
)
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import ConversationChain
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.memory import ConversationBufferMemory
import openai
import os

### 1. Create embeddings

In [None]:
# Select the paper to work on: keep exactly one `paper_name` assignment active.
# paper_name = 'Physics-Informed Neural Operator for Learning Partial Differential Equations'
paper_name = 'Learning the solution operator of parametric partial differential equations with physics-informed DeepOnets'
# paper_name = 'Sex-specific and opposed effects of FKBP51 in glutamatergic and GABAergic neurons_Implications for stress susceptibility and resilience'
# Load the PDF from ./Papers/; the loaded entries expose `.page_content` and `.metadata`,
# which the following cells inspect.
loader = PyMuPDFLoader("./Papers/"+paper_name+".pdf")
documents = loader.load()
# Path under which the FAISS index for this paper is saved / reloaded in the embeddings cell.
vectorstore_path = "./"+paper_name

In [None]:
documents[1].metadata

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=100).
# NOTE(review): `text` is only used by the inspection cells that follow; the embeddings
# cell further down repeats this split and rebinds `documents` instead of reusing `text`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
text = text_splitter.split_documents(documents)

In [None]:
len(text)

In [None]:
documents[0]

In [None]:
text_splitter.split_text(text[0].page_content)

In [None]:
text[0]

In [None]:
documents[0]

In [None]:
# NOTE(review): `source` is first assigned much later, in the integration test
# (`answer, source = author.step(question)`), so this cell raises NameError on a
# fresh kernel. It only works after that later cell has been executed.
phrases = [src.page_content for src in source]
phrases[0]

In [None]:
# OpenAI settings
# Module-level configuration of the `openai` package for an Azure OpenAI resource.
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
openai.api_base = "https://abb-chcrc.openai.azure.com/"  # Your Azure OpenAI resource's endpoint value.
# os.getenv returns None when OPENAI_API_KEY is unset; the chat-model cells below use
# os.environ["OPENAI_API_KEY"], which raises KeyError instead.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
# Split again with the same splitter settings as the earlier split cell and rebind
# `documents` to the resulting chunks.
# NOTE(review): from here on `documents` no longer holds what `loader.load()` returned.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
documents = text_splitter.split_documents(documents)

# Embedding client pointed at the Azure "text-embedding-ada-002" deployment.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", 
                              deployment="text-embedding-ada-002",
                              openai_api_base="https://abb-chcrc.openai.azure.com/",
                              openai_api_type="azure",
                              chunk_size=1)

# Cache guard: build and save the FAISS index only when nothing exists at
# `vectorstore_path`; otherwise reload the saved index.
# if not os.path.exists(vectorstore_path) and os.path.isdir(vectorstore_path):
if not os.path.exists(vectorstore_path):
    print("Embeddings not found! Creating new ones")
    vectorstore = FAISS.from_documents(documents, embeddings)
    vectorstore.save_local(vectorstore_path)
else:
    print("Embeddings found! Loaded the computed ones")
    vectorstore = FAISS.load_local(vectorstore_path, embeddings)

In [None]:
len(documents)

### 2. Summary of the paper

In [None]:
%%time
from langchain.chains.summarize import load_summarize_chain
# Azure-hosted chat model used for the summary. os.environ[...] raises KeyError when
# OPENAI_API_KEY is not set.
llm = AzureChatOpenAI(openai_api_base="https://abb-chcrc.openai.azure.com/",
                    openai_api_version="2023-03-15-preview",
                    openai_api_key=os.environ["OPENAI_API_KEY"],
                    openai_api_type="azure",
                    deployment_name="gpt-35-turbo-0301",
                    temperature=0.7)

# Summarise with the "stuff" chain type, using only the first two entries of `documents`.
# NOTE(review): `documents` was rebound to the split chunks in the embeddings cell, so
# these are presumably the first two chunks rather than the first two pages — confirm.
chain = load_summarize_chain(llm, chain_type="stuff")
summary = chain.run(documents[:2])
print(summary)

**For arXiv papers**

In [None]:
from langchain.utilities import ArxivAPIWrapper

# Retrieve paper metadata
# NOTE(review): this rebinds `summary`, replacing the LLM-generated summary produced
# by the previous section.
paper_arxiv_id = '2103.10974'
arxiv = ArxivAPIWrapper()
summary = arxiv.run(paper_arxiv_id)

# String manipulation
# Curly braces are swapped for parentheses because `summary` is later passed as the
# journalist's abstract and interpolated into a prompt built with `from_template`;
# presumably stray braces would be parsed there as template variables.
summary = summary.replace('{', '(').replace('}', ')')

### 3. Journalist bot

In [None]:
class JournalistBot:
    """Journalist chatbot that interviews the paper's author, created with LangChain.

    Usage: construct with an engine name, call `instruct` once to set the topic
    and the paper abstract, then call `step` for every turn of the interview.
    """

    def __init__(self, engine):
        """Select backbone large language model, as well as instantiate
        the memory for creating language chain in LangChain.

        Args:
        --------------
        engine: the backbone llm-based chat model.
                "OpenAI" stands for OpenAI chat model;
                "Azure" stands for the Azure OpenAI chat deployment;
                Other chat models are also possible in LangChain,
                see https://python.langchain.com/en/latest/modules/models/chat/integrations.html

        Raises:
        --------------
        KeyError: if `engine` is neither "OpenAI" nor "Azure".
        """

        # Instantiate llm
        if engine == 'OpenAI':
            self.llm = ChatOpenAI(
                model_name="gpt-3.5-turbo",
                temperature=0.8
            )

        elif engine == 'Azure':
            self.llm = AzureChatOpenAI(
                openai_api_base="https://abb-chcrc.openai.azure.com/",
                openai_api_version="2023-03-15-preview",
                openai_api_key=os.environ["OPENAI_API_KEY"],
                openai_api_type="azure",
                deployment_name="gpt-35-turbo-0301",
                temperature=0.8)

        else:
            raise KeyError("Currently unsupported chat model type!")

        # Instantiate memory; its default memory key is the "history" placeholder
        # used by the prompt built in `instruct`.
        self.memory = ConversationBufferMemory(return_messages=True)


    def instruct(self, topic, abstract):
        """Determine the context of chatbot interaction.

        Stores the topic and abstract, builds the chat prompt (system message,
        conversation history, latest human input) and creates the conversation
        chain used by `step`.

        Args:
        -----------
        topic: subject area of the paper, inserted into the system message.
        abstract: summary/abstract of the paper, appended to the system message.
        """

        self.topic = topic
        self.abstract = abstract

        # Define prompt template
        prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(self._specify_system_message()),
            MessagesPlaceholder(variable_name="history"),
            HumanMessagePromptTemplate.from_template("""{input}""")
        ])

        # Create conversation chain
        self.conversation = ConversationChain(memory=self.memory, prompt=prompt,
                                              llm=self.llm, verbose=False)


    def step(self, prompt):
        """Feed one message to the journalist and return its next question.

        Args:
        -----------
        prompt: the text the journalist should react to (e.g. the author's last answer).

        Outputs:
        --------
        response: the journalist's reply as returned by the conversation chain.
        """
        response = self.conversation.predict(input=prompt)

        return response


    @staticmethod
    def _escape_braces(text):
        """Double curly braces so `text` survives prompt-template formatting verbatim."""
        return str(text).replace('{', '{{').replace('}', '}}')


    def _specify_system_message(self):
        """Specify the behavior of the journalist chatbot.

        The returned string is parsed as a prompt template, so curly braces coming
        from the topic or the abstract are escaped; otherwise they would be treated
        as template variables.

        Outputs:
        --------
        prompt: instructions for the chatbot.
        """

        topic = self._escape_braces(self.topic)
        abstract = self._escape_braces(self.abstract)

        # Compile bot instructions
        prompt = f"""You are a technical journalist interested in {topic}, 
        Your task is to distill a recently published scientific paper on this topic through
        an interview with the author, which is played by another chatbot.
        Your objective is to ask comprehensive and technical questions 
        so that anyone who reads the interview can understand the paper's main ideas and contributions, 
        even without reading the paper itself. 
        You're provided with the paper's summary to guide your initial questions.
        You must keep the following guidelines in mind:
        - Focus exclusive on the technical content of the paper.
        - Avoid general questions about {topic}, focusing instead on specifics related to the paper.
        - Only ask one question at a time.
        - Feel free to ask about the study's purpose, methods, results, and significance, 
        and clarify any technical terms or complex concepts. 
        - Your goal is to lead the conversation towards a clear and engaging summary.
        - Do not include any prefixed labels like "Interviewer:" or "Question:" in your question.
        
        [Abstract]: {abstract}"""


        return prompt

### 4. Author bot

In [None]:
class AuthorBot:
    """Author chatbot that answers questions about a paper, created with LangChain.

    Answers are generated by a retrieval chain over the supplied vector store.
    Usage: construct with an engine name and a vector store, call `instruct`
    once to set the topic, then call `step` for every question.
    """

    def __init__(self, engine, vectorstore):
        """Select backbone large language model, as well as instantiate
        the chat history and keep the embedding index.

        Args:
        --------------
        engine: the backbone llm-based chat model.
                "OpenAI" stands for OpenAI chat model;
                "Azure" stands for the Azure OpenAI chat deployment;
                Other chat models are also possible in LangChain,
                see https://python.langchain.com/en/latest/modules/models/chat/integrations.html
        vectorstore: embedding index of the paper; `as_retriever` is called on it
                in `instruct`.

        Raises:
        --------------
        KeyError: if `engine` is neither "OpenAI" nor "Azure".
        """

        # Instantiate llm
        if engine == 'OpenAI':
            self.llm = ChatOpenAI(
                model_name="gpt-3.5-turbo",
                temperature=0.6
            )

        elif engine == 'Azure':
            self.llm = AzureChatOpenAI(
                openai_api_base="https://abb-chcrc.openai.azure.com/",
                openai_api_version="2023-03-15-preview",
                openai_api_key=os.environ["OPENAI_API_KEY"],
                openai_api_type="azure",
                deployment_name="gpt-35-turbo-0301",
                temperature=0.6)

        else:
            raise KeyError("Currently unsupported chat model type!")

        # The history is kept as a plain list of (question, answer) tuples and is
        # passed explicitly to the chain on every call (see `step`).
        self.chat_history = []

        # Instantiate embedding index
        self.vectorstore = vectorstore



    def instruct(self, topic):
        """Determine the context of chatbot interaction.

        Stores the topic and builds the retrieval QA chain. The system message
        comes from `_specify_system_message`, so the author persona and the
        answering guidelines defined there are the ones actually sent to the model.

        Args:
        -----------
        topic: subject area of the paper, inserted into the system message.
        """
        # Specify topic
        self.topic = topic

        # Define prompt template: author instructions (with the {context} slot for
        # retrieved passages) followed by the incoming question.
        qa_prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(self._specify_system_message()),
            HumanMessagePromptTemplate.from_template("{question}")
        ])

        # Create conversation chain
        self.conversation_qa = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=self.vectorstore.as_retriever(search_kwargs={"k": 3}),
            chain_type="stuff",
            return_source_documents=True,
            combine_docs_chain_kwargs={'prompt': qa_prompt})



    def step(self, prompt):
        """Answer one question and record the exchange in the chat history.

        Args:
        -----------
        prompt: the question to answer.

        Outputs:
        --------
        A tuple `(answer, source_documents)` taken from the chain's response.
        """
        response = self.conversation_qa({"question": prompt, "chat_history": self.chat_history})
        self.chat_history.append((prompt, response["answer"]))

        return response["answer"], response["source_documents"]



    def _specify_system_message(self):
        """Specify the behavior of the author chatbot.

        The returned string is parsed as a prompt template: it ends with the
        `{context}` placeholder, and curly braces coming from the topic are
        escaped so they are not mistaken for template variables.

        Outputs:
        --------
        prompt: instructions for the chatbot.
        """

        topic = str(self.topic).replace('{', '{{').replace('}', '}}')

        # Compile bot instructions
        prompt = f"""You are the author of a recently published scientific paper on {topic}.
        You are being interviewed by a technical journalist who is played by another chatbot and
        looking to write an article to summarize your paper.
        Your task is to provide comprehensive, clear, and accurate answers to the journalist's questions.
        Please keep the following guidelines in mind:
        - Try to explain complex concepts and technical terms in an understandable way, without sacrificing accuracy.
        - Your responses should primarily come from the relevant content of this paper, 
        which will be provided to you in the following, but you can also use your broad knowledge in {topic} to 
        provide context or clarify complex topics. 
        - Remember to differentiate when you are providing information directly from the paper versus 
        when you're giving additional context or interpretation. Use phrases like 'According to the paper...' for direct information, 
        and 'Based on general knowledge in the field...' when you're providing additional context.
        - Only answer one question at a time. Ensure that each answer is complete before moving on to the next question.
        - Do not include any prefixed labels like "Author:", "Interviewee:", Respond:", or "Answer:" in your answer.
        """

        # Not an f-string on purpose: {context} must stay a template placeholder.
        prompt += """Given the following context, please answer the question.
        
        {context}"""

        return prompt

### 5. Testing

Standalone test

In [None]:
# Chat model for the standalone retrieval test (same Azure deployment as above).
llm = AzureChatOpenAI(
            openai_api_base="https://abb-chcrc.openai.azure.com/",
            openai_api_version="2023-03-15-preview",
            openai_api_key=os.environ["OPENAI_API_KEY"],
            openai_api_type="azure",
            deployment_name="gpt-35-turbo-0301",
            temperature=0.7)
# NOTE(review): `memory` is created here but is not passed to the chain built in the
# next cell, which is instead called with an explicit `chat_history`.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True) 
topic = 'physics-informed machine learning'

prompt = f"""You are an expert in {topic} and 
your primary role is to provide detailed answers to questions asked by a 'journalist bot'. 
You should help the journalist bot to understand the key points of your paper.
Try to explain complex concepts and technical terms in an understandable way, without sacrificing accuracy.
Your responses should primarily come from the relevant content of this paper, 
which will be provided to you in the following, but you can also use your broad knowledge in {topic} to 
provide context or clarify complex topics. Remember to differentiate when you are providing information 
directly from the paper versus when you're giving additional context or interpretation. 
Use phrases like 'According to the paper...' for direct information, 
and 'Based on general knowledge in the field...' when you're providing additional context."""

prompt += """Given the following context, please answer the question.

{context}"""

qa_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(prompt),
    HumanMessagePromptTemplate.from_template("{question}")
])

In [None]:
# Retrieval chain for the standalone test: the retriever is asked for k=3 results,
# source documents are returned with the answer, and `qa_prompt` is handed to the
# document-combining step.
conversation_qa = ConversationalRetrievalChain.from_llm(llm=llm, verbose=False,
                                                        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
                                                        chain_type="stuff", return_source_documents=True,
                                                        combine_docs_chain_kwargs={'prompt': qa_prompt})

In [None]:
# Sample question for the standalone test. Without this assignment `question` is only
# defined later by the integration test, so the cell fails on a fresh kernel.
question = "What is the main contribution of this paper?"
chat_history = []
result = conversation_qa({"question": question, "chat_history": chat_history})

In [None]:
docs = vectorstore.similarity_search(question)

In [None]:
docs[0]

In [None]:
print(result['source_documents'][0].__dict__['page_content'][:200])

In [None]:
result['source_documents'][0].__dict__['metadata']

In [None]:
# The chain has no memory attached, so `chat_history` must be supplied on every call.
response = conversation_qa({"question": question, "chat_history": chat_history})

In [None]:
conversation_qa

Integration test

In [None]:
# Instantiate journalist and author bot
journalist = JournalistBot('Azure')
author = AuthorBot('Azure', vectorstore)

In [None]:
# Provide instruction
journalist.instruct(topic='physics-informed machine learning', abstract=summary)
author.instruct('physics-informed machine learning')

In [None]:
# Conversation
question_hist = []
answer_list = []

In [None]:
question = journalist.step('Start the conversation')
print(question)

question_hist.append(question)

In [None]:
answer, source = author.step(question)
print(answer)

answer_list.append(answer)

In [None]:
question = journalist.step(answer)
print(question)

question_hist.append(question)

In [None]:
answer, source = author.step(question)
print(answer)

answer_list.append(answer)

In [None]:
import fitz

def highlight_text(file_path, phrases, output_path):
    """Highlight every occurrence of the given phrases in a PDF and save a copy.

    Args:
        file_path: path of the PDF, opened with `fitz.open`.
        phrases: iterable of strings to search for on every page; empty strings
            are ignored.
        output_path: path the annotated document is saved to.
    """
    # Materialise once: the phrases are re-used for every page, which would exhaust
    # a one-shot iterable after the first page. Empty strings are dropped.
    needles = [phrase for phrase in phrases if phrase]

    doc = fitz.open(file_path)
    try:
        for page in doc:
            for phrase in needles:
                for inst in page.search_for(phrase):
                    page.add_highlight_annot(inst)

        doc.save(output_path, garbage=4)
    finally:
        # Release the file handle even when searching or saving fails.
        doc.close()

In [None]:
# NOTE(review): `paper` repeats the title already held in `paper_name` (loading cell);
# the two must be kept in sync by hand.
paper = 'Learning the solution operator of parametric partial differential equations with physics-informed DeepOnets'
paper_path = "./Papers/"+paper+".pdf"
# `source` holds the source documents returned by the last `author.step(...)` call.
phrases = [src.page_content for src in source]
# metadata['page'] + 1 as strings — presumably converting a 0-based page index to a
# 1-based page number; confirm against the loader.
page_numbers = [str(src.metadata['page']+1) for src in source]

# NOTE(review): each phrase is an entire retrieved chunk; whether `search_for` can match
# text of that length is not established here — verify the output PDF.
highlight_text(paper_path, phrases, "highlighted.pdf")

In [None]:
doc[0].search_for(phrases[0][:100])

In [None]:
phrases[0]

In [None]:
doc[0].get_text("text")

In [None]:
page_numbers

In [None]:
doc[0].get_text("text")

In [None]:
# Debug variant of `highlight_text`: same loops, but prints the search hits per page and
# phrase and does not save the result.
# NOTE(review): this cell is what defines the notebook-level `doc` that the `doc[0]...`
# inspection cells above it rely on, so those cells fail on a top-to-bottom run.
import fitz
doc = fitz.open(paper_path)

for page in doc:
    for phrase in phrases:            
        text_instances = page.search_for(phrase)
        print(text_instances)

        for inst in text_instances:
            highlight = page.add_highlight_annot(inst)

# doc.save(output_path, garbage=4)

In [None]:
phrases[1]

Formulation

In [None]:
source[0].page_content[:200]

In [None]:
source[0].metadata['page']+1

In [None]:
doc.page_content[:100]

In [None]:
# Build a short "where to look" note from the first two source documents: the page
# (metadata['page'] + 1, presumably 1-based) and the first 100 characters of the passage.
string = f"""For details, check: \n\n"""
# NOTE(review): the loop variable rebinds the notebook-level name `doc`, which another
# cell assigns from `fitz.open(...)`.
for doc in source[:2]:
    string += f"page {doc.metadata['page']+1}, start with '{doc.page_content[:100]}'\n \n"
print(string)

User interface

In [None]:
# Instantiate journalist and author bot
journalist = JournalistBot('Azure')
author = AuthorBot('Azure', vectorstore)

# Provide instruction
# NOTE(review): the topic below refers to the FKBP51 paper, while the `paper_name` that
# is active in the loading cell is the physics-informed DeepONets paper. Make sure the
# topic, `summary` and `vectorstore` all belong to the same paper before running the UI.
journalist.instruct(topic='neuroscience stress-related disorder FKBP51', abstract=summary)
author.instruct('neuroscience stress-related disorder FKBP51')

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Create a text area for the chat log
chat_log = widgets.HTML(
    value='',
    placeholder='',
    description='',
)

# Create a text input field for user input
user_input = widgets.Text(
    value='',
    placeholder='Question',
    description='',
    disabled=False,
    layout=widgets.Layout(width="60%")
)

# Create buttons for continue and send
bot_button = widgets.Button(description="Bot ask")
user_button = widgets.Button(description="User ask")

# Define the button click callbacks
def on_bot_button_clicked(b):
    """Handle "Bot ask": the journalist asks a question and the author answers it.

    Args:
        b: the clicked button, passed by ipywidgets (unused).
    """
    if chat_log.value == '':
        # The conversation is just starting
        bot_question = journalist.step("Start the conversation")
    elif author.chat_history:
        # The conversation is ongoing: hand the journalist the author's last answer as
        # plain text (the chat log would also contain the HTML "Author Bot:" label).
        bot_question = journalist.step(author.chat_history[-1][1])
    else:
        # Log is non-empty but the author has no recorded answer yet; fall back to the
        # last entry of the chat log.
        bot_question = journalist.step(chat_log.value.split("<br><br>")[-1])

    chat_log.value += "<br><b style='color:blue'>Journalist Bot:</b> " + bot_question

    # Author bot responds to the question. `step` returns (answer, source_documents);
    # only the answer text is shown in the log.
    bot_response, _ = author.step(bot_question)
    chat_log.value += "<br><br><b style='color:green'>Author Bot:</b> " + bot_response + "<br>"

    
def on_user_button_clicked(b):
    """Handle "User ask": send the typed question to the author bot and log the answer.

    Args:
        b: the clicked button, passed by ipywidgets (unused).
    """
    user_question = user_input.value

    # Nothing typed: do not spend a model call on an empty question
    if not user_question.strip():
        return

    # User asks a question. `step` returns (answer, source_documents); only the answer
    # text is shown in the log.
    bot_response, _ = author.step(user_question)
    chat_log.value += "<br><br><b style='color:purple'>You:</b> " + user_question
    chat_log.value += "<br><br><b style='color:green'>Author Bot:</b> " + bot_response + "<br>"

    # Inform journalist bot about the asked questions
    journalist.memory.chat_memory.add_user_message(user_question)

    # Clear user input
    user_input.value = ""

# Attach the callbacks
bot_button.on_click(on_bot_button_clicked)
user_button.on_click(on_user_button_clicked)

# Use HBox and VBox for arranging the widgets
first_row = widgets.HBox([bot_button])
second_row = widgets.HBox([user_button, user_input])

# Display the UI
display(chat_log, widgets.VBox([first_row, second_row]))