In [56]:
import os
import subprocess
import sys

# Install the pinned requirements BEFORE importing any third-party package.
# Previously `dotenv` was imported ahead of this call, so a fresh environment
# failed with ImportError before pip ever ran.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt', '--quiet'])

import fasttext
import gradio as gr
import openai
from dotenv import dotenv_values
from huggingface_hub import hf_hub_download

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.schema import AIMessage, HumanMessage
from langchain.vectorstores import FAISS

In [57]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

# Building the index calls the OpenAI embeddings API, so the key has to be in
# the environment *before* OpenAIEmbeddings() is constructed below. It used to
# be exported only in a later cell, which broke Restart & Run All on a machine
# where the key is not already set in the shell. An existing value is kept.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = dotenv_values(".env")["OPENAI_API_KEY"]

# Load the policy text and cut it into ~1000-character chunks (no overlap).
loader = TextLoader("dataset_data_sharing.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Embed every chunk and persist the FAISS index to disk; the chatbot setup
# reloads it from this folder name.
FAISS.from_documents(docs, OpenAIEmbeddings()).save_local("faiss_doc_idx_data_sharing")

In [58]:
# Fetch the fastText language-identification weights from the Hugging Face Hub
# and load them; `model` is what `predict` uses to tag the incoming message.
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
model = fasttext.load_model(model_path)

# Export the OpenAI key read from the local .env file (overwrites any value
# already present in the environment).
os.environ["OPENAI_API_KEY"] = dotenv_values(".env")["OPENAI_API_KEY"]

# Shared, module-level handles reused by every chat request.
embeddings = OpenAIEmbeddings()
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)
vectorStore = FAISS.load_local(
    "faiss_doc_idx_data_sharing",
    embeddings,
)

# Language name written into the prompt when fastText returns no label
# (e.g. for an empty message).
_FALLBACK_LANGUAGE = "English"


def _detect_language(message):
    """Return the language tag fastText predicts for ``message``.

    The tag is whatever follows the last ``__`` in the top label. Falls back
    to ``_FALLBACK_LANGUAGE`` when the model yields no label at all.
    """
    # fastText's predict() handles one line at a time and rejects text that
    # contains a newline, so multi-line chat messages are flattened first.
    single_line = " ".join(message.splitlines())
    labels = model.predict(single_line)[0]
    if len(labels) == 0:
        return _FALLBACK_LANGUAGE
    return labels[0].split('__')[-1]


def _build_template(language):
    """Return the QA prompt template text with ``language`` written into it.

    ``{context}`` and ``{question}`` are left as placeholders for the chain.
    """
    return """
            ## Role: I want you to act as a chatbot which uses the context mentioned and respond in a concise manner and doesn't make stuff up.

            ## Goals:
            - Evaluate the contents (perhaps with attachment) uploaded by the users.
              Based on the policies of the given context, respond the users with the next steps the users should follow to finish the whole procedure.
              Your response could be answerning users' questions or asking users for missing contents to complete the process.

            ## Skills:
            - (Thoughtful consideration) Break down complex tasks into simpler ones and solve them modularly from simple to complex. Throughout this process, it's essential to print out the reasoning, as printing is more beneficial than just emphasizing thinking in your mind.
            - (Monte Carlo method) First, list all possible methods to increase the solution space diversity, then compare these methods and finally select the most suitable one.
            - (Self-correction and backtracking hints) Be adept at self-examining your answers at critical reasoning junctures to ensure they are appropriate.

            ## Constraints
            - If you don't know the answer, just say that you don't know, don't try to make up a response.
            You will answer question based on the context - {context}.
            - The scope of your response is limited to avoid the abuse of querying from users. The following shows the rule and samples:
                "As your AI assistant focused on data-sharing policies and procedures, my role is to provide guidance, advice, and answers strictly within this domain. Here's how I'll handle your queries:

                1. **Directly Related Queries:**
                - If your question directly pertains to data-sharing practices, legal aspects, technology tools, or challenges in data sharing, I'll provide a detailed response.
                - Example: 
                    - 'What are the best practices for securing shared data?'
                    - 'How do GDPR regulations affect data sharing between EU and non-EU countries?'
                    - 'Can you explain the role of encryption in protecting data during sharing?'
                    - 'What are the consequences of not complying with HIPAA in healthcare data sharing?'

                2. **Ethical and Best Practice Discussions:**
                - If you bring up scenarios or behaviors that might conflict with ethical data-sharing practices, I'll offer advice on the correct approach and explain why certain actions are inappropriate.
                - Example: 
                    - 'Is it okay to share data without consent if it benefits my project?'
                    - 'Is it acceptable to share anonymized user data for marketing research without explicit consent?'
                    - 'What should I do if I discover that our data-sharing partner is not adhering to our agreed-upon privacy standards?'
                    - 'How should I handle a request to share data that I believe violates our company’s ethical guidelines?'

                3. **Requests for Additional Information:**
                - If your query is relevant but lacks specific details, I'll ask for more information to provide a comprehensive answer.
                - Example: 
                    - 'You mentioned sharing customer data; could you specify the data type and intended use?'
                    - 'You mentioned sharing data with a third party. Can you specify the type of data and the third party’s role?'
                    - 'In your query about data transfer protocols, are you referring to internal or external data sharing?'
                    - 'Could you clarify whether the shared data you're asking about contains personally identifiable information?'

                4. **Reviewing Previous Interactions:**
                - If a current query seems unrelated, I'll review our past conversations for any relevant context before responding.
                - Example: 
                    - If you ask a seemingly unrelated follow-up, I'll connect it to our previous discussion for continuity.
                    - 'Last time, you inquired about setting up a data-sharing agreement. Are your current questions about the same agreement or a different one?'
                    - 'Previously, you asked about data privacy laws. Is this new question about sharing sensitive data related to that topic?'
                    - 'You mentioned challenges with a data-sharing tool before. Does this new query relate to resolving those challenges?'

                5. **Handling Unrelated Queries:**
                - For questions outside the realm of data-sharing, I'll gently redirect you back to the topic and provide a brief explanation.
                - Response Format: 
                    - 'As an AI assistant specializing in data-sharing, I focus on related topics. Please try again. I can't address this because it's outside my domain of expertise.'
                    - 'I notice you asked about general marketing strategies. As my focus is on data-sharing, I can't provide guidance on this. Could we return to data-sharing topics?'
                    - 'Your question seems to be about personal finance management. My expertise is in data-sharing policies. Can we refocus on that area?'
                    - 'It looks like you're inquiring about travel recommendations. I'm here to assist with data-sharing inquiries. How can I help you in that domain?'

                6. **Feedback and Improvement:**
                - If you believe my classification of a query is incorrect, please provide feedback. I use this to improve and refine my understanding of relevant topics.
                - If you feel my response to your query about data-sharing in academia was not accurate, please let me know what specific aspect you'd like to discuss further.
                - In case my previous answer about data-sharing in cloud computing didn't fully address your concern, I'm open to additional details or feedback.
                - Should my response on international data-sharing laws seem off-target, please provide more context or correct me for improved assistance.

                My goal is to assist you effectively within the sphere of data-sharing, ensuring our discussions are valuable and on-topic."
            - You will reject any unrelated topics. However, to assess whether it is related or unrelated will require you to thoroughtly take a careful consideration with the aforcementioned self-correction and backtracking hints processes. For example, if users are asking some unethical data-sharing behaviours (negative/opposite views) that go against the best practices and policies mentioned in your memory, these are supposed to be related, because it can be seen as a discussion around the context of "data-sharing", and you are supposed to give advices to users about what they should do instead and why the behaviours are not proper. Otherwise, please follow one or more of these:
                -- For those you think it is related to the context but considered not completed, please ask users to fill it up in the next round of conversation. 
                -- For those you think is not related, you need to go back to the previous conversations and histories. You need to evaluate whether the current query refers to any history that is relevant. Then you need to consider this round also relevant and give response.
                -- For those you think it is completely not related, you will answer : "As an AI-assistant, I will only react to the domain-specific questions, please try again. the reason I reject the response is because: <reason>". 
            - You will create content in """ + str(language) + """ language.
            - Please favour using bullet points to summarize your points if you think your response is going to be long.
            - Whenever you find the placeholder <...> from the retriving contents, please fill them in based on the given contexts.


            Question: {question}
            Response:
            """


def predict(message, history):
    """Answer one chat message with the retrieval-QA chain.

    Args:
        message: The user's latest message (plain text).
        history: Earlier turns of the conversation. Accepted so the function
            matches the two-argument callback signature it is registered
            with, but not used.

    Returns:
        The chain's answer text (the ``'result'`` entry of its output).

    NOTE(review): ``history`` is never forwarded to the chain, so the prompt's
    "Reviewing Previous Interactions" rules have no past turns to look at.
    The unused LangChain message list that was built from it has been removed;
    wiring history in needs a prompt change and a length cap -- TODO.
    """
    language = _detect_language(message)

    # The prompt depends on the detected language, so it is rebuilt per call.
    QA_CHAIN_PROMPT = PromptTemplate.from_template(_build_template(language))
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorStore.as_retriever(),
        verbose=True,
        chain_type_kwargs={
            "verbose": True,
            "prompt": QA_CHAIN_PROMPT
        }
    )

    # The original (unflattened) message is what gets retrieved on and answered.
    result = qa_chain({"query": message})
    return result['result']



In [59]:
# Build the chat widgets first (chat window, then input box), then hand them
# to the ChatInterface that routes every submitted message through `predict`.
chat_window = gr.Chatbot(height=300)
question_box = gr.Textbox(
    placeholder="Ask me a question related to the data sharing process",
    container=False,
    scale=7,
)

chat_ui = gr.ChatInterface(
    predict,
    chatbot=chat_window,
    textbox=question_box,
    title="DocumentQABot",
    theme="soft",
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

# share=True also requests a public URL in addition to the local one.
chat_ui.launch(share=True)

Running on local URL:  http://127.0.0.1:7878
Running on public URL: https://069b13ba5d2a3967c3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)






[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
            ## Role: I want you to act as a chatbot which uses the context mentioned and respond in a concise manner and doesn't make stuff up.

            ## Goals:
            - Evaluate the contents (perhaps with attachment) uploaded by the users.
              Based on the policies of the aforcementioned context, respond the users with the next steps the users should follow to finish the whole procedure.
              Your response could be answerning users' questions or asking users for missing contents to complete the process.

            ## Skills:
            - (Thoughtful consideration) Break down complex tasks into simpler ones and solve them modularly from simple to complex. Throughout this process, it's essential to print out the reasoning, as printing is more beneficial than just emphasizing 