In [None]:
!pip install langchain pypdf2 faiss-cpu sentence-transformers
!pip install openai langchain faiss-cpu
!pip install fitz
!pip install pymupdf
!pip install openai==0.28
!pip install --upgrade langchain langchain-openai openai
!pip install -U langchain-community
!pip install pypdf
!pip install tiktoken
!pip install pdfplumber


Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2, faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1 pypdf2-3.0.1
Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9.tar.gz (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m3.2 MB/s[0m eta [

Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
[31mERROR: Operation cancelled by user[0m[31m
[0mCollecting pymupdf
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Colab-only helper module; this import fails outside Google Colab.
from google.colab import drive
# Mount Google Drive at /content/drive so the PDF folders referenced later
# (under /content/drive/MyDrive/...) are reachable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PDFPlumberLoader,PDFMinerLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
import os
import numpy as np
from langchain.schema import HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter

class PersonaChatbot:
    """Retrieval-augmented chatbot with one general index and per-persona indexes.

    A "general" FAISS index is built from a set of PDFs, and any number of
    named "persona" FAISS indexes are built from one PDF each. A query is
    answered from both the general index and the chosen persona index; the
    answer whose embedding is closer to the query embedding is then passed to
    the chat model as context for the final reply.
    """

    # Chunking parameters shared by every index so general and persona
    # documents are split the same way.
    CHUNK_SIZE = 300     # maximum number of characters per chunk
    CHUNK_OVERLAP = 50   # number of characters shared by consecutive chunks

    def __init__(self, openai_api_key):
        """Create the embedding and chat clients; no index exists yet.

        Args:
            openai_api_key: API key forwarded to both OpenAI clients.
        """
        self.general_vectordb = None
        self.persona_vectordbs = {}
        self.embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
        self.chat_model = ChatOpenAI(model="gpt-4", temperature=0.7, openai_api_key=openai_api_key)

    def _load_chunks(self, pdf_path):
        """Load one PDF with PDFMinerLoader and split it into chunks."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.CHUNK_SIZE,
            chunk_overlap=self.CHUNK_OVERLAP,
        )
        return splitter.split_documents(PDFMinerLoader(pdf_path).load())

    def _retrieval_answer(self, vectordb, query):
        """Run a RetrievalQA chain over ``vectordb`` and return its answer text."""
        chain = RetrievalQA.from_chain_type(
            llm=self.chat_model,
            retriever=vectordb.as_retriever(),
            return_source_documents=True,
        )
        return chain.invoke({"query": query})["result"]

    def build_general_vectordb(self, pdf_paths):
        """Create the general-knowledge index from the provided PDF files.

        Args:
            pdf_paths: iterable of paths to PDF files.

        Raises:
            ValueError: if no text chunk could be extracted from any file
                (an empty index cannot be built).
        """
        documents = []
        for pdf_path in pdf_paths:
            # The splitter configured for this class is applied here; it was
            # previously constructed but never used.
            documents.extend(self._load_chunks(pdf_path))
        if not documents:
            raise ValueError("No text could be extracted from the general knowledge PDFs.")
        self.general_vectordb = FAISS.from_documents(documents, self.embedding_model)

    def add_persona(self, persona_name, pdf_path):
        """Add (or replace) a persona backed by the given PDF.

        Args:
            persona_name: key under which the persona index is stored.
            pdf_path: path to the persona's PDF file.

        Raises:
            ValueError: if no text chunk could be extracted from the PDF.
        """
        documents = self._load_chunks(pdf_path)
        if not documents:
            raise ValueError(f"No text could be extracted from '{pdf_path}'.")
        self.persona_vectordbs[persona_name] = FAISS.from_documents(documents, self.embedding_model)

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity between two vectors.

        Returns 0.0 when either vector has zero norm, instead of dividing by
        zero and producing NaN.
        """
        a = np.asarray(vec1, dtype=float)
        b = np.asarray(vec2, dtype=float)
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0.0:
            return 0.0
        return float(np.dot(a, b) / denominator)

    def query_persona(self, persona_name, query):
        """Answer ``query`` using the persona index and the general index.

        Both indexes are queried; the answer more similar to the query (by
        cosine similarity of embeddings, ties going to the general answer) is
        handed to the chat model as context for the final response.

        Args:
            persona_name: name previously registered with ``add_persona``.
            query: the user's question.

        Returns:
            The text content of the chat model's final response.

        Raises:
            ValueError: if the persona is unknown, or if the general index
                has not been built yet.
        """
        if persona_name not in self.persona_vectordbs:
            raise ValueError(f"Persona '{persona_name}' does not exist.")
        if self.general_vectordb is None:
            # Fail with a clear message rather than an AttributeError on None.
            raise ValueError(
                "General knowledge VectorDB has not been built. "
                "Call build_general_vectordb() first."
            )

        persona_vectordb = self.persona_vectordbs[persona_name]

        # Embed the query
        query_embedding = self.embedding_model.embed_query(query)

        # Retrieve an answer from each knowledge source
        general_result = self._retrieval_answer(self.general_vectordb, query)
        persona_result = self._retrieval_answer(persona_vectordb, query)

        # Embed the results for similarity comparison
        general_embedding = self.embedding_model.embed_query(general_result)
        persona_embedding = self.embedding_model.embed_query(persona_result)

        # Calculate similarity scores
        general_similarity = self.cosine_similarity(query_embedding, general_embedding)
        persona_similarity = self.cosine_similarity(query_embedding, persona_embedding)
        print('general_similarity:', general_similarity)
        print('persona_similarity:', persona_similarity)

        # Select the more relevant result
        if general_similarity >= persona_similarity:
            selected_result = general_result
            source = "General AI Knowledge"
        else:
            selected_result = persona_result
            source = f"Persona: {persona_name}"
        print(source)

        # Generate the final prompt
        combined_prompt = (
            f"You have access to the following source:\n\n"
            f"{source}:\n{selected_result}\n\n"
            f"Using this information, answer the following query:\n{query}"
        )
        # Use .invoke(); calling the chat model object directly is deprecated.
        response = self.chat_model.invoke([HumanMessage(content=combined_prompt)])
        return response.content

    def run_chatbot(self):
        """Run the interactive menu loop until the user exits.

        Ctrl-C / interrupted input (KeyboardInterrupt) and end-of-input
        (EOFError) end the loop cleanly instead of surfacing a traceback.
        """
        print("Welcome to the Persona Chatbot Service!")
        try:
            while True:
                print("\n1. Add Persona")
                print("2. Query Persona")
                print("3. Exit")
                choice = input("Select an option: ").strip()

                if choice == "1":
                    persona_name = input("Enter Persona name: ").strip()
                    pdf_path = input("Enter PDF path for the Persona (uploaded in Colab): ").strip()
                    # isfile (not exists) so that a directory is rejected too.
                    if os.path.isfile(pdf_path):
                        try:
                            self.add_persona(persona_name, pdf_path)
                            print(f"Persona '{persona_name}' added successfully!")
                        except ValueError as e:
                            print(e)
                    else:
                        print("Invalid PDF path. Please try again.")

                elif choice == "2":
                    persona_name = input("Enter Persona name: ").strip()
                    query = input("Enter your question: ")
                    try:
                        answer = self.query_persona(persona_name, query)
                        print(f"\nAnswer from '{persona_name}': {answer}")
                    except ValueError as e:
                        print(e)

                elif choice == "3":
                    print("Exiting chatbot. Goodbye!")
                    break
                else:
                    print("Invalid choice. Please try again.")
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chatbot. Goodbye!")


# Main function for Colab
if __name__ == "__main__":
    # Step 1: Obtain the OpenAI API key at runtime.
    # SECURITY: never hardcode the key in the notebook. Any key that was ever
    # saved or shared inside a notebook must be treated as leaked and revoked
    # in the OpenAI dashboard.
    from getpass import getpass
    openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
    if not openai_api_key:
        raise SystemExit("An OpenAI API key is required.")

    # Initialize the chatbot
    chatbot = PersonaChatbot(openai_api_key=openai_api_key)

    # Step 2: Build general knowledge VectorDB from every PDF in the folder
    pdf_directory = '/content/drive/MyDrive/ai papers/general'
    print(f"Loading general knowledge PDFs from: {pdf_directory}")

    # A missing folder (e.g. Drive not mounted) is treated as "no PDFs"
    # instead of crashing in os.listdir. Sorting keeps the build order stable.
    if os.path.isdir(pdf_directory):
        valid_paths = sorted(
            os.path.join(pdf_directory, filename)
            for filename in os.listdir(pdf_directory)
            if filename.lower().endswith('.pdf')
            and os.path.isfile(os.path.join(pdf_directory, filename))
        )
    else:
        valid_paths = []

    if valid_paths:
        chatbot.build_general_vectordb(valid_paths)
        print("General knowledge VectorDB built successfully!")

        # Step 3: Run chatbot interaction loop (only once the index exists)
        chatbot.run_chatbot()
    else:
        # No exit() call: in a notebook it does not reliably stop the cell,
        # so the loop is simply not started.
        print("No valid paths provided. Exiting.")


Provide paths to general knowledge PDFs (uploaded in Colab, comma-separated):
General knowledge VectorDB and Knowledge Graph built successfully!
Welcome to the Persona Chatbot Service!

1. Add Persona
2. Query Persona
3. Exit
Select an option: 1
Enter Persona name: sy
Enter PDF path for the Persona (uploaded in Colab): /content/drive/MyDrive/ai papers/persona/2006.10726v3 (1).pdf
Persona 'sy' added successfully!

1. Add Persona
2. Query Persona
3. Exit
Select an option: 2
Enter Persona name: sy
Enter your question: what is diffusion?
general_similarity: 0.8255927750435581
persona_similarity: 0.8881068911583554
Persona: sy

Answer from 'sy': Diffusion is a process where particles spread out from an area of high concentration to an area of low concentration. This movement occurs naturally over time and continues until the concentration of particles is equal in all areas. It plays a significant role in many physical and biological phenomena, such as the spreading of a scent in a room or t

KeyboardInterrupt: Interrupted by user