In [2]:
#####GraphRAG
import os
import networkx as nx
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chardet
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from dotenv import load_dotenv
import pickle
import gzip
import numpy as np

os.chdir("/Users/kdk/Desktop/2024/project/storyteller/api")
load_dotenv()

from text_util import *


In [None]:
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']

def load_file(file_path):
    try:
        encoding = detect_encoding(file_path)
        with open(file_path, 'r', encoding=encoding) as f:
            text = f.read()
        return text
    except Exception as e:
        print(f"Failed to load {file_path}: {e}")
        return None

class GraphRAGChatbot:
    def __init__(self, openai_api_key):
        self.embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
        self.chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0.7, openai_api_key=openai_api_key)
        self.topics = ['economics', 'science', 'law', 'social', 'environment', 'education', 'politics', 'culture']
        self.persona_graphs = {}
        self.build_graphs()

    def build_graphs(self):
        """Build graphs for each topic by reading txt files and creating connections."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=50,
        )
        base_dir = '/content/drive/MyDrive/ai papers/new2'

        for topic in self.topics:
            folder_path = os.path.join(base_dir, topic)
            documents = []
            if os.path.exists(folder_path):
                for filename in os.listdir(folder_path):
                    if filename.endswith('.txt'):
                        file_path = os.path.join(folder_path, filename)
                        text = load_file(file_path)
                        if text:
                            doc = Document(page_content=text, metadata={"source": file_path})
                            docs = text_splitter.split_documents([doc])
                            documents.extend(docs)

                graph = nx.Graph()
                for i, doc in enumerate(documents):
                    graph.add_node(i, content=doc.page_content, metadata=doc.metadata)

                embeddings =[self.embedding_model.embed_query(doc.page_content) for doc in documents]
                norms = [np.linalg.norm(embedding) for embedding in embeddings]
                print("construct graph", topic)
                for i in range(len(documents)):
                    for j in range(i + 1, len(documents)):
                        similarity = np.dot(embeddings[i], embeddings[j]) / (norms[i] * norms[j])
                        if similarity > 0.5:  # Threshold for similarity
                            graph.add_edge(i, j, weight=similarity)

                self.persona_graphs[topic] = graph
            else:
                print(f"Folder for topic '{topic}' does not exist.")

    def compute_similarity(self, text1, text2):
        """Compute semantic similarity using embeddings."""
        embedding1 = self.embedding_model.embed_query(text1)
        embedding2 = self.embedding_model.embed_query(text2)
        return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

    def recommend_topics(self, query):
        """Use OpenAI API to recommend 5 most related topics based on the query."""
        prompt = (
            f"Given the query: '{query}', select the 5 most relevant topics from the following list:\n"
            f"{', '.join(self.topics)}\n"
            f"Provide only the list of topics separated by commas."
        )
        response = self.chat_model([HumanMessage(content=prompt)])
        recommended_topics = [topic.strip() for topic in response.content.split(',')]
        recommended_topics = [topic for topic in recommended_topics if topic in self.topics]
        return recommended_topics[:5]

    def answer_question(self, query, personas):
        """Answer questions using GraphRAG."""
        answers = {}
        for persona in personas:
            if persona in self.persona_graphs:
                graph = self.persona_graphs[persona]
                # Find relevant nodes based on the query
                relevant_nodes = self.retrieve_relevant_nodes(graph, query)

                # Aggregate content from relevant nodes
                context = "\n\n".join([graph.nodes[node]['content'] for node in relevant_nodes])

                # Generate answer
                prompt_template = PromptTemplate(
                    input_variables=["persona", "context", "question"],
                    template=(
                        "Please think from a {persona} perspective.\n\n"
                        "{context}\n\n"
                        "Question: {question}\n"
                        "Answer:"
                    )
                )
                chain = LLMChain(
                    llm=self.chat_model,
                    prompt=prompt_template
                )
                output = chain.run(
                    persona=persona,
                    context=context,
                    question=query
                )
                answers[persona] = output
            else:
                answers[persona] = f"No data available for persona '{persona}'."
        return answers

    def retrieve_relevant_nodes(self, graph, query):
        """Retrieve nodes from the graph that are most relevant to the query."""
        query_embedding = self.embedding_model.embed_query(query)
        similarities = {}
        for node, data in graph.nodes(data=True):
            node_embedding = self.embedding_model.embed_query(data['content'])
            similarity = np.dot(query_embedding, node_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(node_embedding))
            similarities[node] = similarity

        # Sort nodes by similarity and return the top N
        sorted_nodes = sorted(similarities, key=similarities.get, reverse=True)
        return sorted_nodes[:5]

    def debate_question(self, question, personas):
        """Simulate a debate between personas."""
        from langchain.prompts import PromptTemplate
        from langchain.chains import LLMChain

        # Retrieve context for each persona
        persona_contexts = {}
        for persona in personas:
            if persona in self.persona_graphs:
                #retriever = self.persona_graphs[persona].as_retriever()
                graph = self.persona_graphs[persona]
                #docs = retriever.get_relevant_documents(question)
                relevant_nodes = self.retrieve_relevant_nodes(graph, question)
                context = "\n\n".join([graph.nodes[node]['content'] for node in relevant_nodes])
                #context = "\n\n".join([doc.page_content for doc in docs])
                persona_contexts[persona] = context

            else:
                persona_contexts[persona] = ""

        # Initialize dialogue history
        dialogue_history = []
        max_turns = 10  # Each persona speaks up to 10 times
        total_exchanges = max_turns * len(personas)

        # Generate initial answers
        for persona in personas:
            prompt_template = PromptTemplate(
                input_variables=["persona", "context", "question"],
                template=(
                    "As a {persona}, based on the following   :\n\n"
                    "{context}\n\n"
                    "Answer the question: {question}\n"
                    "Your answer:"
                )
            )
            chain = LLMChain(
                llm=self.chat_model,
                prompt=prompt_template
            )
            response = chain.run(
                persona=persona,
                context=persona_contexts[persona],
                question=question
            )
            dialogue_history.append((persona, response))

        # Simulate debate
        for _ in range(max_turns):
            for persona in personas:
                # Compile dialogue history
                history_text = ""
                for speaker, utterance in dialogue_history[-6:]:  # Limit to last 6 exchanges
                    history_text += f"{speaker.capitalize()}: {utterance}\n"

                # Prepare prompt
                prompt_template = PromptTemplate(
                    input_variables=["persona", "context", "dialogue_history", "question"],
                    template=(
                        "As a {persona}, continue the following debate based on the context and previous dialogue.\n\n"
                        "Context:\n{context}\n\n"
                        "Dialogue history:\n{dialogue_history}\n"
                        "Question: {question}\n. Answer within 3 sentences."
                        "{persona}, your response:"
                    )
                )
                chain = LLMChain(
                    llm=self.chat_model,
                    prompt=prompt_template
                )
                response = chain.run(
                    persona=persona,
                    context=persona_contexts[persona],
                    dialogue_history=history_text,
                    question=question
                )
                dialogue_history.append((persona, response))

                # Check if total exchanges reached 30
                if len(dialogue_history) >= total_exchanges:
                    break
            if len(dialogue_history) >= total_exchanges:
                break

        # Display the dialogue
        print("\nDebate Transcript:\n")
        for speaker, utterance in dialogue_history:
            print(f"{speaker.capitalize()}: {utterance}\n")
    def recommend_personas(self, query, selected_topics):
        """Use LLM to recommend 3 most appropriate personas among selected topics."""
        prompt = (
            f"Given the query: '{query}' and the topics: {', '.join(selected_topics)}, "
            f"select the 3 most appropriate topics for answering the question. "
            f"Provide only the list of topics separated by commas."
        )
        response = self.chat_model([HumanMessage(content=prompt)])
        recommended_personas = [topic.strip().lower() for topic in response.content.split(',')]
        # Ensure valid topics from selected_topics
        recommended_personas = [topic for topic in recommended_personas if topic in selected_topics]
        return recommended_personas[:3]

    def run_chatbot(self):
        """Run the chatbot interaction loop."""
        print("Welcome to the GraphRAG Chatbot Service!")
        while True:
            print("\n1. Start a new discussion")
            print("\n2. Exit")
            print("\n3. Debate")

            choice = input("Select an option: ")

            if choice == "1":
                query = input("Enter the topic you want to discuss: ")
                recommended_topics = self.recommend_topics(query)
                if not recommended_topics:
                    print("No relevant topics found. Please try again.")
                    continue
                print(f"Recommended topics: {', '.join(recommended_topics)}")

                question = input("Enter your question: ")
                recommended_personas = self.recommend_personas(question, recommended_topics)
                if not recommended_personas:
                    print("No personas could be recommended based on your question.")
                    continue
                print(f"Personas selected to answer: {', '.join(recommended_personas)}")

                answers = self.answer_question(question, recommended_personas)
                for persona, answer in answers.items():
                    print(f"\nAnswer from '{persona}': {answer}")

            elif choice == "2":
                print("Exiting chatbot. Goodbye!")
                break

            elif choice == "3":
                question = input("Enter the question for the debate: ")
                recommended_topics = self.recommend_topics(question)
                if not recommended_topics:
                    print("No relevant topics found. Please try again.")
                    continue
                print(f"Recommended topics: {', '.join(recommended_topics)}")

                recommended_personas = self.recommend_personas(question, recommended_topics)
                if not recommended_personas:
                    print("No personas could be recommended based on your question.")
                    continue
                print(f"Personas selected for the debate: {', '.join(recommended_personas)}")

                self.debate_question(question, recommended_personas)
            else:
                print("Invalid choice. Please try again.")

In [9]:
import openai

f = gzip.open('./PersonaChatbot_graphRAG_debate_final.pickle','rb')
chatbot = pickle.load(f)

ModuleNotFoundError: No module named 'openai.api_resources'

In [None]:

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os
from dotenv import load_dotenv
from text_util import *
import numpy as np
import networkx as nx
from dotenv import load_dotenv
from tqdm import tqdm

batch_size = 32
def build_graphs(topics, embedding_model):
    persona_graphs = {}
    base_dir = "./new2"

    for topic in topics:
        folder_path = os.path.join(base_dir, topic)
        if not os.path.exists(folder_path):
            pass
        print("doc", topic)
        documents = construct_document(folder_path)

        graph = nx.Graph()
        for i, doc in enumerate(documents):
            graph.add_node(i, content=doc.page_content, metadata=doc.metadata)
        print("embedding", topic)
        texts = [doc.page_content for doc in documents]
        embeddings = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
            batch_texts = texts[i:i + batch_size]
            batch_embeddings = embedding_model.embed_documents(batch_texts)
            embeddings.extend(batch_embeddings)
        norms = [np.linalg.norm(embedding) for embedding in embeddings]

        print("construct graph", topic)
        for i in range(len(documents)):
            for j in range(i + 1, len(documents)):
                similarity = np.dot(embeddings[i], embeddings[j]) / (
                    norms[i] * norms[j]
                )
                if similarity > 0.5:  # Threshold for similarity
                    graph.add_edge(i, j, weight=similarity)

        persona_graphs[topic] = graph
    return persona_graphs

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
topics = ['economics', 'science', 'law', 'social', 'environment', 'education', 'politics', 'culture']

graphs = build_graphs(topics, embedding_model)
for topic in topics:
    nx.write_gpickle(graphs[topic], f'./graphrag/{topic}.gpickle')

doc economics
embedding economics


Embedding batches: 100%|██████████| 20/20 [00:25<00:00,  1.28s/it]


construct graph economics
doc science
embedding science


Embedding batches: 100%|██████████| 57/57 [01:18<00:00,  1.38s/it]


construct graph science
doc law
embedding law


Embedding batches: 100%|██████████| 19/19 [00:22<00:00,  1.16s/it]


construct graph law
doc social
embedding social


Embedding batches: 100%|██████████| 26/26 [00:41<00:00,  1.61s/it]


construct graph social
doc environment
embedding environment


Embedding batches: 100%|██████████| 58/58 [01:12<00:00,  1.25s/it]


construct graph environment
doc education
embedding education


Embedding batches: 100%|██████████| 8/8 [00:09<00:00,  1.15s/it]


construct graph education
doc politics
embedding politics


Embedding batches: 100%|██████████| 15/15 [00:17<00:00,  1.15s/it]


construct graph politics
doc culture
Failed to load ./new2/culture/The Lab’s scientific achievement in.txt: 'charmap' codec can't decode byte 0x9d in position 686: character maps to <undefined>
embedding culture


Embedding batches: 100%|██████████| 19/19 [00:24<00:00,  1.30s/it]


construct graph culture


AttributeError: module 'networkx' has no attribute 'write_gpickle'

In [19]:

with open(f"./graphrag/grahs.pkl", "wb") as f:
    pickle.dump(graphs, f)

In [None]:
import picle

pickle.load(open("./graphrag/graphs.pkl", "rb"))

NameError: name 'pickle' is not defined