In [1]:
# DISPLAY METADATA ALONG WITH SOURCE DOCUMENT
import panel as pn
import param
import fitz  # PyMuPDF for PDF handling
from langchain_community.document_loaders import Docx2txtLoader
from langchain.document_loaders import PyPDFLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from llama_index.core import Document
from llama_index.llms.openai import OpenAI
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
    SummaryExtractor,
    KeywordExtractor,
)
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
import nest_asyncio

import os

# Patch asyncio so an event loop can be re-entered.
# NOTE(review): presumably needed because the notebook already runs a loop
# while the ingestion pipeline below uses asyncio internally -- confirm.
nest_asyncio.apply()

# Set up OpenAI API key
# Prefer the OPENAI_API_KEY environment variable so the real secret does not
# have to live in the source; the literal is only a placeholder fallback
# (also used when the variable is set but empty).
api_key = os.environ.get('OPENAI_API_KEY') or 'My_OpenAI_key'


# Helper function to normalize metadata values
def normalize_metadata(metadata):
    """Return a copy of *metadata* whose values are all flat scalars.

    Lists are collapsed into one comma-separated string, values that are
    already ``str``/``int``/``float``/``bool`` are passed through untouched,
    and every other value is replaced by its ``str()`` representation.

    Args:
        metadata: Mapping of metadata keys to arbitrary values.

    Returns:
        A new dict with the same keys and flattened values; the input
        mapping is not modified.
    """
    scalar_types = (str, int, float, bool)

    def _flatten(item):
        # Lists are checked first so they are joined rather than str()-ed.
        if isinstance(item, list):
            return ', '.join(str(element) for element in item)
        return item if isinstance(item, scalar_types) else str(item)

    return {name: _flatten(item) for name, item in metadata.items()}


# Define the helper function to load the database with metadata extraction
def _load_documents(file, file_type):
    """Load *file* with the LangChain loader that matches *file_type*.

    The type is compared case-insensitively so an extension taken from a
    filename such as ``Report.PDF`` is still recognised.

    Raises:
        ValueError: If *file_type* is not one of ``pdf``, ``csv``, ``docx``.
    """
    kind = str(file_type).strip().lower()
    if kind == 'pdf':
        loader = PyPDFLoader(file)
    elif kind == 'csv':
        loader = CSVLoader(file_path=file)
    elif kind == 'docx':
        loader = Docx2txtLoader(file)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")
    return loader.load()


# Define the helper function to load the database with metadata extraction
def load_db(file, file_type, chain_type, k):
    """Build a conversational retrieval chain over a single document.

    The file is loaded, its full text is run through a LlamaIndex ingestion
    pipeline (sentence splitting followed by title, entity, summary,
    question and keyword extractors), and the metadata of the first resulting
    node is copied onto every LangChain chunk before the chunks are embedded
    into a Chroma vector store.

    Args:
        file: Path of the document to load.
        file_type: One of ``pdf``, ``csv`` or ``docx`` (case-insensitive).
        chain_type: Passed through as ``chain_type`` to
            ``ConversationalRetrievalChain.from_llm``.
        k: Number of chunks the similarity retriever returns per query.

    Returns:
        The configured ``ConversationalRetrievalChain``, or ``None`` if any
        step fails (the error is printed rather than raised).
    """
    try:
        documents = _load_documents(file, file_type)

        combined_text = " ".join(doc.page_content for doc in documents)

        llm = OpenAI(model="gpt-3.5-turbo", temperature=0.2, api_key=api_key)
        entity_extractor = EntityExtractor(prediction_threshold=0.5, label_entities=False, device="cpu")
        qa_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm)
        summary_extractor = SummaryExtractor(summaries=["prev", "self"], llm=llm)
        title_extractor = TitleExtractor(nodes=5, llm=llm)
        keyword_extractor = KeywordExtractor(keywords=10, llm=llm)
        node_parser = SentenceSplitter()

        transformations = [node_parser, title_extractor, entity_extractor, summary_extractor, qa_extractor, keyword_extractor]
        pipeline = IngestionPipeline(transformations=transformations)

        document = Document(text=combined_text, metadata={})
        nodes = pipeline.run(documents=[document])

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        docs = text_splitter.split_documents(documents)

        # Only the first node's metadata is shared with the chunks. It is
        # normalized once (not per chunk), and an empty pipeline result
        # simply contributes no extra metadata instead of raising IndexError.
        shared_metadata = normalize_metadata(nodes[0].metadata) if nodes else {}
        for doc in docs:
            doc.metadata.update(shared_metadata)

        embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        db = Chroma.from_documents(docs, embeddings)
        retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

        llm_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=api_key)
        qa = ConversationalRetrievalChain.from_llm(
            llm=llm_model,
            chain_type=chain_type,
            retriever=retriever,
            return_source_documents=True,
            return_generated_question=True,
        )
        return qa
    except Exception as e:
        # Top-level boundary for the UI: report the failure and signal it to
        # the caller with None rather than propagating the exception.
        print(f"An error occurred: {e}")
        return None


class cbfs(param.Parameterized):
    """State and callbacks for the document chatbot UI.

    Holds the retrieval chain built by ``load_db`` together with the running
    chat history and the Panel rows rendered for the conversation.

    The callbacks read the module-level widgets ``file_input`` and ``inp``
    at call time, so those globals must exist before a callback fires.
    """

    # (question, answer) tuples passed back to the chain on every query.
    chat_history = param.List([])
    # Most recent answer returned by the chain.
    answer = param.String("")
    # Path of the document currently backing the chain.
    loaded_file = param.String("/Users/pradhikshasuresh/Documents/Python/Space.pdf")
    # File type handed to load_db for ``loaded_file``.
    file_type = param.String("pdf")

    def __init__(self, **params):
        """Initialise the parameters and build the chain for the default file."""
        super().__init__(**params)
        self.panels = []
        # load_db returns None when loading fails; convchain checks for that.
        self.qa = load_db(self.loaded_file, self.file_type, "stuff", 4)

    def call_load_db(self, event):
        """Rebuild the chain from the file currently held by ``file_input``.

        The uploaded bytes are written to ``temp_file`` in the working
        directory, the chain is rebuilt from it, and the chat history is
        cleared. When nothing has been uploaded the current file is kept.

        Args:
            event: Click event supplied by the button; unused.

        Returns:
            A Markdown pane naming the file that is loaded.
        """
        if file_input.value:
            with open("temp_file", "wb") as f:
                f.write(file_input.value)
            self.loaded_file = "temp_file"
            # Lower-case the extension so "Report.PDF" is treated like
            # "report.pdf"; tolerate a missing filename.
            self.file_type = (file_input.filename or "").split('.')[-1].lower()
            self.qa = load_db(self.loaded_file, self.file_type, "stuff", 4)
            self.clr_history()
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")
        else:
            return pn.pane.Markdown(f"Loaded Default File: {self.loaded_file}")

    def convchain(self, query):
        """Answer *query* and return the rendered conversation.

        Args:
            query: Text entered by the user; an empty value yields an empty
                placeholder row.

        Returns:
            A scrollable ``pn.WidgetBox`` with the conversation so far,
            including the retrieved source chunks and their metadata.
        """
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)

        # load_db signals failure with None; calling it would raise
        # "TypeError: 'NoneType' object is not callable", so report instead.
        if self.qa is None:
            return pn.WidgetBox(
                pn.Row(
                    'ChatBot:',
                    pn.pane.Markdown(
                        "No document database is loaded. Load a supported file (.pdf, .csv, .docx) and try again.",
                        width=600,
                    ),
                ),
                scroll=True,
            )

        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.append((query, result["answer"]))
        self.answer = result['answer']

        self.panels.append(
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
        )
        self.panels.append(
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))
        )

        # Append source documents with metadata to the response
        for doc in result["source_documents"]:
            metadata_str = "\n".join(f"{key}: {value}" for key, value in doc.metadata.items())
            self.panels.append(
                pn.Row('Source Document:', pn.pane.Markdown(f"**Content:**\n{doc.page_content}\n\n**Metadata:**\n{metadata_str}", width=600, styles={'background-color': '#E8E8E8'}))
            )

        inp.value = ''  # clears the input box
        return pn.WidgetBox(*self.panels, scroll=True)

    def clr_history(self, event=None):
        """Reset the chat history and the rendered panels.

        Args:
            event: Optional click event; unused.

        Returns:
            A Markdown pane confirming the reset.
        """
        self.chat_history = []
        self.panels = []  # Clear the panels as well
        return pn.pane.Markdown("Chat history cleared.")


# Instantiate the chatbot state; cbfs.__init__ calls load_db on the default
# file, so this line runs the whole ingestion pipeline.
cb = cbfs()

# Widgets. cbfs.call_load_db and cbfs.convchain look up `file_input` and
# `inp` as module-level globals when they run, so these names must not change.
file_input = pn.widgets.FileInput(accept='.pdf,.csv,.docx')
button_load = pn.widgets.Button(name="Load DB", button_type='primary')
button_load.on_click(cb.call_load_db)
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')
button_clearhistory.on_click(cb.clr_history)
inp = pn.widgets.TextInput(placeholder='Enter text here…')

# Bind the convchain method to the input field so it is re-evaluated with the
# widget's current value.
conversation = pn.bind(cb.convchain, inp)

pn.extension()
# Conversation view: the text input above the bound conversation output.
tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    conversation,
    pn.layout.Divider(),
)

# File upload controls: the file input next to the "Load DB" button.
tab2 = pn.Column(
    pn.Row(file_input, button_load),
    pn.layout.Divider(),
)

# Only tab1 is placed inside pn.Tabs; tab2 is appended to the column below
# the "Clear History" button.
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# ChatBot')),
    pn.Tabs(('Conversation', tab1)),
    pn.Row(button_clearhistory),
    tab2
)

dashboard.servable()

100%|█████████████████████████████████████████████| 5/5 [00:01<00:00,  3.79it/s]


Extracting entities:   0%|          | 0/21 [00:00<?, ?it/s]

100%|███████████████████████████████████████████| 21/21 [00:09<00:00,  2.13it/s]
100%|███████████████████████████████████████████| 21/21 [00:11<00:00,  1.89it/s]
100%|███████████████████████████████████████████| 21/21 [00:05<00:00,  3.83it/s]
  warn_deprecated(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  warn_deprecated(
