# In [None]:
import re
import os
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import gradio as gr
import pandas as pd
import ollama
from typing import List, Dict, Tuple, Union
import json
import tabula

# Shared embedding model, instantiated once at import time and passed to
# Chroma by create_vectorstore() when the PDF text chunks are indexed.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

class LLMServer:
    """Small wrapper that sends single-turn chat prompts to one Ollama model."""

    def __init__(self, model_name='gemma2:2b'):
        """Store the model name and create the Ollama client used for requests."""
        self.model_name = model_name
        self.client = ollama.Client()

    def generate_response(self, prompt):
        """Send *prompt* as one user message and return the reply's text content."""
        user_message = {'role': 'user', 'content': prompt}
        reply = self.client.chat(
            model=self.model_name,
            messages=[user_message],
        )
        message = reply['message']
        return message['content']

# Initialize the LLM server
# Module-level instance using the class's default model name; the helpers in
# this file that query the model call llm_server.generate_response().
llm_server = LLMServer()

def extract_names_from_bracketed_string(s):
    """Return the comma-separated items inside the first ``[...]`` group of *s*.

    The string is expected to look like ``"...[name1, name2]..."`` (the format
    requested from the model in check_code).

    Args:
        s: Text that may contain a bracketed, comma-separated list.

    Returns:
        A list of non-empty names with surrounding whitespace and quotes
        removed, or ``None`` when no bracketed list is found or it holds no
        usable item.
    """
    # Accept anything except nested brackets so that names containing
    # hyphens, dots or apostrophes (e.g. "COVID-19") are not rejected.
    match = re.search(r'\[([^\[\]]+)\]', s)
    if match is None:
        return None

    names = []
    for raw_name in match.group(1).split(','):
        # Drop padding and any quotes wrapped around the item.
        name = raw_name.strip().strip('"\'').strip()
        # An empty name would match every cell in a later substring search,
        # so blank items (from ",," or a trailing comma) are discarded.
        if name:
            names.append(name)
    return names or None

def extract_df(pdf_file):
    """Return the result of tabula.read_pdf for all pages of *pdf_file*.

    The read is done with ``pages='all'`` and ``multiple_tables=True``.
    """
    read_options = {'pages': 'all', 'multiple_tables': True}
    return tabula.read_pdf(pdf_file, **read_options)
def check_code(prompt):
    """Look for names (via the LLM) and code-like tokens (via regex) in *prompt*.

    Returns:
        A 2-tuple ``(names, joined)``. ``names`` is whatever
        extract_names_from_bracketed_string() parsed from the model reply.
        ``joined`` is ``None`` when the regex found no code-like token;
        otherwise it is the tokens, followed by the names if any, joined
        with ``', '``.
    """
    # One letter or digit followed by two to five digits, as a whole word.
    code_regex = r'\b[a-zA-Z0-9][0-9]{2,5}\b'

    # Ask the model for names and echo its raw reply for debugging.
    llm_reply = llm_server.generate_response(f"String:{prompt}If this string contains some specific names like people, disease or for procedure.Output names with separated by commas.Output:[name1,name2].give no explaination")
    print(llm_reply)
    found_names = extract_names_from_bracketed_string(llm_reply)

    found_codes = re.findall(code_regex, prompt)
    print(found_codes)

    # Names are only appended when at least one code-like token was found.
    if not found_codes:
        return found_names, None

    combined = list(found_codes)
    if found_names:
        combined += found_names
    return found_names, ', '.join(combined)
# def check_code(question):
#     prompt = f"You are an AI classifier. Your job is to determine whether a given question contains a code. A 'code' is defined as any alphanumeric sequence that contains both letters and numbers, such as 'S9582' or 'G15168'. If the question contains a code, respond with the code present. If it does not contain a code, respond with 'no'. Here is the question: {question}"
#     answer = llm_server.generate_response(prompt).strip()
#     return answer if answer.lower() != "no" else None

def search_dataframe_for_code(dfs, codes: Union[str, List[str], None]):
    """Find table rows containing any of *codes* as a case-insensitive substring.

    Args:
        dfs: Iterable of pandas DataFrames to search; ``None`` is treated as
            "no tables".
        codes: Search terms. Either a list of strings or one comma-separated
            string, which is split into its terms so that whole codes are
            searched rather than the individual characters of the string.

    Returns:
        A list with one DataFrame of matching rows per (term, table) pair
        that produced a match, or ``None`` when nothing matched.
    """
    if dfs is None or codes is None:
        return None
    if isinstance(codes, str):
        codes = codes.split(',')

    # Blank terms are skipped: an empty string is a substring of every cell
    # and would return every row of every table.
    needles = [str(code).strip().lower() for code in codes]
    needles = [needle for needle in needles if needle]

    # DataFrame.map only exists on newer pandas releases; applymap is the
    # older name for the same element-wise operation.
    cellwise = "map" if hasattr(pd.DataFrame, "map") else "applymap"

    results = []
    for needle in needles:
        for df in dfs:
            mask = getattr(df, cellwise)(
                lambda cell, needle=needle: needle in str(cell).lower()
            )
            result = df[mask.any(axis=1)]
            if not result.empty:
                results.append(result)
    return results if results else None

def extract_text_from_pdf(pdf_file: str) -> str:
    """Read *pdf_file* with PyPDF2 and return all page texts joined by newlines."""
    with open(pdf_file, 'rb') as handle:
        pdf_reader = PyPDF2.PdfReader(handle)
        # Extract eagerly while the file is still open.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    return "\n".join(page_texts)

def create_vectorstore(texts: str, folder_name: str) -> Chroma:
    """Split *texts* into overlapping chunks and index them in a Chroma store.

    Chunks are produced with chunk_size=500 and chunk_overlap=100, embedded
    with the module-level ``embeddings`` model, and the store is persisted
    under ``<folder_name>/chroma_db``.

    NOTE(review): the persist directory is the same for every call with the
    same folder_name -- confirm whether chunks from earlier PDFs are meant to
    remain in the store.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    text_chunks = splitter.split_text(texts)
    store_dir = os.path.join(folder_name, "chroma_db")

    store = Chroma.from_texts(
        texts=text_chunks,
        embedding=embeddings,
        persist_directory=store_dir,
    )
    store.persist()
    return store

def process_pdf(pdf_file: str) -> Tuple[Chroma, List[pd.DataFrame]]:
    """Extract text and tables from *pdf_file* and index the text.

    Creates the ``pdf_content`` working folder if needed, then returns the
    pair ``(vectorstore, tables)`` built from extract_text_from_pdf(),
    extract_df() and create_vectorstore(). Progress is printed to stdout.
    """
    output_dir = "pdf_content"
    os.makedirs(output_dir, exist_ok=True)

    print("Extracting text from PDF...")
    pdf_text = extract_text_from_pdf(pdf_file)

    print("Extracting tables from PDF...")
    pdf_tables = extract_df(pdf_file)

    print("Creating vector store...")
    store = create_vectorstore(pdf_text, output_dir)

    return store, pdf_tables

def word_search(chunks: List[str], keywords: List[str], k: int = 5) -> List[str]:
    """Return the first *k* chunks after ordering by keyword hits, highest first.

    A chunk scores one point per keyword that occurs in it as a
    case-insensitive substring; chunks with equal scores keep their input
    order.
    """
    def keyword_hits(text):
        hits = 0
        for term in keywords:
            if term.lower() in text.lower():
                hits += 1
        return hits

    ranked = list(chunks)
    ranked.sort(key=keyword_hits, reverse=True)
    return ranked[:k]

def extract_keywords(question: str) -> List[str]:
    """Ask the LLM for keywords describing *question*.

    The model is asked to answer as ``[keyword1,keyword2,keyword3]``. That
    form is not valid JSON, so the reply is parsed leniently: a JSON list is
    used when present, otherwise the first bracketed, comma-separated group.

    Args:
        question: The user's question.

    Returns:
        A list of keyword strings. When no keyword can be parsed from the
        reply, the whitespace-separated words of *question* are returned.
    """
    prompt = f"Extract keywords from: {question}\nOutput format: [keyword1,keyword2,keyword3]"
    response = llm_server.generate_response(prompt)
    keywords = _parse_keyword_list(response)
    return keywords if keywords else question.split()


def _parse_keyword_list(response: str) -> List[str]:
    """Extract a list of keyword strings from a model reply; ``[]`` if none."""
    try:
        parsed = json.loads(response)
    except (json.JSONDecodeError, TypeError):
        parsed = None

    if isinstance(parsed, list):
        items = [str(item) for item in parsed]
    else:
        # Valid JSON that is not a list (a bare string, number or object) is
        # ignored; fall back to the bracketed form requested in the prompt.
        bracketed = None
        if isinstance(response, str):
            bracketed = re.search(r'\[([^\[\]]*)\]', response)
        items = bracketed.group(1).split(',') if bracketed else []

    # Remove padding and any quotes around each item; drop blanks.
    cleaned = (item.strip().strip('"\'').strip() for item in items)
    return [item for item in cleaned if item]
def get_dynamic_k(question: str) -> int:
    """Return the retrieval depth for *question*: one per five words, clamped to 1..5."""
    word_count = len(question.split())
    per_five_words = word_count // 5
    if per_five_words < 1:
        return 1
    if per_five_words > 5:
        return 5
    return per_five_words

def generate_multiple_queries(original_query: str) -> List[str]:
    """Return the original query followed by LLM-generated related queries.

    The model is asked for three numbered queries. Every non-blank reply line
    containing ``'. '`` contributes the text after its first ``'. '``. At
    most four entries (the original plus three) are returned.
    """
    prompt = f"""
    Given the following user query, generate 3 related queries that could help provide a more comprehensive answer.
    Original query: {original_query}
    
    Output format:
    1. [First related query]
    2. [Second related query]
    3. [Third related query]

    Give no explaination.
    """
    response = llm_server.generate_response(prompt)

    generated = []
    for raw_line in response.split('\n'):
        if not raw_line.strip():
            continue
        # Keep what follows the first ". " (the numbering prefix), if present.
        _, separator, remainder = raw_line.partition('. ')
        if separator:
            generated.append(remainder.strip())

    # Limit to 4 queries (original + 3 generated)
    return ([original_query] + generated)[:4]
def process_single_query(question: str, vectorstore, tables) -> Dict:
    """Answer one question from retrieved text chunks only.

    Extracts keywords, retrieves chunks through unified_retrieval() and asks
    the LLM to answer from that context. *tables* is accepted but not used
    here.

    Returns:
        A dict with the keys ``answer``, ``keywords_used``,
        ``retrieved_documents`` and ``source`` (always ``"text"``).
    """
    keywords = extract_keywords(question)
    print(f"Keywords extracted: {keywords}")

    top_k = get_dynamic_k(question)
    documents = unified_retrieval(vectorstore, question, keywords, top_k)
    context = "\n".join(documents)

    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer based on the context. If insufficient information, say 'I don't have enough information to answer this question.'"
    llm_answer = llm_server.generate_response(prompt)

    outcome = {}
    outcome["answer"] = llm_answer
    outcome["keywords_used"] = keywords
    outcome["retrieved_documents"] = documents
    outcome["source"] = "text"
    return outcome

def unified_retrieval(vectorstore: Chroma, question: str, keywords: List[str], k: int = 5) -> List[str]:
    """Retrieve up to *k* chunks, ranked by keyword hits and then similarity order.

    The candidates are the ``k`` similarity-search results for *question*.
    word_search() is run over those same candidates, the two lists are
    merged without duplicates, and the merged set is ordered by the number
    of keywords present (more first) and then by position in the similarity
    results (earlier first).
    """
    hits = vectorstore.similarity_search(question, k=k)
    all_chunks = [hit.page_content for hit in hits]

    keyword_docs = word_search(all_chunks, keywords, k=k)

    unique_docs = list(set(all_chunks + keyword_docs))

    # Position of each chunk's first appearance in the similarity results.
    first_position = {}
    for position, chunk in enumerate(all_chunks):
        first_position.setdefault(chunk, position)

    def ranking_key(doc):
        hit_count = 0
        for term in keywords:
            if term.lower() in doc.lower():
                hit_count += 1
        return (hit_count, -first_position.get(doc, len(all_chunks)))

    unique_docs.sort(key=ranking_key, reverse=True)
    return unique_docs[:k]

def get_dynamic_k(question: str) -> int:
    """Return ``len(words) // 5`` for *question*, limited to the range 1..5.

    NOTE: a function with this name is also defined earlier in this file;
    being the later definition, this one is the binding in effect afterwards.
    """
    words = question.split()
    at_least_one = max(len(words) // 5, 1)
    return at_least_one if at_least_one < 5 else 5

def process_code_results(question: str, code: str, search_results: List[pd.DataFrame]) -> str:
    """Ask the LLM to answer *question* from the table rows found for *code*.

    Each DataFrame in *search_results* is rendered with ``to_string()`` and
    the renderings are embedded, newline-separated, in the prompt. Returns
    the model's reply text.
    """
    rendered_tables = []
    for frame in search_results:
        rendered_tables.append(frame.to_string())
    results_str = "\n".join(rendered_tables)

    prompt = f"""
    Question: {question}
    Code: {code}
    
    The following table data was found for the code:
    
    {results_str}
    
    Based on this information, please provide an answer to the question. 
    If you need any clarification or if the information is insufficient, please state so.
    """
    llm_answer = llm_server.generate_response(prompt)
    return llm_answer
def evaluate_answer_quality(question: str, answer: str) -> bool:
    """Ask the LLM whether *answer* addresses *question*.

    Args:
        question: The user's question.
        answer: The candidate answer to judge.

    Returns:
        ``True`` when the model's verdict starts with the word "yes"
        (ignoring case and leading punctuation/markup), otherwise ``False``.
    """
    print(answer)
    prompt = f"""
    Question: {question}
    Answer: {answer}

    Evaluate if the given answer addresses the question. 
    Output: Yes/No
    """
    evaluation = llm_server.generate_response(prompt).strip().lower()
    print(evaluation)
    # The verdict is rarely the bare token "yes": replies such as "Yes.",
    # "**Yes**" or "Yes, it does" must also count, while words that merely
    # begin with those letters (e.g. "yesterday") must not.
    return re.match(r'\W*yes\b', evaluation) is not None
def ask_question_interface(question: str, global_vectorstore, global_tables) -> str:
    """Answer *question* about the processed PDF and return a JSON string.

    Steps:
      1. Without a vector store, return a JSON error asking for a PDF.
      2. If check_code() reports codes and tables are available, search the
         tables for them; on a hit the answer is produced from those rows.
      3. Otherwise retrieve text chunks, add any table rows matching the
         names check_code() reported, and ask the model.
      4. If evaluate_answer_quality() rejects that answer, answer each query
         from generate_multiple_queries() separately and let the model
         combine the results.

    Args:
        question: The user's question.
        global_vectorstore: Vector store for the processed PDF, or ``None``.
        global_tables: Tables extracted from the PDF, or ``None``.

    Returns:
        A JSON-formatted string (indent=2) holding either an ``error``, a
        table-based ``answer``, the direct answer with its keywords and
        documents, or the multi-query result.
    """
    if global_vectorstore is None:
        return json.dumps({"error": "Please upload and process a PDF first."}, indent=2)

    names, code = check_code(question)
    if code is not None and global_tables is not None:
        # check_code() returns the terms as one ', '-joined string; split it
        # back so whole terms are searched, not the string's characters.
        codes = [part.strip() for part in code.split(',') if part.strip()]
        search_results = search_dataframe_for_code(global_tables, codes)
        if search_results:
            answer = process_code_results(question, code, search_results)
            return json.dumps({"answer": answer}, indent=2)
    print(names)

    # Always bind search_results so the check below cannot hit an unbound
    # local when neither codes nor names were found.
    search_results = None
    if names and global_tables is not None:
        search_results = search_dataframe_for_code(global_tables, names)

    keywords = extract_keywords(question)
    print(keywords)
    k = get_dynamic_k(question)
    retrieved_docs = unified_retrieval(global_vectorstore, question, keywords, k)
    print(retrieved_docs)
    context = "\n".join(retrieved_docs)

    results_str = 'none'
    if search_results:
        results_str = "\n".join([df.to_string() for df in search_results])
    print(context)

    prompt = f"some extra info:{results_str}Context: {context}\n\nQuestion: {question}\n\n Answer the question based on the context."
    answer = llm_server.generate_response(prompt)

    if evaluate_answer_quality(question, answer):
        output = {
            "answer": answer,
            "keywords_used": keywords,
            "retrieved_documents": retrieved_docs,
        }
    else:
        # If not good enough, proceed with multiple queries approach
        queries = generate_multiple_queries(question)
        print(f"Generated queries: {queries}")

        results = []
        for query in queries:
            result = process_single_query(query, global_vectorstore, global_tables)
            results.append({"query": query, "result": result})

        combined_prompt = f"""
        Original question: {question}
        
        Here are the results from multiple related queries:
        
        {json.dumps(results, indent=2)}
        
        Please provide a comprehensive answer to the original question based on all these results.
        If there are conflicting information or insufficient data, please mention it.
        """

        final_answer = llm_server.generate_response(combined_prompt)

        output = {
            "original_question": question,
            "final_answer": final_answer,
            "approach": "multiple_queries",
            "individual_results": results,
        }

    return json.dumps(output, indent=2)

# Module-level state shared by the Gradio callbacks: assigned by
# process_pdf_interface() and read when a question is asked. Both stay None
# until a PDF has been processed.
global_vectorstore = None
global_tables = None

def process_pdf_interface(pdf_file: "gr.File") -> str:
    """Gradio callback: process the uploaded PDF and store the results globally.

    Args:
        pdf_file: Value delivered by the File input. An object exposing the
            file path as ``.name`` and a plain path string are both accepted
            (which one Gradio passes depends on its version/configuration --
            TODO confirm). ``None`` means nothing was uploaded.

    Returns:
        A status message for the "Processing Result" textbox.
    """
    global global_vectorstore, global_tables
    # Clicking the button without an upload passes None; report that instead
    # of failing on the attribute access below.
    if pdf_file is None:
        return "Please upload a PDF file first."
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    global_vectorstore, global_tables = process_pdf(pdf_path)
    return "PDF processed successfully!"

# Build the two-tab Gradio UI: one tab to upload/process a PDF, one to ask
# questions about it.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Chatbot with LLM and Table Extraction")
    
    # Tab 1: upload a PDF and run process_pdf_interface on it.
    with gr.Tab("Process PDF"):
        pdf_file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        process_button = gr.Button("Process PDF")
        process_output = gr.Textbox(label="Processing Result")
        
        process_button.click(process_pdf_interface, inputs=pdf_file_input, outputs=process_output)
    
    # Tab 2: ask a question; the JSON string returned by
    # ask_question_interface is shown in the JSON output component.
    with gr.Tab("Ask Questions"):
        question_input = gr.Textbox(label="Enter your question about the PDF")
        ask_button = gr.Button("Ask")
        answer_output = gr.JSON(label="Answer")
        
        # The lambda looks up the module globals when it is called, so it
        # sees whatever process_pdf_interface stored most recently.
        ask_button.click(
            lambda q: ask_question_interface(q, global_vectorstore, global_tables),
            inputs=question_input,
            outputs=answer_output
        )

if __name__ == "__main__":
    # NOTE(review): share=True asks Gradio for a publicly reachable share
    # link in addition to the local server -- confirm that exposing the
    # uploaded PDFs and answers this way is intended.
    demo.launch(share=True)