In [None]:
!pip install langchain langchain-community langchain-openai torch torchvision torchaudio pillow transformers sentencepiece accelerate bitsandbytes ragas

In [None]:
!pip install git+https://github.com/VikParuchuri/marker faiss-cpu nougat-ocr git+https://github.com/facebookresearch/nougat

In [None]:
!pip install --no-deps unstructured chardet langdetect python-iso639 unstructured-client

In [None]:
!pip install chromadb langchain-chroma

In [None]:
!pip install ipywidgets jupyterlab-widgets==2.0.0b0

In [None]:
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Checkpoint shared by the vision-language model and its tokenizer.
# `trust_remote_code=True` lets the checkpoint's own modelling code run.
VISION_MODEL_ID = 'openbmb/MiniCPM-Llama3-V-2_5-int4'

tokenizer = AutoTokenizer.from_pretrained(VISION_MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(VISION_MODEL_ID, trust_remote_code=True)

# Switch to inference mode; as the cell's last expression this also displays the model.
model.eval()

In [None]:
def generate_image_details(markdown_path, model, tokenizer, context_chars=4000):
    """Write a copy of a markdown file with a model-generated description after every image.

    The whole markdown file is read, every ``![caption](image)`` reference is
    located, the referenced image is opened relative to the markdown file's
    folder and described by ``model.chat``. The description is inserted
    right after the image reference. The result is written once to
    ``<name>_with_image.md`` next to the source file, so text without images
    is preserved and no part of the document is duplicated.

    Args:
        markdown_path: Path of the markdown file to annotate.
        model: Object exposing ``chat(image=, msgs=, tokenizer=, sampling=,
            temperature=)`` and returning the description text.
        tokenizer: Tokenizer forwarded unchanged to ``model.chat``.
        context_chars: Approximate number of characters of surrounding markdown
            (half before, half after the image reference) sent to the model as
            context for each image.

    Returns:
        The path of the annotated markdown file that was written.

    Raises:
        OSError: If the markdown file or a referenced image cannot be opened.
    """
    # Local imports: this cell runs before the notebook's main import cell.
    import os
    import re

    markdown_folder_path = os.path.dirname(markdown_path)
    # Keep the `split('.')[0]` naming so RAG.load_image_markdown finds the file.
    markdown_file_name = os.path.basename(markdown_path).split('.')[0]
    markdown_with_image = os.path.join(markdown_folder_path, f'{markdown_file_name}_with_image.md')

    image_pattern = re.compile(r"(?:[!]\[(?P<caption>.*?)\])\((?P<image>.*?)\)", re.MULTILINE)
    instructions = ("\nBased from the above context explain in detail what the image is about? Please analyze the image thoroughly. The image may contain various forms of data representation such as charts, graphs, tables, etc."
                    "For charts and graphs, identify and describe the type, axes, labels, data points, trends, and any significant peaks, troughs, or patterns. "
                    "Highlight any anomalies or outliers and discuss their possible implications. Extract and report all key numerical values. "
                    "For other types of images, describe all visible elements and their relationships in detail. Provide a clear and precise summary of the key features,"
                    "interpret the data where applicable, and make note of any notable observations or ambiguities."
                    "It is crucial to extract all key numerical values if present in the image.")

    with open(markdown_path, 'r', encoding='utf-8') as read_file:
        markdown_text = read_file.read()

    half_window = max(0, context_chars) // 2
    pieces = []
    cursor = 0
    for match in image_pattern.finditer(markdown_text):
        # The path lives in the parenthesised part of the tag, not in the caption.
        image_ref = match.group('image')
        print(image_ref)
        image = Image.open(os.path.join(markdown_folder_path, image_ref)).convert('RGB')

        # Give the model the text surrounding the image rather than the whole file.
        window_start = max(0, match.start() - half_window)
        window_end = match.end() + half_window
        question = markdown_text[window_start:window_end] + instructions

        msgs = [{'role': 'user', 'content': question}]
        response = model.chat(
            image = image,
            msgs = msgs,
            tokenizer = tokenizer,
            sampling = True,
            temperature = 0.1,
        )
        print(response)

        # Copy everything up to and including the image tag, then the description.
        pieces.append(markdown_text[cursor:match.end()])
        pieces.append(f'\n{response}\n')
        cursor = match.end()
    # Remaining text after the last image (or the whole file if it has no images).
    pieces.append(markdown_text[cursor:])

    with open(markdown_with_image, 'w', encoding='utf-8') as write_file:
        write_file.write(''.join(pieces))
    return markdown_with_image

In [None]:
import glob
import os  # needed here: the notebook's main import cell only runs later

# For every evaluation PDF, annotate the matching converted markdown file with
# model-generated image descriptions.
# NOTE(review): generate_image_details writes its output next to `markdown_path`;
# confirm that /kaggle/input/results is writable in this environment.
pdfs = glob.glob('/kaggle/input/eval-files/*.pdf')
for file_path_pdf in pdfs:
    file_name = os.path.basename(file_path_pdf).split('.')[0]
    markdown_path = f'/kaggle/input/results/{file_name}/{file_name}.md'
    generate_image_details(markdown_path, model, tokenizer)

In [None]:
import os
from typing import List
from langchain_community.document_loaders import UnstructuredMarkdownLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain.schema.output_parser import StrOutputParser, BaseOutputParser
from langchain.retrievers import ParentDocumentRetriever, MultiQueryRetriever, ContextualCompressionRetriever, EnsembleRetriever
from langchain.storage import InMemoryStore
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline, EmbeddingsFilter
from marker.logger import configure_logging
from dotenv import load_dotenv
import re


# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()
# Apply the logging setup provided by marker.logger.
configure_logging()

class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser that turns an LLM completion into a list of non-empty lines."""

    def parse(self, text: str) -> List[str]:
        """Split ``text`` on newlines, trimming each line and dropping blank ones.

        Blank lines (commonly emitted between alternative questions) are
        removed so they are not used as empty search queries downstream.

        Args:
            text: Raw text produced by the language model.

        Returns:
            The non-empty lines of ``text`` in their original order.
        """
        stripped_lines = (line.strip() for line in text.strip().split("\n"))
        return [line for line in stripped_lines if line]

class RAG:
    """Retrieval-augmented question answering over converted PDF markdown.

    Combines a Chroma vector store, a parent-document retriever, a multi-query
    retriever and an embeddings-based compression pipeline with an Azure OpenAI
    chat model. Images referenced in retrieved markdown are described by a
    vision-language model and the description is appended to the context.
    """

    # Maximum number of retrieved documents forwarded to the chat model.
    MAX_CONTEXT_DOCS = 4

    # Markdown image reference: ![caption](image-path)
    IMAGE_PATTERN = r"(?:[!]\[(?P<caption>.*?)\])\((?P<image>.*?)\)"

    def __init__(self, visual_model, tokenizer):
        """Create the embedding/chat clients and keep the vision model.

        Args:
            visual_model: Vision-language model exposing ``chat(image=, msgs=,
                tokenizer=, sampling=, temperature=)``.
            tokenizer: Tokenizer passed to ``visual_model.chat``.
        """
        # NOTE(review): the Azure credentials below are blank placeholders and
        # must be filled in (or supplied another way) before the clients work.
        self.embeddings_model = AzureOpenAIEmbeddings(
            api_key="",
            azure_endpoint="",
            azure_deployment="",
            openai_api_version=""
        )
        self.chat_model : BaseLanguageModel = AzureChatOpenAI(
            api_key="",
            azure_endpoint="",
            azure_deployment="",
            openai_api_version=""
        )
        self.visual_model = visual_model
        self.tokenizer = tokenizer

    @staticmethod
    def _pdf_stem(file_path_pdf):
        """Return the PDF file name up to its first dot (e.g. ``a/b/x.pdf`` -> ``x``)."""
        return os.path.basename(file_path_pdf).split('.')[0]

    @staticmethod
    def _build_image_prompt(text, question):
        """Build the prompt sent to the vision model for one image.

        Args:
            text: Retrieved document text used as context.
            question: The user's question.

        Returns:
            A single string: context, the question and the analysis instructions.
        """
        return text + (
            f'\nBased from the above context explain in detail and answer the question.'
            f'\nQuestion : {question}\nElaborate on all the details regarding the image structure,'
            'colours and all the text in the image.'
            "Please analyze the image thoroughly. The image may contain various forms of data representation such as charts, graphs, tables, etc."
            "For charts and graphs, identify and describe the type, axes, labels, data points, trends, and any significant peaks, troughs, or patterns. "
            "Highlight any anomalies or outliers and discuss their possible implications. Extract and report all key numerical values. "
            "For other types of images, describe all visible elements and their relationships in detail. Provide a clear and precise summary of the key features,"
            "interpret the data where applicable, and make note of any notable observations or ambiguities."
            "It is crucial to extract all key numerical values if present in the image."
        )

    def _load_chain(self):
        """Assemble ``self.chain``: retrieve, trim, describe images, prompt, answer.

        Requires ``self.retriever`` to be set (done by ``load_file``). The chain
        is invoked with ``{"input": <question>}`` and returns the answer string.
        """
        system_prompt = (
            "You are an assistant for question-answering tasks. "
            "Use the following pieces of retrieved context to answer "
            "the question. If you don't know the answer, say that you "
            "don't know. Answer to the point"
            "\n\n"
            "{context}\n"
        )

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", '{input}'),
            ]
        )

        def get_image_context(state):
            """Append a vision-model description for every image in the context docs."""
            user_question = state['input']
            for doc in state['context']:
                text = doc.page_content
                markdown_folder_path = os.path.dirname(doc.metadata['source'])
                # Matches are taken from the original page content; `text` grows
                # as descriptions are appended.
                for match in re.finditer(self.IMAGE_PATTERN, doc.page_content, re.MULTILINE):
                    # The path is the parenthesised part of the tag, not the caption.
                    image_ref = match.group('image')
                    image = Image.open(os.path.join(markdown_folder_path, image_ref)).convert('RGB')
                    msgs = [{'role': 'user', 'content': self._build_image_prompt(text, user_question)}]
                    # Use the model/tokenizer given to this instance, not notebook globals.
                    response = self.visual_model.chat(
                        image = image,
                        msgs = msgs,
                        tokenizer = self.tokenizer,
                        sampling = True,
                        temperature = 0.1,
                    )
                    label = match.group('caption') or image_ref
                    text = text + f'\n{label} : {response}\n'
                doc.page_content = text
                print(doc.page_content)
            return state

        def limit_context(state):
            """Keep at most MAX_CONTEXT_DOCS documents and print their sources."""
            state['context'] = state['context'][:self.MAX_CONTEXT_DOCS]
            print([document.metadata['source'] for document in state['context']])
            return state

        def convert_docs(docs):
            """Copy retrieved documents so later edits do not touch the stored ones."""
            return [Document(page_content=doc.page_content, metadata=doc.metadata) for doc in docs]

        def format_context(state):
            """Render the context documents as plain text for the prompt."""
            joined = "\n\n".join(doc.page_content for doc in state['context'])
            return {**state, 'context': joined}

        # "input" must be the question string itself: it is rendered into the
        # prompt and embedded in the vision-model question.
        self.chain = (
            {
                "input" : RunnableLambda(lambda x : x['input']),
                "context" : (lambda x : x['input']) | self.retriever | RunnableLambda(convert_docs),
            }
            | RunnableLambda(limit_context)
            | RunnableLambda(get_image_context)
            | RunnableLambda(format_context)
            | prompt
            | self.chat_model
            | StrOutputParser()
        )

    def load_image_markdown(self, file_path_pdf):
        """Return a loader for the image-annotated markdown (``*_with_image.md``) of a PDF."""
        file_name = self._pdf_stem(file_path_pdf)
        markdown_folder_path = f'/kaggle/input/results/{file_name}'
        markdown_with_image = f'{markdown_folder_path}/{file_name}_with_image.md'
        return UnstructuredMarkdownLoader(markdown_with_image)

    def load_marker_markdown(self, file_path_pdf):
        """Return a plain-text loader for the ``.md`` file stored under eval-results for a PDF."""
        file_name = self._pdf_stem(file_path_pdf)
        subfolder_path = f'/kaggle/input/eval-results/{file_name}'
        markdown_file_name = os.path.basename(subfolder_path)
        markdown_path = f'{subfolder_path}/{markdown_file_name}.md'
        return TextLoader(markdown_path)

    def load_nougat_markdown(self, file_path_pdf):
        """Return an element-mode loader for the ``.mmd`` file stored under eval-results for a PDF."""
        file_name = self._pdf_stem(file_path_pdf)
        output_folder_path = f'/kaggle/input/eval-results/{file_name}'
        markdown_path = f'{output_folder_path}/{file_name}.mmd'
        return UnstructuredMarkdownLoader(markdown_path, mode="elements")

    def create_vectorstore(self):
        """Create the persistent Chroma collection used by all retrievers."""
        self.vectorstore = Chroma(collection_name="split_parents", embedding_function=self.embeddings_model, persist_directory="/kaggle/working/chroma_chunks_1")

    def create_chroma_retriever(self):
        """Return a plain similarity retriever over ``self.vectorstore``."""
        return self.vectorstore.as_retriever()

    def create_parent_document_retriever(self):
        """Return a ParentDocumentRetriever indexing small chunks but returning larger parents.

        Parent documents are held in an ``InMemoryStore`` and therefore only
        live as long as the returned retriever object.
        """
        child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=400)
        return ParentDocumentRetriever(
            vectorstore=self.vectorstore,
            docstore=InMemoryStore(),
            child_splitter=child_splitter,
            parent_splitter=parent_splitter,
            # Correct keyword is `search_kwargs`; a misspelt name is not applied.
            search_kwargs={'k' : 2}
        )

    def load_file(self, parent_document_retriever, faiss_retriever):
        """Build ``self.retriever`` from the given retrievers and assemble the chain.

        Args:
            parent_document_retriever: Retriever already populated with documents.
            faiss_retriever: Base vector-store retriever wrapped by the
                multi-query retriever (the name is historical; any retriever works).
        """
        query_prompt = PromptTemplate(
            input_variables=["question"],
            template="""You are an AI language model assistant. Your task is to generate 3
                        different versions of the given user question to retrieve relevant documents from a vector
                        database. By generating multiple perspectives on the user question, your goal is to help
                        the user overcome some of the limitations of the distance-based similarity search.
                        Provide these alternative questions separated by newlines.
                        Original question: {question}""",
        )
        output_parser = LineListOutputParser()
        multiqueryretriever = MultiQueryRetriever(
            retriever = faiss_retriever,
            llm_chain = (query_prompt | self.chat_model | output_parser),
            parser_key = "lines"
        )
        ensemble_retriever = EnsembleRetriever(
            retrievers=[multiqueryretriever,parent_document_retriever], weights=[0.4, 0.6]
        )
        # Drop near-duplicate documents first, then those too dissimilar to the query.
        redundant_filter = EmbeddingsRedundantFilter(embeddings=self.embeddings_model, similarity_threshold=0.85)
        relevant_filter = EmbeddingsFilter(embeddings=self.embeddings_model, similarity_threshold=0.53)
        pipeline_compressor = DocumentCompressorPipeline(
            transformers=[redundant_filter,relevant_filter]
        )
        self.retriever = ContextualCompressionRetriever(
            base_compressor=pipeline_compressor, base_retriever=ensemble_retriever
        )
        self._load_chain()

    def get_response(self, input):
        """Answer a question.

        Args:
            input: The question text.

        Returns:
            A ``(response, context)`` tuple: the answer string and the page
            contents returned by the retriever for the question.
        """
        response = self.chain.invoke({"input" : input})
        # NOTE(review): this is a second, separate retrieval; its result may
        # differ from the documents the chain actually used for `response`.
        context = [doc.page_content for doc in self.retriever.invoke(input)]
        return response, context

In [None]:
import glob
import torch 
from langchain_community.vectorstores import utils as chromautils

# Build the pipeline object, its vector store and the two base retrievers.
rag_instance = RAG(model, tokenizer)
rag_instance.create_vectorstore()
parent_document_retriever = rag_instance.create_parent_document_retriever()
chroma_retriever = rag_instance.create_chroma_retriever()

pdfs = glob.glob('/kaggle/input/eval-files/*.pdf')
print(pdfs)

# Index every PDF once per markdown variant, one variant at a time:
# marker output first, then nougat output, then the image-annotated markdown.
loader_factories = (
    rag_instance.load_marker_markdown,
    rag_instance.load_nougat_markdown,
    rag_instance.load_image_markdown,
)
for make_loader in loader_factories:
    for file_path_pdf in pdfs:
        loaded_docs = make_loader(file_path_pdf).load()
        # Strip metadata values the vector store cannot persist.
        docs = chromautils.filter_complex_metadata(loaded_docs)
        parent_document_retriever.add_documents(docs)

rag_instance.load_file(parent_document_retriever, chroma_retriever)

In [None]:
import pandas as pd

# One question per row in the 'Questions' column.
questions = pd.read_excel('/kaggle/input/eval-questions/Test_questions_IIT4th.xlsx')

# Column-wise accumulator for the exported results.
answer = {'question' : [], 'answer' : [], 'context' : []}
for question in questions['Questions']:
    response, context = rag_instance.get_response(question)
    print(question, response, context, sep='\n')
    for column, value in (('question', question), ('answer', response), ('context', context)):
        answer[column].append(value)

answer_dataframe = pd.DataFrame(answer)
answer_dataframe.to_csv('/kaggle/working/samarth-final.csv')

# SOME SAMPLE RESPONSES FOR THE GIVEN QUESTIONS

In the Scaled Dot-Product Attention mechanism, when the queries (Q) and keys (K) are input, the flow involves the following steps:

1. **Input**: The input consists of queries and keys, which are vectors of dimension dk.
2. **Dot Products**: The attention mechanism computes the dot products of the query with all keys. This is a multiplication operation between the query and key vectors.
3. **Scaling**: Each dot product is then divided by the square root of dk. This scaling step is crucial as it helps in preventing the dot products from growing too large.
4. **Softmax Function**: After scaling, a softmax function is applied to obtain the weights on the values. The softmax function is used to normalize the weights so that they form a probability distribution over the possible outputs.

After these steps, the final output is computed as a weighted sum of the values, where the weight assigned to each value is determined by the compatibility function of the query with the corresponding key.

In the Scaled Dot-Product Attention mechanism, when the queries (Q) and keys (K) are input, the flow involves the following steps:

1. **Computing Q, K, V**: The input queries (Q) and keys (K) are used to compute the query and key matrices. These matrices are typically derived from the input data through some form of transformation or encoding.

2. **Computing Attention Weights**: The dot products of the query with all keys are computed, each is divided by √dk, and then a softmax function is applied to obtain the weights on the values. This step involves taking the dot product of the query and key matrices, dividing by the square root of the key dimension (dk), and applying a softmax function to ensure that the attention weights are non-negative and sum up to one.

3. **Computing Attention Output**: After computing the attention weights, the attention output is computed by multiplying the weight matrix with the value matrix, producing the final output of the attention mechanism.

This process results in the computation of the attention weights based on the input queries (Q) and keys (K), which are then used to derive the attention output.

In [None]:
# Folder/file stem of every evaluation PDF (name up to the first dot).
pdf_names = [os.path.basename(file_path_pdf).split(sep='.')[0] for file_path_pdf in pdfs]

# Load the converted markdown of each PDF as unstructured elements:
# the `.md` file first, then the `.mmd` file, whichever of them exist.
documents = []
for dir_name in pdf_names:
    result_dir = f'/kaggle/input/eval-results/{dir_name}'
    for extension in ('md', 'mmd'):
        for markdown in glob.glob(f'{result_dir}/{dir_name}.{extension}'):
            documents.extend(UnstructuredMarkdownLoader(markdown, mode='elements').load())
len(documents)

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
import nest_asyncio

# Allow nested event loops inside the notebook's already-running loop.
nest_asyncio.apply()

# Reuse the pipeline's chat and embedding models for test set generation.
generator_llm = rag_instance.chat_model
embeddings = rag_instance.embeddings_model

# Split the loaded documents into larger chunks for question generation.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=400)
splits = text_splitter.split_documents(documents)

# The same model acts as both generator and critic.
generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=generator_llm,
    embeddings=embeddings,
)

# Share of each question type in the generated test set.
question_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}
testset = generator.generate_with_langchain_docs(
    splits,
    test_size=10,
    distributions=question_mix,
)

In [None]:
# Display the generated test set as a DataFrame.
testset.to_pandas()

In [None]:
from datasets import Dataset

# Convert the test set once and reuse the frame for both columns.
testset_frame = testset.to_pandas()
questions = testset_frame["question"].to_list()
ground_truth = testset_frame["ground_truth"].to_list()

data = {"question": [], "answer": [], "contexts": [], "ground_truth": ground_truth}

for query in questions:
    # get_response returns (answer, contexts); store them in separate columns
    # instead of appending the whole tuple as the answer.
    response, contexts = rag_instance.get_response(query)
    data["question"].append(query)
    data["answer"].append(response)
    data["contexts"].append(contexts)

dataset = Dataset.from_dict(data)

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
)

# Allow nested event loops inside the notebook's already-running loop.
nest_asyncio.apply()

# Retrieval-side metrics first, then answer-side metrics.
evaluation_metrics = [
    context_relevancy,
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy,
    answer_similarity,
]

# Score the collected answers with the same LLM and embeddings used for generation.
result = evaluate(
    dataset=dataset,
    llm=generator_llm,
    embeddings=embeddings,
    metrics=evaluation_metrics,
)

In [None]:
# Display the evaluation result as a DataFrame.
result.to_pandas()