In [6]:
# Make the project root importable when the notebook is run locally from a
# sub-folder of the repository (project packages are imported in later cells).
import os
import sys

current_dir = os.getcwd()
# The project root is taken to be the parent of the kernel's working directory.
base_dir = os.path.dirname(current_dir)

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if base_dir not in sys.path:
    sys.path.append(base_dir)

In [11]:
%load_ext autoreload
%autoreload 2
from src.services.file_service import load_pdf, split_document
from src.services.vectordb_service import ChromaDB
from src.utils.util import pretty_print_docs
from config import TEMPLATE, MODEL, DOC_PATH
DOC_PATH = base_dir+'/'+DOC_PATH+'/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAI

import pprint

In [9]:
# Pretty-print the prompt template imported from config to review its wording.
pprint.pp(TEMPLATE)

('Use ONLY the following pieces of context to answer the question at the end.\n'
 "If you don't know the answer, just say that you don't know, NEVER make up an "
 'answer.\n'
 'Summarize in bullet point format. Keep the answer as concise as possible.\n'
 '{context}\n'
 'Question: {question}\n'
 'Helpful Answer:')


In [12]:
# Load the source document from DOC_PATH with the project's load_pdf helper;
# `pages` is what gets split into chunks in the next step.
pages = load_pdf(doc_path=DOC_PATH)

Loading file xLSTM_paper.pdf ...
File load correctly. Contains 55 pages


In [13]:

# Chunking parameters for this run, named so they are easy to spot and tune.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 400

# Split the loaded pages into overlapping chunks.
text_splits = split_document(
    pages,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    strategy='recursive',
)

# Build the Chroma wrapper from the chunks and keep a handle on its vector
# store, which the retrievers are created from.
chroma = ChromaDB(text_splits)
vectorstore = chroma.get_vectorstore()

Document spliting. Chunk size 800 - Chunk overlap 400 - Strategy recursive
Splits generated 364
Creating a local embedding vector DB on directory data/chroma
DB created successfuly. Collection count: 364 



In [14]:
# LLM used only by the document compressor. It has its own name so it is not
# confused with (or silently replaced by) the chat model that the QA cell
# binds to `llm`.
compressor_llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(compressor_llm)

# Retriever that fetches documents with the default vector-store retriever
# and then passes them through the LLM-based compressor.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectorstore.as_retriever(),
)

In [15]:
def get_retriever(search_type):
    """Return the retriever that corresponds to ``search_type``.

    ``'compression'`` selects the notebook-level ``compression_retriever``.
    Any other value is forwarded unchanged as the ``search_type`` argument of
    ``vectorstore.as_retriever``.
    """
    if search_type != 'compression':
        return vectorstore.as_retriever(search_type=search_type)
    return compression_retriever

In [16]:
# Question put to every retrieval strategy.
# NOTE(review): the text contains typos ("improvments", "arquitecture"); it is
# kept verbatim so the query sent to the chain does not change.
question = "What are the major improvments over the previous LSTM arquitecture?"

# Prompt and chat model shared by all QA chains built below.
QA_CHAIN_PROMPT = PromptTemplate.from_template(TEMPLATE)
llm = ChatOpenAI(model_name=MODEL, temperature=0)

# One single-key dict per strategy: {search_type: chain output}.
search_results = []

for search_type in ('similarity', 'mmr', 'compression'):
    retriever = get_retriever(search_type)
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        return_source_documents=True,  # keep the retrieved chunks for inspection
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    )
    result = qa_chain.invoke({"query": question})
    search_results.append({search_type: result})

In [17]:
# Print each strategy name in upper case, followed by the answer it produced.
for entry in search_results:
    for strategy, output in entry.items():
        print(strategy.upper())
        print(output['result'])

SIMILARITY
- Exponential gating and novel memory structures
- Introduction of sLSTM with scalar memory, scalar update, and memory mixing
- Introduction of mLSTM with matrix memory and covariance update rule
- Fully parallelizable architecture
- Linear computation and constant memory complexity with respect to sequence length
- Compressive memory well suited for industrial applications and edge implementations
MMR
- Introduction of exponential gating with appropriate normalization and stabilization techniques
- Modification of LSTM memory structure to include sLSTM with scalar memory, scalar update, and new memory mixing, and mLSTM with matrix memory and covariance update rule
- Integration of these LSTM extensions into residual block backbones to create xLSTM blocks and architectures
COMPRESSION
The major improvements over the previous LSTM architecture are the introduction of exponential gating and novel memory structures, including the sLSTM with scalar memory and update, and the mLS

# Observaciones

Considero que la respuesta dada por Similarity search es la más correcta y completa de todas las generadas. Además, deja de figurar el resultado erróneo visto con chunks de 500.  
La arquitectura no es "fully parallelizable"; únicamente el bloque mLSTM lo es. Sin embargo, el chunk contiene la información correcta; pienso que la pregunta, al declarar el nombre de la nueva arquitectura, no distingue entre xLSTM, mLSTM y sLSTM.

El método de Compression, por alguna razón, ignora la indicación de responder en bullet points. Además, si bien la respuesta no es incorrecta, parecen faltar detalles importantes, como la arquitectura paralelizable y el orden de ejecución lineal con respecto a la entrada. Por lo tanto, comprimir el contexto no parece ser la mejor opción si se busca obtener mayor detalle.

In [18]:
# For each strategy, print its name in upper case and then the source
# documents returned with the answer, followed by a blank separator line.
for entry in search_results:
    for strategy, output in entry.items():
        print(strategy.upper())
        pretty_print_docs(output['source_documents'])
        print()

SIMILARITY
Document 1 page 0 from /Users/ropereira/Desktop/Projects/doc_questions_rag/data/raw//xLSTM_paper.pdf:

core marked the dawn of a new era, outpacing LSTMs at scale. We now raise a
simple question: How far do we get in language modeling when scaling LSTMs to
billions of parameters, leveraging the latest techniques from modern LLMs, but
mitigating known limitations of LSTMs? Firstly, we introduce exponential gating
with appropriate normalization and stabilization techniques. Secondly, we modify
the LSTM memory structure, obtaining: (i) sLSTM with a scalar memory, a scalar
update, and new memory mixing, (ii) mLSTM that is fully parallelizable with a
matrix memory and a covariance update rule. Integrating these LSTM extensions
into residual block backbones yields xLSTM blocks that are then residually stacked
into xLSTM architectures. Exponential gating and modified memory structures
--------------------------------------------------------------------------------------------------