In [8]:
import pandas as pd
from io import StringIO
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, load_summarize_chain
from langchain.docstore.document import Document
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

def process_file_content(file_content):
    """Parse CSV text that is already in memory into a DataFrame.

    Args:
        file_content: The CSV data as a single string.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    # read_csv expects a path or file-like object, so wrap the string.
    csv_buffer = StringIO(file_content)
    return pd.read_csv(csv_buffer)

def initialize_text_splitter():
    """Build the character-based splitter used to chunk the dataset text.

    Returns:
        A ``RecursiveCharacterTextSplitter`` configured for 8000-character
        chunks with an 800-character overlap, measured with ``len``.
    """
    splitter_options = {
        "chunk_size": 8000,
        "chunk_overlap": 800,
        "length_function": len,
        # Separators are tried in this order, coarsest first.
        "separators": ["\n\n", "\n", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options)

def initialize_embeddings(
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
    device=None,
):
    """Create the HuggingFace embedding model used to index the dataset.

    Args:
        embedding_model_name: Model identifier passed to
            ``HuggingFaceEmbeddings`` as ``model_name``.
        device: Device string passed through ``model_kwargs`` (for example
            ``"cuda"`` or ``"cpu"``). When ``None``, CUDA is used if torch
            reports it as available, otherwise the CPU.

    Returns:
        A configured ``HuggingFaceEmbeddings`` instance.
    """
    if device is None:
        try:
            # Imported locally so the module does not need torch at import time.
            import torch
        except ImportError:
            # Cannot probe the hardware; keep the previous hard-coded choice.
            device = "cuda"
        else:
            # Fall back to the CPU instead of failing on GPU-less machines.
            device = "cuda" if torch.cuda.is_available() else "cpu"

    return HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        model_kwargs={"device": device},
    )

def create_vector_store(texts, embeddings):
    """Embed the text chunks into a FAISS index and persist it to disk.

    Args:
        texts: The strings to index.
        embeddings: The embedding model handed to ``FAISS.from_texts``.

    Returns:
        The in-memory FAISS vector store. As a side effect the index is
        also saved locally under ``faiss_index_``.
    """
    faiss_index = FAISS.from_texts(texts, embeddings)
    faiss_index.save_local("faiss_index_")
    return faiss_index

def initialize_llm(model_name, context_length):
    """Create an Ollama LLM handle for the given model.

    Args:
        model_name: Passed to ``Ollama`` as ``model``.
        context_length: Passed to ``Ollama`` as ``num_ctx``.

    Returns:
        The configured ``Ollama`` instance.
    """
    ollama_options = {"model": model_name, "num_ctx": context_length}
    return Ollama(**ollama_options)

def create_enhanced_rag_chain(vector_store, llm):
    """Assemble a RetrievalQA chain over the vector store for one LLM.

    The same ``llm`` is used twice: once inside an ``LLMChainExtractor``
    that post-processes retrieved documents, and once as the answering
    model of the "stuff" QA chain.

    Args:
        vector_store: Store whose ``as_retriever`` supplies candidate
            documents (five per query, via ``search_kwargs={"k": 5}``).
        llm: The language model used for both extraction and answering.

    Returns:
        The ``RetrievalQA`` chain built with the custom prompt.
    """
    prompt_template = """
    You are an AI assistant tasked with answering questions about a dataset. 
    Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    Context: {context}

    Human: {question}
    AI Assistant: Let's approach this step-by-step:
    """
    qa_prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
    )

    # Wrap the plain top-5 retriever in a ContextualCompressionRetriever
    # for improved retrieval.
    top_k_retriever = vector_store.as_retriever(search_kwargs={"k": 5})
    extractor = LLMChainExtractor.from_llm(llm)
    compressed_retriever = ContextualCompressionRetriever(
        base_compressor=extractor,
        base_retriever=top_k_retriever,
    )

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=compressed_retriever,
        chain_type_kwargs={"prompt": qa_prompt},
    )

def summarize_text(llm, text):
    """Summarize ``text`` with a map-reduce summarization chain.

    The text is first split into chunks so the map step has several
    documents to work on. Wrapping the whole text in a single ``Document``
    would send the entire input to the model in one prompt, which for a
    full dataset dump can exceed the model's context window.

    Args:
        llm: The language model that drives the summarization chain.
        text: The raw text to summarize.

    Returns:
        The summary string, or ``""`` when ``text`` yields no chunks.
    """
    # Reuse the file's splitter settings so chunk sizes stay consistent.
    chunks = initialize_text_splitter().split_text(text)
    docs = [Document(page_content=chunk) for chunk in chunks]
    if not docs:
        # Nothing to summarize; avoid calling the model on empty input.
        return ""

    summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
    # ``Chain.run`` is deprecated; ``invoke`` returns a dict whose
    # "output_text" entry holds the final summary.
    result = summarize_chain.invoke({"input_documents": docs})
    return result["output_text"]

def process_dataframe(df):
    """Render a DataFrame as text, prefixed with its top TF-IDF terms.

    Each row is turned into one document by joining its cell values with
    spaces, and a ``TfidfVectorizer`` limited to 100 features is fitted on
    those documents. The resulting vocabulary is listed ahead of the full
    ``df.to_string()`` dump.

    Args:
        df: The DataFrame to describe.

    Returns:
        A string of the form
        ``"Key features: <terms>\\n\\nFull data:\\n<table>"``.
    """
    # Extract key features using TF-IDF
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Join cells with a space: plain concatenation would fuse the last word
    # of one column with the first word of the next into a bogus token.
    row_documents = [
        " ".join(row)
        for row in df.astype(str).itertuples(index=False, name=None)
    ]

    tfidf = TfidfVectorizer(max_features=100)
    try:
        # Only the fitted vocabulary is needed, not the TF-IDF matrix.
        tfidf.fit(row_documents)
    except ValueError:
        # Raised when no usable tokens exist (e.g. an empty frame);
        # fall back to an empty feature list rather than aborting.
        feature_names = []
    else:
        feature_names = tfidf.get_feature_names_out()

    # Create a summary of key features
    feature_summary = " ".join(feature_names)

    # Combine with regular string representation
    return f"Key features: {feature_summary}\n\nFull data:\n{df.to_string()}"

def setup_enhanced_rag_system(df, model1="llama3.1", model2="phi3.5", model3='llama31_storm', context_length=8192):
    """Build three RAG chains over one shared vector store of the dataset.

    The DataFrame is converted to text, chunked, summarized with the first
    model, and indexed together with that summary. One QA chain is then
    created per model, all backed by the same vector store.

    Args:
        df: The dataset to index.
        model1: Ollama model name for the first chain; also used to
            produce the dataset summary.
        model2: Ollama model name for the second chain.
        model3: Ollama model name for the third chain.
        context_length: Context size passed to every model.

    Returns:
        A 7-tuple ``(qa_chain1, qa_chain2, qa_chain3, dataset_summary,
        model1, model2, model3)``.
    """
    raw_text = process_dataframe(df)
    texts = initialize_text_splitter().split_text(raw_text)

    # The first model serves both summarization and its own QA chain, so it
    # is created once and reused instead of being instantiated twice.
    llm1 = initialize_llm(model1, context_length)
    dataset_summary = summarize_text(llm1, raw_text)

    # Index the summary alongside the raw chunks so it can be retrieved too.
    texts.insert(0, dataset_summary)

    embeddings = initialize_embeddings()
    vector_store = create_vector_store(texts, embeddings)

    llm2 = initialize_llm(model2, context_length)
    llm3 = initialize_llm(model3, context_length)

    qa_chain1, qa_chain2, qa_chain3 = (
        create_enhanced_rag_chain(vector_store, llm)
        for llm in (llm1, llm2, llm3)
    )

    return qa_chain1, qa_chain2, qa_chain3, dataset_summary, model1, model2, model3

def compare_rag_results(qa_chain1, qa_chain2, qa_chain3, query, dataset_summary, model1, model2, model3):
    """Run one query through three QA chains and print the answers.

    The dataset summary is prepended to the query before it is sent to each
    chain. The original query is printed first, followed by each model's
    answer in order.

    Args:
        qa_chain1: QA chain for the first model.
        qa_chain2: QA chain for the second model.
        qa_chain3: QA chain for the third model.
        query: The user's question.
        dataset_summary: Summary text prepended to the question.
        model1: Label printed with the first chain's answer.
        model2: Label printed with the second chain's answer.
        model3: Label printed with the third chain's answer.

    Returns:
        None. Results are written to standard output.
    """
    enhanced_query = f"Dataset summary: {dataset_summary}\n\nQuery: {query}"

    labelled_chains = (
        (model1, qa_chain1),
        (model2, qa_chain2),
        (model3, qa_chain3),
    )

    # Collect every answer before printing so the output stays grouped.
    # ``invoke`` replaces the deprecated direct call on the chain.
    answers = [
        (label, chain.invoke({"query": enhanced_query}))
        for label, chain in labelled_chains
    ]

    print(f"Query: {query}\n")
    for label, answer in answers:
        print(f"Result from {label}:\n{answer['result']}\n")

if __name__ == "__main__":
    # Script entry point: load the CSV, build the three RAG chains, and
    # print each model's answer to a single comparison question.
    file = pd.read_csv("db.csv", encoding='unicode_escape')

    rag_components = setup_enhanced_rag_system(file)
    (
        qa_chain1,
        qa_chain2,
        qa_chain3,
        dataset_summary,
        model1,
        model2,
        model3,
    ) = rag_components

    query = """What is highest grossing worldwide movie for DC and Marvel? 
    - Make sure you go through all rows
    - Don't forget to compare all movies within DC and Marvel
    - Before you answer make sure you cross check the answer
    """
    compare_rag_results(
        qa_chain1,
        qa_chain2,
        qa_chain3,
        query,
        dataset_summary,
        model1,
        model2,
        model3,
    )

  return summarize_chain.run(docs)
  result1 = qa_chain1({"query": enhanced_query})


Query: What is highest grossing worldwide movie for DC and Marvel? 
    - Make sure you go through all rows
    - Don't forget to compare all movies within DC and Marvel
    - Before you answer make sure you cross check the answer
    

Result from llama3.1:
To find the highest grossing worldwide movie for both DC and Marvel, I'll go through each row of the data and identify the movies from each company.

**DC Movies:**

After reviewing the data, I found 7 movies from DC:

1. The Dark Knight (Row 26)
2. The Dark Knight Rises (Row 27)
3. Man of Steel (Row 28)
4. Batman v Superman: Dawn of Justice (Row 29)
5. Wonder Woman (Row 30)
6. Justice League (Row 31)
7. Joker (Row 38)

**Marvel Movies:**

After reviewing the data, I found 32 movies from Marvel:

1. Iron Man (Row 0)
2. The Incredible Hulk (Row 1)
3. Iron Man 2 (Row 2)
4. ... all the way to ...
37. Spider-Man: Far From Home (Row 36)

**Highest Grossing Worldwide Movie for DC:**

After comparing the worldwide gross values, I found th