In [1]:
!pip install langchain faiss-cpu sentence-transformers openai --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [9]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
import torch
# Sample docs (replace with your corpus)
documents = [
    "Multi-agent systems coordinate AI models to solve tasks.",
    "FAISS is an open-source vector search library by Facebook.",
    "LangChain helps build language model pipelines easily.",
    "GPT4All and llama.cpp are open source local LLMs.",
]

# Load embedding model from sentence-transformers
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Create FAISS index
faiss_index = FAISS.from_texts(documents, embeddings)


In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load tokenizer and model
model_name = "gpt2"  # small model for demo; replace with a better open model if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create text generation pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)


Device set to use cpu


In [11]:
class RetrieverAgent:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore

    def retrieve(self, query, k=3):
        return self.vectorstore.similarity_search(query, k=k)

class GeneratorAgent:
    def __init__(self, generator_pipeline):
        self.generator = generator_pipeline

    def generate_answer(self, context, question, max_length=150):
        prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
        generated = self.generator(prompt, max_length=max_length, do_sample=True, temperature=0.7)
        return generated[0]['generated_text'][len(prompt):].strip()

class SummarizerAgent:
    def __init__(self, generator_pipeline):
        self.generator = generator_pipeline

    def summarize(self, text, max_length=100):
        prompt = f"Summarize the following text briefly:\n{text}"
        generated = self.generator(prompt, max_length=max_length, do_sample=True, temperature=0.7)
        return generated[0]['generated_text'][len(prompt):].strip()



In [12]:
retriever_agent = RetrieverAgent(faiss_index)
generator_agent = GeneratorAgent(text_generator)
summarizer_agent = SummarizerAgent(text_generator)

def multi_agent_rag_system(query):
    retrieved_docs = retriever_agent.retrieve(query)
    combined_docs = " ".join([doc.page_content for doc in retrieved_docs])
    summary = summarizer_agent.summarize(combined_docs)
    answer = generator_agent.generate_answer(summary, query)
    return answer

# Test query
query = "What is a multi-agent RAG system?"
response = multi_agent_rag_system(query)
print("Answer:\n", response)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer:
 Multi-agent RAG is a language-based approach to machine learning that is applied to a wide range of tasks. Multi-agents, such as machine learning, are a means to create models for complex problems.

Question: What is a machine learning framework?

Answer: Machine learning frameworks are tools that can be used to construct models and algorithms based on existing knowledge.
