<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
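# Install the dependencies on a fresh runtime (uncomment to run):
# !pip install faiss-cpu transformers torch scikit-learn sentence-transformers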

In [6]:
import faiss
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Step 1: Create some example documents (the retrieval knowledge base)
documents = [
    "The capital of France is Paris.",
    "The Great Wall of China is over 13,000 miles long.",
    "Python is a popular programming language.",
    "The Northern and Southern Dynasties lasted from 420 to 589 AD.",
    "Beijing is the capital of China."
]

# Step 2: Vectorize the documents using TF-IDF
# FAISS stores float32 matrices while TfidfVectorizer produces float64 by
# default, so convert explicitly rather than relying on an implicit cast
# inside FAISS (which not every FAISS version performs).
vectorizer = TfidfVectorizer()
doc_vectors = np.ascontiguousarray(
    vectorizer.fit_transform(documents).toarray(), dtype=np.float32
)

# Step 3: Create a FAISS index for fast similarity search
# One row of doc_vectors per document, so row i of the index is documents[i].
dimension = doc_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_vectors)

# Step 4: Define a function to retrieve the most relevant document
def retrieve(query, k=1):
    """Return up to ``k`` documents closest to ``query`` in TF-IDF space.

    Args:
        query: Free-text query string; it is vectorized with the already
            fitted module-level ``vectorizer``.
        k: Maximum number of documents to return. Values larger than the
            number of indexed vectors are clamped to that number.

    Returns:
        A list of strings taken from ``documents``, in the order FAISS
        reports them (nearest first). Empty when ``k`` is below 1 or the
        index holds no vectors.
    """
    # Never ask for more neighbours than the index holds: FAISS pads the
    # missing slots with -1, and documents[-1] would be returned as a hit.
    k = min(k, index.ntotal)
    if k < 1:
        return []

    # FAISS expects a contiguous float32 matrix; TF-IDF output is float64.
    query_vector = np.ascontiguousarray(
        vectorizer.transform([query]).toarray(), dtype=np.float32
    )
    _, indices = index.search(query_vector, k)

    # Defensive filter in case any padding ids (-1) are still present.
    return [documents[i] for i in indices[0] if i >= 0]

# Step 5: Load a pre-trained text generation model (e.g., GPT-2)
# The language model and its tokenizer are loaded from the same "gpt2"
# checkpoint name; both are module-level globals used by rag_generate().
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Step 6: Define the RAG function
def rag_generate(query):
    """Answer ``query`` with GPT-2, conditioned on the best-matching document.

    The single nearest document from ``retrieve`` is placed in a
    ``Context/Question/Answer`` prompt and the module-level ``model``
    continues that prompt.

    Args:
        query: The user's question as plain text.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed. For a decoder-only model such as GPT-2 this is the
        prompt followed by the generated continuation.
    """
    retrieved_docs = retrieve(query, k=1)
    context = " ".join(retrieved_docs)  # Combine the retrieved document(s)

    # Prepare the input for generation
    input_text = f"Context: {context}\nQuestion: {query}\nAnswer:"
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which generate() needs to tell real tokens from padding.
    inputs = tokenizer(input_text, return_tensors="pt")

    # Generate the response
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # max_length would count the prompt tokens too, leaving little or no
        # room for the answer; max_new_tokens budgets only the continuation.
        max_new_tokens=50,
        num_return_sequences=1,
        # GPT-2 defines no pad token; reuse EOS so generate() does not have
        # to guess one (and warn about it).
        pad_token_id=tokenizer.eos_token_id,
    )
    answer = tokenizer.decode(output[0], skip_special_tokens=True)

    return answer

ModuleNotFoundError: No module named 'faiss'

In [None]:
# Step 7: Test the RAG implementation
# Alternative query whose answer appears in the example documents:
# query = "What is the capital of China?"
# None of the example documents mentions Spain, so this query exercises the
# pipeline on a question the knowledge base cannot answer.
query = "What is the capital of Spain?"

# rag_generate() retrieves one document, builds the prompt and runs GPT-2.
response = rag_generate(query)
print(response)


In [None]:
# Variant: retrieve with dense sentence embeddings (SBERT) instead of TF-IDF + FAISS

In [None]:
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# NOTE: this cell rebinds the globals `model` and `documents` that the
# GPT-2 / FAISS cell above also defines; rag_generate() and retrieve() read
# those globals at call time, so re-run that cell before using them again.

# Knowledge base: the example documents that rag() searches
documents = [
    "The Eiffel Tower is located in Paris.",
    "The capital of Japan is Tokyo.",
    "Python is a programming language loved by data scientists.",
    "The Great Wall of China is a famous historical site.",
    "Mount Everest is the tallest mountain in the world."
]

# SBERT sentence-embedding model used for both documents and queries
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every document once up front so each query only encodes itself
doc_embeddings = model.encode(documents, convert_to_tensor=True)

# Step 3: RAG Function: Retrieve and Generate
def rag(query, top_k=1):
    """Build a retrieval-augmented prompt for ``query``.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against the precomputed ``doc_embeddings``. The
    ``top_k`` highest-scoring documents are joined into the prompt context.

    Args:
        query: The question to look up, as plain text.
        top_k: How many of the best-scoring documents to include.

    Returns:
        A prompt string of the form ``Context/Question/Answer``. No text is
        generated here; the answer slot holds a fixed placeholder.
    """
    # Embed the query, then move both sides to NumPy for scikit-learn.
    query_matrix = (
        model.encode(query, convert_to_tensor=True).cpu().numpy().reshape(1, -1)
    )
    doc_matrix = doc_embeddings.cpu().numpy()
    scores = cosine_similarity(query_matrix, doc_matrix)[0]

    # Document positions ordered from most to least similar.
    ranked = np.argsort(scores)[::-1]
    context = " ".join(documents[pos] for pos in ranked[:top_k])

    return f"Context: {context}\nQuestion: {query}\nAnswer: [Your generation logic here]"

# Step 4: Test the RAG implementation
# One of the example documents mentions the Eiffel Tower, so that document
# is the expected retrieval (assuming the embeddings rank it highest).
query = "Where is the Eiffel Tower?"
# rag() only performs retrieval; the returned prompt ends with a placeholder
# where generated text would go.
response = rag(query)
print(response)