In [1]:
!pip install datasets sentence-transformers faiss-cpu transformers accelerate streamlit
!pip install -q sentence-transformers





In [2]:
from datasets import load_dataset

# Load the Abirate/english_quotes dataset
dataset = load_dataset("Abirate/english_quotes")

# Inspect dataset info and first example
print(dataset)
print(dataset['train'][0])


DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2508
    })
})
{'quote': '“Be yourself; everyone else is already taken.”', 'author': 'Oscar Wilde', 'tags': ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']}


In [3]:
import pandas as pd

# Convert to pandas dataframe for easy manipulation
df = pd.DataFrame(dataset['train'])

# Drop rows with missing 'quote' or 'author'
df = df.dropna(subset=['quote', 'author']).reset_index(drop=True)

# Lowercase quotes
df['quote'] = df['quote'].str.lower()

# Normalize tags (make lowercase, handle missing)
df['tags'] = df['tags'].apply(lambda x: [tag.lower() for tag in x] if x else [])

# Preview cleaned data
df.head()


Unnamed: 0,quote,author,tags
0,“be yourself; everyone else is already taken.”,Oscar Wilde,"[be-yourself, gilbert-perreira, honesty, inspi..."
1,"“i'm selfish, impatient and a little insecure....",Marilyn Monroe,"[best, life, love, mistakes, out-of-control, t..."
2,“two things are infinite: the universe and hum...,Albert Einstein,"[human-nature, humor, infinity, philosophy, sc..."
3,"“so many books, so little time.”",Frank Zappa,"[books, humor]"
4,“a room without books is like a body without a...,Marcus Tullius Cicero,"[books, simile, soul]"


In [4]:
# Combine quote, author, and tags into one text string per sample
texts = [
    f"{row['quote']} - {row['author']} [tags: {', '.join(row['tags'])}]"
    for _, row in df.iterrows()
]

print("Example formatted text:\n", texts[0])



Example formatted text:
 “be yourself; everyone else is already taken.” - Oscar Wilde [tags: be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator]


In [6]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Load base model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare training data with pairs (anchor + positive = same text)
train_examples = [InputExample(texts=[text, text]) for text in texts]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)

# Fine-tune (1 epoch recommended for demo; increase if needed)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
)

# Save model after fine-tuning
model.save("fine_tuned_quote_embedding_model")


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


In [7]:
import numpy as np
import faiss

# Load fine-tuned model
model = SentenceTransformer("fine_tuned_quote_embedding_model")

# Generate embeddings for all quote texts
corpus_embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# Normalize embeddings for cosine similarity
faiss.normalize_L2(corpus_embeddings)

# Initialize FAISS index
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Using Inner Product for cosine similarity
index.add(corpus_embeddings)

print(f"FAISS index has {index.ntotal} vectors")


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

FAISS index has 2508 vectors


In [6]:
# Install required packages if not already installed
# !pip install datasets sentence-transformers faiss-cpu transformers accelerate streamlit

from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import faiss
import pickle
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# ----------- 1. Load and preprocess dataset -----------

dataset = load_dataset("Abirate/english_quotes")

df = pd.DataFrame(dataset['train'])

# Drop rows with missing quote or author
df = df.dropna(subset=['quote', 'author']).reset_index(drop=True)

# Lowercase quotes and tags
df['quote'] = df['quote'].str.lower()
df['tags'] = df['tags'].apply(lambda x: [tag.lower() for tag in x] if x else [])

# Format texts as: "quote - author [tags: tag1, tag2]"
texts = [
    f"{row['quote']} - {row['author']} [tags: {', '.join(row['tags'])}]"
    for _, row in df.iterrows()
]

print("Sample formatted text:\n", texts[0])

# ----------- 2. Fine-tune SentenceTransformer model -----------

# Load base model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare training examples (using pairs of same text for demo)
train_examples = [InputExample(texts=[text, text]) for text in texts]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)

# Fine-tune model (1 epoch, increase as needed)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
)

# Save the fine-tuned model locally
model.save("fine_tuned_quote_embedding_model")

# ----------- 3. Build FAISS index -----------

# Reload fine-tuned model from local folder (optional but recommended)
model = SentenceTransformer("./fine_tuned_quote_embedding_model")

# Generate embeddings for all texts
corpus_embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# Normalize embeddings and create FAISS index
faiss.normalize_L2(corpus_embeddings)
dim = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(corpus_embeddings)

# Save embeddings and texts for reuse
with open("texts.pkl", "wb") as f:
    pickle.dump(texts, f)
np.save("corpus_embeddings.npy", corpus_embeddings)

# ----------- 4. Load generation model and tokenizer -----------

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
llm = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# ----------- 5. Define retrieval + generation function -----------

def retrieve_and_generate(query, model, index, texts, tokenizer, llm, top_k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, top_k)
    retrieved_texts = [texts[i] for i in indices[0]]
    context = "\n".join(retrieved_texts)
    prompt = f"Use the following quotes to answer the query:\n{context}\n\nQuery: {query}"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = llm.generate(**inputs, max_length=100)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer, retrieved_texts, distances[0]

# ----------- 6. Test retrieval and generation -----------

query = "quotes about hope by Oscar Wilde"

answer, retrieved_quotes, scores = retrieve_and_generate(
    query, model, index, texts, tokenizer, llm
)

print("Generated answer:\n", answer)
print("\nTop retrieved quotes:")
for i, (quote, score) in enumerate(zip(retrieved_quotes, scores)):
    print(f"{i+1}. {quote} (Similarity: {score:.3f})")

# ----------- 7. Manual RAG evaluation -----------

test_queries = [
    {"query": "quotes about hope by Oscar Wilde", "expected_author": "oscar wilde"},
    {"query": "quotes about love and friendship", "expected_tags": ["love", "friendship"]},
    {"query": "motivational quotes on success", "expected_tags": ["success", "motivation"]},
    {"query": "funny quotes by Mark Twain", "expected_author": "mark twain"},
    {"query": "inspirational quotes on courage by women authors", "expected_tags": ["courage"], "expected_author": "woman"},
]

def evaluate_rag(test_queries, model, index, texts, tokenizer, llm, top_k=5):
    results = []
    for item in test_queries:
        query = item["query"]
        answer, retrieved_quotes, scores = retrieve_and_generate(query, model, index, texts, tokenizer, llm, top_k=top_k)

        relevant_count = 0
        expected_author = item.get("expected_author", "").lower()
        expected_tags = [tag.lower() for tag in item.get("expected_tags", [])]

        for quote_text in retrieved_quotes:
            quote_text_lower = quote_text.lower()
            if expected_author and expected_author in quote_text_lower:
                relevant_count += 1
                continue
            if expected_tags:
                start = quote_text_lower.find("[tags:")
                if start != -1:
                    tags_str = quote_text_lower[start:]
                    for tag in expected_tags:
                        if tag in tags_str:
                            relevant_count += 1
                            break

        precision_at_k = relevant_count / top_k
        results.append({"query": query, "precision_at_k": precision_at_k, "answer": answer})

    return results

# Run evaluation
evaluation_results = evaluate_rag(test_queries, model, index, texts, tokenizer, llm, top_k=5)

for res in evaluation_results:
    print(f"Query: {res['query']}")
    print(f"Precision@5: {res['precision_at_k']:.2f}")
    print(f"Generated Answer: {res['answer']}\n")


Sample formatted text:
 “be yourself; everyone else is already taken.” - Oscar Wilde [tags: be-yourself, gilbert-perreira, honesty, inspirational, misattributed-oscar-wilde, quote-investigator]


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Generated answer:
 Oscar Wilde [tags: love] “always forgive your enemies; nothing annoys them so much.”

Top retrieved quotes:
1. “some cause happiness wherever they go; others whenever they go.” - Oscar Wilde (attributed to) [tags: classic-insult] (Similarity: 0.551)
2. “selfishness is not living as one wishes to live, it is asking others to live as one wishes to live.” - Oscar Wilde [tags: selfishness] (Similarity: 0.531)
3. “who, being loved, is poor?” - Oscar Wilde [tags: love] (Similarity: 0.527)
4. “always forgive your enemies; nothing annoys them so much.” - Oscar Wilde [tags: enemies, forgiveness, strategy] (Similarity: 0.503)
5. “you will always be fond of me. i represent to you all the sins you never had the courage to commit.” - Oscar Wilde, [tags: sin] (Similarity: 0.495)
Query: quotes about hope by Oscar Wilde
Precision@5: 1.00
Generated Answer: Oscar Wilde [tags: love] “always forgive your enemies; nothing annoys them so much.”

Query: quotes about love and friendship
Pre

In [7]:
import json

# Your list of test queries, same as used in evaluation
test_queries = [
    {"query": "quotes about hope by Oscar Wilde", "expected_author": "oscar wilde"},
    {"query": "quotes about love and friendship", "expected_tags": ["love", "friendship"]},
    {"query": "motivational quotes on success", "expected_tags": ["success", "motivation"]},
    {"query": "funny quotes by Mark Twain", "expected_author": "mark twain"},
    {"query": "inspirational quotes on courage by women authors", "expected_tags": ["courage"], "expected_author": "woman"},
]

results_for_export = []

for item in test_queries:
    query_text = item["query"]
    answer, retrieved_quotes, scores = retrieve_and_generate(
        query_text, model, index, texts, tokenizer, llm, top_k=5
    )

    record = {
        "query": query_text,
        "expected_author": item.get("expected_author", None),
        "expected_tags": item.get("expected_tags", []),
        "generated_answer": answer,
        "retrieved_quotes": retrieved_quotes,
        "similarity_scores": scores.tolist() if hasattr(scores, "tolist") else list(scores),
    }

    results_for_export.append(record)

with open("rag_evaluation_export.json", "w", encoding="utf-8") as f:
    json.dump(results_for_export, f, indent=4, ensure_ascii=False)

print("RAG evaluation data exported successfully to 'rag_evaluation_export.json'")


RAG evaluation data exported successfully to 'rag_evaluation_export.json'


In [25]:
from ragas import SingleTurnSample, EvaluationDataset, evaluate
from ragas.metrics import ContextPrecision, ContextRecall

# Dummy LLM to avoid API calls (same as before)
class DummyResponse:
    def __init__(self, text):
        self.generations = [{'text': text}]

class DummyLLM:
    async def generate(self, prompt, **kwargs):
        return DummyResponse("dummy response")
    def set_run_config(self, run_config): pass
    def reset(self): pass
    def get_name(self): return "dummy-llm"

dummy_llm = DummyLLM()

# Create samples with list for retrieved_contexts (Pydantic happy)
samples = []
for item in test_queries:
    query = item['query']
    answer, retrieved_quotes, _ = retrieve_and_generate(query, model, index, texts, tokenizer, llm)

    reference_text = "\n".join(retrieved_quotes)

    sample = SingleTurnSample(
        user_input=query,
        response=answer,
        reference=reference_text,
        retrieved_contexts=retrieved_quotes  # list here
    )
    samples.append(sample)

# Subclass EvaluationDataset to monkey-patch retrieved_contexts to dict at runtime
class CustomEvaluationDataset(EvaluationDataset):
    def __getitem__(self, idx):
        sample = super().__getitem__(idx)
        # Convert list to dict with string keys before metrics consume it
        if isinstance(sample.retrieved_contexts, list):
            sample.retrieved_contexts = {str(i): v for i, v in enumerate(sample.retrieved_contexts)}
        return sample

dataset = CustomEvaluationDataset(samples)

metrics = [ContextPrecision(), ContextRecall()]

results = evaluate(dataset, metrics=metrics, llm=dummy_llm)

for metric in metrics:
    scores = results[metric.name]
    avg_score = sum(scores) / len(scores) if scores else 0
    print(f"{metric.name}: {avg_score:.3f}")


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Exception raised in Job[0]: KeyError(0)
Exception raised in Job[1]: KeyError(0)
Exception raised in Job[2]: KeyError(0)
Exception raised in Job[3]: KeyError(0)
Exception raised in Job[4]: KeyError(0)
Exception raised in Job[5]: KeyError(0)
Exception raised in Job[6]: KeyError(0)
Exception raised in Job[7]: KeyError(0)
Exception raised in Job[8]: KeyError(0)
Exception raised in Job[9]: KeyError(0)


context_precision: nan
context_recall: nan


In [9]:
import ragas
print("RAGAS version:", ragas.__version__)
print("Available attributes in ragas module:")
print(dir(ragas))


RAGAS version: 0.2.15
Available attributes in ragas module:
['CacheInterface', 'DiskCacheBackend', 'EvaluationDataset', 'MultiTurnSample', 'RunConfig', 'SingleTurnSample', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_analytics', '_version', 'cache', 'cacher', 'callbacks', 'cost', 'dataset_schema', 'embeddings', 'evaluate', 'evaluation', 'exceptions', 'executor', 'integrations', 'llms', 'losses', 'messages', 'metrics', 'prompt', 'run_config', 'sdk', 'utils', 'validation']
