In [None]:
# Step 1: Import Required Libraries
from pathlib import Path

import pandas as pd
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 2: Load the Dataset (First 10,000 Rows)
# Point DATASET_PATH at your own CSV file if it is not 'metadata.csv'
DATASET_PATH = 'metadata.csv'
ROW_LIMIT = 10000  # Only this many rows are read from the top of the file
df = pd.read_csv(DATASET_PATH, nrows=ROW_LIMIT)


In [3]:
# Step 3: Preprocess the Data
# Keep only the rows whose 'abstract' value is present (not NaN/None)
has_abstract = df['abstract'].notna()
df = df.loc[has_abstract].copy()

In [4]:
# Step 4: Prepare Documents for RAG
def _row_to_document(row):
    """Build one document dict ("content" + "meta") from a DataFrame row."""
    meta = {
        "title": row['title'],
        "authors": row['authors'],
        "publish_time": row['publish_time'],
        "source": row['source_x'],
    }
    # Only the first 500 characters of the abstract are kept as content
    return {"content": row['abstract'][:500], "meta": meta}


documents = [_row_to_document(record) for _, record in df.iterrows()]

In [5]:
# Step 5: Initialize the RAG Pipeline with FAISSDocumentStore
# Document text and metadata are stored in a SQLite file, while the vectors go
# into a FAISS index. The FAISS index is not saved to disk in the cells shown
# here, so it starts out empty on every fresh run.
SQLITE_DB_PATH = Path("faiss_document_store.db")

# A database left over from an earlier run still contains the old documents,
# but the newly created FAISS index holds no embeddings. Haystack 1.x compares
# the two counts when the store is constructed and raises a ValueError on a
# mismatch, so remove the stale file to keep Restart & Run All working.
# NOTE(review): this discards any previously written documents — intended here,
# because they are rewritten from `documents` just below.
if SQLITE_DB_PATH.exists():
    SQLITE_DB_PATH.unlink()

# Set embedding_dim=384 to match the model's output
document_store = FAISSDocumentStore(
    sql_url=f"sqlite:///{SQLITE_DB_PATH.as_posix()}",
    faiss_index_factory_str="Flat",
    embedding_dim=384  # Set embedding dimension to 384
)

# Write documents to the document store
document_store.write_documents(documents)


Writing Documents: 10000it [00:48, 207.09it/s]             


In [6]:
# Step 6: Initialize the Retriever (Embedding-based)
# A lightweight sentence-transformers model is used for semantic search
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

retriever = EmbeddingRetriever(
    embedding_model=EMBEDDING_MODEL,
    document_store=document_store,
)

In [7]:
# Compute embeddings for the stored documents with the retriever and
# update the document store with them
document_store.update_embeddings(retriever=retriever)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.95it/s]
Batches: 100%|██████████| 245/245 [14:55<00:00,  3.66s/it]cs/s]
Documents Processed: 10000 docs [15:15, 10.92 docs/s]           


In [8]:
# Step 7: Initialize the Reader (for extractive QA)
READER_MODEL = "deepset/roberta-base-squad2"
reader = FARMReader(model_name_or_path=READER_MODEL)

In [9]:
# Step 8: Initialize the Extractive QA Pipeline
# Combines the retriever and the reader into a single pipeline object
pipeline = ExtractiveQAPipeline(
    retriever=retriever,
    reader=reader,
)


In [10]:
# Step 9: Ask a Question
question = "What is the impact of climate change on biodiversity?"

# Per-node settings: top_k of 5 for the Retriever and top_k of 3 for the Reader
query_params = {
    "Retriever": {"top_k": 5},
    "Reader": {"top_k": 3},
}
results = pipeline.run(query=question, params=query_params)


Batches: 100%|██████████| 1/1 [00:00<00:00, 27.77it/s]
Inferencing Samples: 100%|██████████| 1/1 [00:05<00:00,  5.65s/ Batches]


In [11]:
# Step 10: Display Results
# Print the question, then each returned answer numbered from 1 with its
# text, score, surrounding context and metadata
print(f"Question: {question}")
for rank, candidate in enumerate(results["answers"], start=1):
    print(f"\nAnswer {rank}:")
    print(f"  - Answer: {candidate.answer}")
    print(f"  - Confidence: {candidate.score:.4f}")
    print(f"  - Context: {candidate.context}")
    print(f"  - Metadata: {candidate.meta}")

Question: What is the impact of climate change on biodiversity?

Answer 1:
  - Answer: degrade
  - Confidence: 0.6850
  - Context: s clear evidence that the Earth’s ecosystems and landscapes continue to degrade as a consequence of the cumulative impact of human activities. Taking 
  - Metadata: {'title': 'Sustainability science: an ecohealth perspective', 'authors': 'Rapport, David J.', 'publish_time': '2006-12-14', 'source': 'PMC', 'vector_id': '34'}

Answer 2:
  - Answer: increase agricultural yields
  - Confidence: 0.3483
  - Context: oductivity in natural ecosystems. Biodiversity effects might increase agricultural yields at no cost in additional inputs. However, the effects of div
  - Metadata: {'title': 'Plant domestication disrupts biodiversity effects across major crop types', 'authors': 'Chacón‐Labella, Julia; García Palacios, Pablo; Matesanz, Silvia; Schöb, Christian; Milla, Rubén', 'publish_time': '2019-07-03', 'source': 'PMC', 'vector_id': '5767'}

Answer 3:
  - Answer: hum