In [1]:
!pip install datasets
!pip install langchain
!pip install llama-index
!pip install openai
!pip install faiss-cpu
!pip install -U langchain-openai
!pip install -U langchain-community
!pip install unstructured

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [2]:
import os
from google.colab import userdata

In [3]:
import time
import pandas as pd
from datasets import load_dataset

# from langchain.vectorstores import FAISS
# from langchain_openai import OpenAIEmbeddings
# from langchain_openai import OpenAI
# from langchain.chains import ConversationalRetrievalChain
# from langchain.prompts import PromptTemplate
# from IPython.display import display, Markdown
# from langchain_openai import ChatOpenAI


from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

from llama_index.core.indices.vector_store.base import VectorStoreIndex

# from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from IPython.display import Markdown, display
# from llama_index.query_engine import RetrieverQueryEngine

In [4]:
# Copy the Colab secret named "API_KEY" into the OPENAI_API_KEY environment
# variable. No key is passed explicitly to the OpenAI clients created later in
# this notebook, so they presumably read it from here.
os.environ.update({"OPENAI_API_KEY": userdata.get("API_KEY")})

In [5]:
# Load the first 50 validation rows of SQuAD v2 and materialise them in two forms:
#   * squad_docs/doc_<i>.txt - the text corpus that is indexed further down
#   * filtered_entries       - one question/context/answer record per dataset row
dataset = load_dataset("squad_v2", split="validation[:50]")  # Adjust range as needed

os.makedirs("squad_docs", exist_ok=True)
filtered_entries = []

# Several questions share the same paragraph, so many rows carry an identical
# context. Each distinct paragraph is written only once (under the index of the
# first row that uses it); otherwise the vector stores built from this folder
# contain verbatim copies and top-k retrieval returns the same passage repeatedly.
# Note: files left over from an earlier run are not removed.
seen_contexts = set()

for i, row in enumerate(dataset):
    context = row['context']
    question = row['question']
    # Rows without any answer text get "" as their reference answer.
    answer = row['answers']['text'][0] if row['answers']['text'] else ""

    if context not in seen_contexts:
        seen_contexts.add(context)
        # Explicit encoding: the paragraphs contain non-ASCII characters and the
        # platform default encoding is not guaranteed to be UTF-8.
        with open(f"squad_docs/doc_{i}.txt", "w", encoding="utf-8") as f:
            f.write(context)

    filtered_entries.append({
        "question": question,
        "context": context,
        "answer": answer
    })

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [6]:
# Persist the question/context/answer records as a CSV file (no index column).
entries_frame = pd.DataFrame(filtered_entries)
entries_frame.to_csv("squad_filtered_entries.csv", index=False)

In [7]:
# Preview only the first few records; displaying the whole list prints every
# full paragraph and floods the notebook output.
filtered_entries[:3]

[{'question': 'In what country is Normandy located?',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
  'answer': 'France'},
 {'question': 'When were the Normans in Normandy?',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th

### LangChain Setup

In [8]:
# Reload the question/context/answer table from the CSV written earlier.
entries_csv_path = "squad_filtered_entries.csv"
df = pd.read_csv(entries_csv_path)

In [9]:
!ls

sample_data  squad_docs  squad_filtered_entries.csv


In [17]:
from langchain.document_loaders import DirectoryLoader
import os
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from IPython.display import display, Markdown
from langchain_openai import ChatOpenAI

# --- LangChain pipeline: load -> chunk -> embed -> FAISS index -> RetrievalQA ---
loader = DirectoryLoader("squad_docs", glob="**/*.txt")
documents = loader.load()

# CharacterTextSplitter only cuts at `separator`, which defaults to "\n\n".
# The SQuAD contexts appear to be single paragraphs without blank lines, so with
# the default separator every file stayed one oversized chunk and the
# chunk_size/chunk_overlap settings were never applied. Splitting on spaces lets
# the 500/50 configuration actually take effect.
splitter = CharacterTextSplitter(separator=" ", chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(documents)

# Embed every chunk and build the FAISS index from the resulting vectors.
embedding_model = OpenAIEmbeddings()
faiss_index = FAISS.from_documents(docs, embedding_model)

# Chat model that generates the final answers.
llm = ChatOpenAI(
    temperature=0.3,
    model_name="gpt-4.1",
    max_tokens=1024
)

# Retriever and chain type are left at their defaults.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=faiss_index.as_retriever())

In [18]:
# --- LlamaIndex pipeline over the same squad_docs corpus ---
# Imported under an alias because the bare name `OpenAI` is also imported from
# the LangChain packages in this notebook and may be bound to either class.
from llama_index.llms.openai import OpenAI as LlamaIndexOpenAI

reader = SimpleDirectoryReader("squad_docs")
documents_li = reader.load_data()
index_li = VectorStoreIndex.from_documents(documents_li)

# Use the same model and generation settings as the LangChain chain
# (gpt-4.1, temperature 0.3, 1024 max tokens). Without an explicit `llm` the
# query engine falls back to LlamaIndex's default model, so the two frameworks
# would be answering with different LLMs and the comparison would be confounded.
llm_li = LlamaIndexOpenAI(model="gpt-4.1", temperature=0.3, max_tokens=1024)
llama_query_engine = index_li.as_query_engine(llm=llm_li)

In [19]:
def exact_match(pred, gold):
    """Return 1 if `pred` and `gold` are the same answer after normalisation, else 0.

    Both values are converted with ``str``, lower-cased, stripped of punctuation
    and of the articles a/an/the, and whitespace-collapsed before comparison
    (SQuAD-style answer normalisation), so "The Normans." matches "normans".
    Any pair that matched under the previous case/whitespace-only comparison
    still matches.

    Args:
        pred: Predicted answer (any object; stringified before comparison).
        gold: Reference answer (any object; stringified before comparison).

    Returns:
        int: 1 on a match, 0 otherwise. A falsy `pred` or `gold` always yields 0.
    """
    import re  # local import: this cell does not import `re` itself

    def _normalize(value):
        text = str(value).lower()
        # Punctuation goes first so that initialisms such as "u.s.a." become
        # "usa" instead of having their trailing "a" removed as an article.
        text = re.sub(r'[^\w\s]|_', '', text)
        text = re.sub(r'\b(a|an|the)\b', ' ', text)
        return ' '.join(text.split())

    if not pred or not gold:
        return 0
    return int(_normalize(pred) == _normalize(gold))

In [21]:
import re
from collections import Counter

def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted and a reference answer.

    Both answers are normalised (lower-cased, punctuation removed, the articles
    a/an/the removed, whitespace collapsed) and split on whitespace. Precision
    and recall are computed from the multiset overlap of the two token lists.

    Args:
        prediction: Predicted answer (any object; stringified).
        ground_truth: Reference answer (any object; stringified).

    Returns:
        float: F1 in [0.0, 1.0]. Returns 0.0 when either input is falsy or the
        answers share no token. When normalisation leaves an answer with no
        tokens, returns 1.0 only if both are empty, else 0.0.
    """
    if not prediction or not ground_truth:
        return 0.0

    def normalize_answer(s):
        s = str(s).lower()
        # Remove punctuation BEFORE articles: in the opposite order the "a" of
        # "u.s.a." is seen as a standalone article and the answer turns into "us".
        # `\w` keeps non-ASCII letters and digits, so answers written in other
        # scripts are no longer erased to the empty string (which used to make
        # two different non-ASCII answers count as a perfect match).
        s = re.sub(r'[^\w\s]|_', '', s)
        s = re.sub(r'\b(a|an|the)\b', ' ', s)
        return ' '.join(s.split())

    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()

    # Nothing left after normalisation on at least one side: agree only if both are empty.
    if not pred_tokens or not gold_tokens:
        return float(pred_tokens == gold_tokens)

    # Counter intersection keeps the minimum count per token (multiset overlap).
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * (precision * recall) / (precision + recall)



In [22]:
# Run both pipelines on the leading rows of `df` and record, per question, each
# framework's answer, its wall-clock latency, and its token-level F1 against the
# reference answer. Rows without a reference answer are skipped.
MAX_EVAL_ROWS = 30  # upper bound on the number of rows examined
results = []

# Bounded by len(df) so a shorter table does not raise a KeyError.
for i in range(min(MAX_EVAL_ROWS, len(df))):
    query = df.loc[i, 'question']
    ground_truth = df.loc[i, 'answer']

    # Missing answers are presumably read back from the CSV as NaN (a float),
    # hence the isinstance check in addition to the empty-string check.
    if not isinstance(ground_truth, str) or ground_truth.strip() == "":
        print(f"Skipping entry {i} due to empty ground truth.")
        continue

    # LangChain — `invoke` replaces the deprecated `Chain.run`; RetrievalQA takes
    # the question under "query" and returns the answer text under "result".
    t0 = time.perf_counter()  # monotonic clock, unaffected by system clock changes
    answer_lc = qa_chain.invoke({"query": query})["result"]
    t1 = time.perf_counter()

    # LlamaIndex
    answer_li = llama_query_engine.query(query)
    t2 = time.perf_counter()

    answer_li_text = str(answer_li)  # the query returns a response object

    results.append({
        "Query": query,
        "Ground Truth": ground_truth,

        "LangChain Answer": answer_lc,
        "LangChain Time (s)": round(t1 - t0, 2),
        "LangChain F1": round(f1_score(answer_lc, ground_truth), 2),

        "LlamaIndex Answer": answer_li_text,
        "LlamaIndex Time (s)": round(t2 - t1, 2),
        "LlamaIndex F1": round(f1_score(answer_li_text, ground_truth), 2),
    })


Skipping entry 5 due to empty ground truth.
Skipping entry 6 due to empty ground truth.
Skipping entry 7 due to empty ground truth.
Skipping entry 8 due to empty ground truth.
Skipping entry 12 due to empty ground truth.
Skipping entry 13 due to empty ground truth.
Skipping entry 14 due to empty ground truth.
Skipping entry 15 due to empty ground truth.
Skipping entry 16 due to empty ground truth.
Skipping entry 19 due to empty ground truth.
Skipping entry 20 due to empty ground truth.
Skipping entry 24 due to empty ground truth.
Skipping entry 25 due to empty ground truth.
Skipping entry 26 due to empty ground truth.
Skipping entry 27 due to empty ground truth.
Skipping entry 29 due to empty ground truth.


In [23]:
# Write the per-question comparison records to a CSV file (no index column).
results_frame = pd.DataFrame(results)
results_frame.to_csv("rag_comparison_results.csv", index=False)