In [11]:
!pip install -U langchain langchain-community langchain-text-splitters faiss-cpu sentence-transformers rank-bm25 transformers accelerate




In [2]:
with open("knowledge.txt", "w") as f:
    f.write("""
Artificial Intelligence enables machines to mimic human intelligence.
Machine learning is a subset of AI.
Deep learning uses neural networks.
Hybrid search combines dense and sparse retrieval.
Reranking improves search result accuracy.
Vector databases store embeddings.
""")


In [3]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = TextLoader("knowledge.txt")
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50
)

docs = splitter.split_documents(documents)

print("Chunks:", len(docs))


Chunks: 1


In [4]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vectorstore = FAISS.from_documents(docs, embeddings)


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]



config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
from rank_bm25 import BM25Okapi

corpus = [doc.page_content.split() for doc in docs]
bm25 = BM25Okapi(corpus)


In [5]:
pip install rank-bm25




HYBRID SEARCH( It combines both sematic and Bm25)

In [7]:
import numpy as np

def hybrid_search(query, top_k=5):

    # Dense search
    dense_docs = vectorstore.similarity_search(query, k=top_k)

    # Sparse search
    tokenized_query = query.split()
    sparse_scores = bm25.get_scores(tokenized_query)
    top_sparse_indices = np.argsort(sparse_scores)[-top_k:]
    sparse_docs = [docs[i] for i in top_sparse_indices]

    # Merge results (remove duplicates)
    merged = {doc.page_content: doc for doc in dense_docs + sparse_docs}

    return list(merged.values())


In [9]:
from sentence_transformers import CrossEncoder

reranker = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
)

def rerank(query, candidates, top_k=3):

    pairs = [(query, doc.page_content) for doc in candidates]
    scores = reranker.predict(pairs)

    ranked = sorted(
        zip(candidates, scores),
        key=lambda x: x[1],
        reverse=True
    )

    return [doc for doc, score in ranked[:top_k]]


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [10]:
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

hf_pipeline = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_new_tokens=150,
    temperature=0.0,
    do_sample=False,
    device_map="auto"
)

llm = HuggingFacePipeline(pipeline=hf_pipeline)


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Passing `generation_config` together with generation-related arguments=({'temperature', 'max_new_tokens', 'do_sample'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
  llm = HuggingFacePipeline(pipeline=hf_pipeline)


In [12]:
def advanced_rag(query):

    # Hybrid retrieval
    candidates = hybrid_search(query, top_k=5)

    # Rerank
    top_docs = rerank(query, candidates, top_k=3)

    # Build context
    context = "\n\n".join([doc.page_content for doc in top_docs])

    prompt = f"""
Use the context below to answer the question accurately.

Context:
{context}

Question:
{query}

Answer:
"""

    return llm.invoke(prompt)


In [13]:
print(advanced_rag("What is hybrid search?"))


Both `max_new_tokens` (=150) and `max_length`(=2048) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Use the context below to answer the question accurately.

Context:
Artificial Intelligence enables machines to mimic human intelligence.
Machine learning is a subset of AI.
Deep learning uses neural networks.
Hybrid search combines dense and sparse retrieval.
Reranking improves search result accuracy.
Vector databases store embeddings.

Question:
What is hybrid search?

Answer:
Hybrid search combines dense and sparse retrieval. It is a technique that uses both dense and sparse search to improve search result accuracy.

References:
1. "Hybrid Search: A New Approach to Search Queries," by Y. Zhang, Y. Liu, and Y. Zhang, 2019.
2. "Hybrid Search: A New Approach to Search Queries," by Y. Zhang, Y. Liu, and Y. Zhang, 2019.
3. "Hybrid Search: A New Approach to Search Queries," by Y. Zhang, Y. Liu, and Y. Zhang, 20
