<a href="https://colab.research.google.com/github/Samin-Sadaf7/Langchain_Fundamentals/blob/main/Reranking_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]


In [2]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1


In [3]:
from sentence_transformers import SentenceTransformer

In [4]:
model_name = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'

In [5]:
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
document_embeddings = model.encode(documents)

In [9]:
len(document_embeddings[0])

768

In [10]:
query = "Natural language processing techniques enhance keyword extraction efficiency."

In [11]:
query_embedding = model.encode(query)

In [13]:
len(query_embedding)

768

In [14]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
similarities = cosine_similarity(np.array([query_embedding]), document_embeddings)

In [16]:
similarities

array([[0.16948149, 0.4580228 , 0.5675695 , 0.441233  , 0.6316118 ,
        0.75214136, 0.550352  , 0.74481666]], dtype=float32)

In [17]:
most_similar_index = np.argmax(similarities)

In [18]:
most_similar_index

5

In [19]:
most_similar_document = documents[most_similar_index]

In [20]:
most_similar_document

'Efficient keyword extraction enhances search accuracy.'

In [22]:
similarity_score = similarities[0][most_similar_index]

In [23]:
similarity_score

0.75214136

In [24]:
sorting_indices = np.argsort(similarities[0])[::-1]

In [25]:
sorting_indices

array([5, 7, 4, 2, 6, 1, 3, 0])

In [26]:
ranked_documents = [(documents[i], similarities[0][i])for i in sorting_indices]

In [27]:
ranked_documents

[('Efficient keyword extraction enhances search accuracy.', 0.75214136),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  0.74481666),
 ('Understanding document structure aids in keyword extraction.', 0.6316118),
 ('Document analysis involves extracting keywords.', 0.5675695),
 ('Semantic similarity improves document retrieval performance.', 0.550352),
 ('Keywords are important for keyword-based search.', 0.4580228),
 ('Keyword-based search relies on sparse embeddings.', 0.441233),
 ('This is a list which containing sample documents.', 0.16948149)]

In [28]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [29]:
from rank_bm25 import BM25Okapi

In [30]:
top_4_documents = [doc[0] for doc in ranked_documents[:4]]

In [31]:
tokenized_top_4_documents = [doc.split() for doc in top_4_documents]

In [32]:
tokenized_top_4_documents

[['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Document', 'analysis', 'involves', 'extracting', 'keywords.']]

In [33]:
tokenized_query = query.split()

In [34]:
tokenized_query

['Natural',
 'language',
 'processing',
 'techniques',
 'enhance',
 'keyword',
 'extraction',
 'efficiency.']

In [35]:
bm25 =  BM25Okapi(tokenized_top_4_documents)

In [36]:
bm25

<rank_bm25.BM25Okapi at 0x7b68603c8eb0>

In [37]:
bm25_score = bm25.get_scores(tokenized_query)

In [38]:
bm25_score

array([0.1907998 , 0.16686672, 0.17803252, 0.        ])

In [39]:
sorted_indices2 = np.argsort(bm25_score)[::-1]

In [41]:
reranked_documents = [(top_4_documents[i], bm25_score[i]) for i in sorted_indices2]

In [43]:
print("Rerank of top 4 Documents:")
for rank, (document, similarity) in enumerate(reranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Rerank of top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.19079979534096053
Rank 2: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.1780325227902643
Rank 3: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.1668667199671815
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.0


In [44]:
print("Rerank of top 4 Documents:")
for rank, (document, similarity) in enumerate(ranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Rerank of top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521413564682007
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448166608810425
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.6316118240356445
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5675694942474365
Rank 5: Document - 'Semantic similarity improves document retrieval performance.', Similarity Score - 0.5503519773483276
Rank 6: Document - 'Keywords are important for keyword-based search.', Similarity Score - 0.458022803068161
Rank 7: Document - 'Keyword-based search relies on sparse embeddings.', Similarity Score - 0.4412330090999603
Rank 8: Document - 'This is a list which containing sample documents.', Similarity Score - 0.16948148608207703


In [45]:
from sentence_transformers import CrossEncoder

In [46]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [47]:
pairs = []
for doc in top_4_documents:
    pairs.append([query, doc])

In [48]:
pairs

[['Natural language processing techniques enhance keyword extraction efficiency.',
  'Efficient keyword extraction enhances search accuracy.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Machine learning algorithms can optimize keyword extraction methods.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Understanding document structure aids in keyword extraction.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Document analysis involves extracting keywords.']]

In [49]:
scores = cross_encoder.predict(pairs)
scores

array([ 3.137871 ,  0.8421656, -2.9193   , -2.8781917], dtype=float32)

In [50]:
scored_docs = zip(scores, top_4_documents)

In [51]:
reranked_document_cross_encoder = sorted(scored_docs, reverse=True)

In [52]:
reranked_document_cross_encoder

[(3.137871, 'Efficient keyword extraction enhances search accuracy.'),
 (0.8421656,
  'Machine learning algorithms can optimize keyword extraction methods.'),
 (-2.8781917, 'Document analysis involves extracting keywords.'),
 (-2.9193, 'Understanding document structure aids in keyword extraction.')]