<a href="https://colab.research.google.com/github/ShayanPervez/Advanced-RAG/blob/main/Reranking_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [80]:
documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]


In [81]:
!pip install -U sentence-transformers



In [82]:
from sentence_transformers import SentenceTransformer

In [83]:
model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"

In [84]:
model = SentenceTransformer(model_name)

In [85]:
documents

['This is a list which containing sample documents.',
 'Keywords are important for keyword-based search.',
 'Document analysis involves extracting keywords.',
 'Keyword-based search relies on sparse embeddings.',
 'Understanding document structure aids in keyword extraction.',
 'Efficient keyword extraction enhances search accuracy.',
 'Semantic similarity improves document retrieval performance.',
 'Machine learning algorithms can optimize keyword extraction methods.']

In [86]:
len(documents)

8

In [87]:
document_embedding = model.encode(documents)

In [88]:
for i,embedding in enumerate(document_embedding):
  print(f"Document {i+1} embedding shape: {embedding.shape}")

Document 1 embedding shape: (768,)
Document 2 embedding shape: (768,)
Document 3 embedding shape: (768,)
Document 4 embedding shape: (768,)
Document 5 embedding shape: (768,)
Document 6 embedding shape: (768,)
Document 7 embedding shape: (768,)
Document 8 embedding shape: (768,)


In [89]:
query = "Natural Language Processing techniques enhances keyword extraction and efficiency."

In [90]:
query_embeddings = model.encode(query)

In [91]:
print("Query Embedding:", query_embeddings)

Query Embedding: [ 3.23020160e-01  3.12376142e-01  4.37102430e-02  3.77759263e-02
  1.54213488e-01 -1.35263517e-01  8.95880386e-02  1.61216408e-01
  3.48666191e-01  5.87312393e-02 -6.75413758e-02 -1.28856614e-01
  1.44728586e-01  9.94062603e-01 -1.68053105e-01 -9.35827136e-01
 -2.84742028e-01 -2.01821804e-01  1.86429441e-01 -1.95479020e-02
 -4.59007248e-02 -1.05012804e-01 -9.04946849e-02 -2.62256682e-01
  2.69456118e-01  3.36192995e-02  1.91353828e-01  2.86583841e-01
 -4.35097665e-01  1.32349402e-01 -2.19004855e-01  1.61913671e-02
  6.80357665e-02  1.21774286e-01 -7.54485577e-02 -6.96404055e-02
  1.13969639e-01 -2.22408593e-01 -4.07918155e-01  7.65999139e-05
  2.44360827e-02  4.76954132e-01  9.51557308e-02  4.23957705e-01
 -1.77598089e-01 -2.36993372e-01 -6.74693286e-01 -4.26610172e-01
  2.82047063e-01  1.35397509e-01 -1.31024078e-01  7.56369606e-02
  1.34787753e-01  8.65256116e-02  1.92143023e-02  2.89181739e-01
  2.54056212e-02  3.12729627e-01  2.35053644e-01 -1.98653266e-01
 -2.1399

In [92]:
query_embeddings.shape

(768,)

In [93]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [94]:
similarities = cosine_similarity(np.array([query_embeddings]),document_embedding)

In [95]:
similarities

array([[0.17273088, 0.43841004, 0.5416846 , 0.41144696, 0.6171361 ,
        0.7287992 , 0.55241686, 0.7289753 ]], dtype=float32)

In [96]:
most_similar_index = np.argmax(similarities)

In [97]:
most_similar_index

np.int64(7)

In [98]:
most_similar_document = documents[most_similar_index]

In [99]:
most_similar_document

'Machine learning algorithms can optimize keyword extraction methods.'

In [100]:
query

'Natural Language Processing techniques enhances keyword extraction and efficiency.'

In [101]:
similarity_score = similarities[0][most_similar_index]

In [102]:
similarity_score

np.float32(0.7289753)

In [103]:
sorted_indices = np.argsort(similarities[0])[::-1]

In [104]:
sorted_indices

array([7, 5, 4, 6, 2, 1, 3, 0])

In [105]:
ranked_document = [(documents[i], similarities[0][i]) for i in sorted_indices]

In [106]:
ranked_document

[('Machine learning algorithms can optimize keyword extraction methods.',
  np.float32(0.7289753)),
 ('Efficient keyword extraction enhances search accuracy.',
  np.float32(0.7287992)),
 ('Understanding document structure aids in keyword extraction.',
  np.float32(0.6171361)),
 ('Semantic similarity improves document retrieval performance.',
  np.float32(0.55241686)),
 ('Document analysis involves extracting keywords.', np.float32(0.5416846)),
 ('Keywords are important for keyword-based search.', np.float32(0.43841004)),
 ('Keyword-based search relies on sparse embeddings.', np.float32(0.41144696)),
 ('This is a list which containing sample documents.', np.float32(0.17273088))]

In [107]:
query

'Natural Language Processing techniques enhances keyword extraction and efficiency.'

In [108]:
print("Top 4 document")
for rank, (document, similarity) in enumerate(ranked_document[:4], start=1):
    print(f"Rank {rank}:")
    print(f"Document: {document}")
    print(f"Similarity Score: {similarity}\n")

Top 4 document
Rank 1:
Document: Machine learning algorithms can optimize keyword extraction methods.
Similarity Score: 0.7289752960205078

Rank 2:
Document: Efficient keyword extraction enhances search accuracy.
Similarity Score: 0.7287992238998413

Rank 3:
Document: Understanding document structure aids in keyword extraction.
Similarity Score: 0.6171361207962036

Rank 4:
Document: Semantic similarity improves document retrieval performance.
Similarity Score: 0.5524168610572815



In [109]:
query

'Natural Language Processing techniques enhances keyword extraction and efficiency.'

#BM25

In [110]:
!pip install rank_bm25



In [111]:
from rank_bm25 import BM25Okapi

In [112]:
top_4_documents = [doc[0] for doc in ranked_document[:4]]

In [113]:
top_4_documents

['Machine learning algorithms can optimize keyword extraction methods.',
 'Efficient keyword extraction enhances search accuracy.',
 'Understanding document structure aids in keyword extraction.',
 'Semantic similarity improves document retrieval performance.']

In [114]:
tokenized_top_4_document = [doc.split() for doc in top_4_documents]

In [115]:
tokenized_top_4_document

[['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Semantic',
  'similarity',
  'improves',
  'document',
  'retrieval',
  'performance.']]

In [116]:
tokenized_query = query.split()

In [117]:
tokenized_query

['Natural',
 'Language',
 'Processing',
 'techniques',
 'enhances',
 'keyword',
 'extraction',
 'and',
 'efficiency.']

In [118]:
bm25 = BM25Okapi(tokenized_top_4_document)

In [119]:
bm25

<rank_bm25.BM25Okapi at 0x7e12f1961990>

In [120]:
bm25_scores = bm25.get_scores(tokenized_query)

In [121]:
bm25_scores

array([0.16152501, 1.07608767, 0.17211681, 0.        ])

In [122]:
sorted_indices = np.argsort(bm25_scores)[::-1]

In [123]:
sorted_indices

array([1, 2, 0, 3])

In [124]:
top_4_documents

['Machine learning algorithms can optimize keyword extraction methods.',
 'Efficient keyword extraction enhances search accuracy.',
 'Understanding document structure aids in keyword extraction.',
 'Semantic similarity improves document retrieval performance.']

In [125]:
query

'Natural Language Processing techniques enhances keyword extraction and efficiency.'

In [126]:
reranked_document = [(top_4_documents[i],bm25_scores[i])for i in sorted_indices]

In [127]:
reranked_document

[('Efficient keyword extraction enhances search accuracy.',
  np.float64(1.0760876716588057)),
 ('Understanding document structure aids in keyword extraction.',
  np.float64(0.1721168141199951)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float64(0.16152501017414925)),
 ('Semantic similarity improves document retrieval performance.',
  np.float64(0.0))]

In [128]:
ranked_document[:4]

[('Machine learning algorithms can optimize keyword extraction methods.',
  np.float32(0.7289753)),
 ('Efficient keyword extraction enhances search accuracy.',
  np.float32(0.7287992)),
 ('Understanding document structure aids in keyword extraction.',
  np.float32(0.6171361)),
 ('Semantic similarity improves document retrieval performance.',
  np.float32(0.55241686))]

#Cross Encoder

In [129]:
from sentence_transformers import CrossEncoder

In [130]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [131]:
top_4_documents

['Machine learning algorithms can optimize keyword extraction methods.',
 'Efficient keyword extraction enhances search accuracy.',
 'Understanding document structure aids in keyword extraction.',
 'Semantic similarity improves document retrieval performance.']

In [132]:
query

'Natural Language Processing techniques enhances keyword extraction and efficiency.'

In [133]:
pairs = []
for doc in top_4_documents:
  pairs.append([query,doc])

In [134]:
pairs

[['Natural Language Processing techniques enhances keyword extraction and efficiency.',
  'Machine learning algorithms can optimize keyword extraction methods.'],
 ['Natural Language Processing techniques enhances keyword extraction and efficiency.',
  'Efficient keyword extraction enhances search accuracy.'],
 ['Natural Language Processing techniques enhances keyword extraction and efficiency.',
  'Understanding document structure aids in keyword extraction.'],
 ['Natural Language Processing techniques enhances keyword extraction and efficiency.',
  'Semantic similarity improves document retrieval performance.']]

In [135]:
scores = cross_encoder.predict(pairs)

In [136]:
scores

array([ 0.7302185,  3.808076 , -2.275875 , -8.559866 ], dtype=float32)

In [137]:
scored_docs = zip(scores,top_4_documents)
scored_docs

<zip at 0x7e12ebe5e540>

In [138]:
reranked_docs_cross_encoder = sorted(scored_docs,reverse=True)

In [139]:
reranked_docs_cross_encoder

[(np.float32(3.808076),
  'Efficient keyword extraction enhances search accuracy.'),
 (np.float32(0.7302185),
  'Machine learning algorithms can optimize keyword extraction methods.'),
 (np.float32(-2.275875),
  'Understanding document structure aids in keyword extraction.'),
 (np.float32(-8.559866),
  'Semantic similarity improves document retrieval performance.')]

In [140]:
reranked_document

[('Efficient keyword extraction enhances search accuracy.',
  np.float64(1.0760876716588057)),
 ('Understanding document structure aids in keyword extraction.',
  np.float64(0.1721168141199951)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float64(0.16152501017414925)),
 ('Semantic similarity improves document retrieval performance.',
  np.float64(0.0))]

#Cohere

In [141]:
!pip install cohere



In [142]:
import cohere

In [143]:
COHERE_API_KEY= "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"

In [144]:
co = cohere.ClientV2(COHERE_API_KEY)

In [145]:
top_4_documents

['Machine learning algorithms can optimize keyword extraction methods.',
 'Efficient keyword extraction enhances search accuracy.',
 'Understanding document structure aids in keyword extraction.',
 'Semantic similarity improves document retrieval performance.']

In [146]:
query

'Natural Language Processing techniques enhances keyword extraction and efficiency.'

In [147]:
response = co.rerank(
    model="rerank-v3.5",
    query=query,
    documents=top_4_documents,
    return_documents=True
)

In [148]:
for result in response.results:
    print(result)

document=V2RerankResponseResultsItemDocument(text='Efficient keyword extraction enhances search accuracy.') index=1 relevance_score=0.54942364
document=V2RerankResponseResultsItemDocument(text='Machine learning algorithms can optimize keyword extraction methods.') index=0 relevance_score=0.528315
document=V2RerankResponseResultsItemDocument(text='Understanding document structure aids in keyword extraction.') index=2 relevance_score=0.41137156
document=V2RerankResponseResultsItemDocument(text='Semantic similarity improves document retrieval performance.') index=3 relevance_score=0.18172778


#BERT

In [149]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

In [150]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [151]:
sentence_a = "The movie was fantastic."
sentence_b = "I really enjoyed the film."

In [152]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', padding=True, truncation=True)

In [153]:
output = model(**inputs)
logits = output.logits

In [154]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-0.2155,  0.3329]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [155]:
logits

tensor([[-0.2155,  0.3329]], grad_fn=<AddmmBackward0>)

In [156]:
probs = torch.softmax(logits, dim=1)

In [157]:
probs

tensor([[0.3663, 0.6337]], grad_fn=<SoftmaxBackward0>)

In [158]:
similarity_score = probs[0][1].item()
print(f"Similarity score is: {similarity_score}")

Similarity score is: 0.6337465643882751
