In [15]:
!pip install datasets

In [16]:
!pip install sentence_transformers

In [60]:
import pandas as pd

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split

## Load and Prepare Data

In [94]:
dataset = load_dataset("ms_marco",  "v1.1", trust_remote_code=True)

In [58]:
train_data = dataset['train'][:100]

In [56]:
print(train_data.keys())

dict_keys(['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'])


In [78]:
queries = train_data['query']
passages = [entry['passage_text'] for entry in train_data['passages']]
answers = train_data['answers']

In [96]:
print(passages[0])

["Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.", "The Reserve Bank of Australia (RBA) came into being on 14 January 1960 as Australia 's central bank and banknote issuing authority, when the Reserve Bank Act 1959 removed the central banking functions from the Commonwealth Bank. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site.", 'RBA Rec

In [66]:
df = pd.DataFrame({
    'query': queries,
    'passage': passages,
    'answer': answers,
})

print(df.head())

                                               query  \
0                                        what is rba   
1                       was ronald reagan a democrat   
2  how long do you need for sydney and surroundin...   
3                    price to install tile in shower   
4                    why conversion observed in body   

                                             passage  \
0  [Since 2007, the RBA's outstanding reputation ...   
1  [In his younger years, Ronald Reagan was a mem...   
2  [Sydney, New South Wales, Australia is located...   
3  [In regards to tile installation costs, consum...   
4  [Conclusions: In adult body CT, dose to an org...   

                                              answer  
0  [Results-Based Accountability is a disciplined...  
1                                              [Yes]  
2                                    [20-25 minutes]  
3                       [$11 to $22 per square foot]  
4                      [Due to symptoms in the body

## Embed Text

In [67]:
model_name = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(model_name)

In [69]:
query_embeddings = embedder.encode(queries, convert_to_tensor=True)
passage_embeddings = embedder.encode(passages, convert_to_tensor=True)

## Query based on queries in the dataset

In [73]:
hits = []
for query_embedding in query_embeddings:
    search_hits = util.semantic_search(query_embedding, passage_embeddings, top_k=5)
    hits.append(search_hits)

In [75]:
hits[0]

[[{'corpus_id': 0, 'score': 0.5272153615951538},
  {'corpus_id': 79, 'score': 0.146012544631958},
  {'corpus_id': 85, 'score': 0.138194277882576},
  {'corpus_id': 41, 'score': 0.13073547184467316},
  {'corpus_id': 92, 'score': 0.09723972529172897}]]

In [91]:
results_preview = []

for i, hit in enumerate(hits[:5]): # 5 Queries
    query_preview = queries[i]

    for result in hit[0][:2]:  # Top 2 results from the search
        corpus_id = result['corpus_id']  # Use corpus_id to get the passage index
        passage = passages[corpus_id]
        results_preview.append({
            'query': query_preview,
            'passage': passage,
            'score': result['score']
        })

df_results_preview = pd.DataFrame(results_preview)
df_results_preview.head(10)

Unnamed: 0,query,passage,score
0,what is rba,"[Since 2007, the RBA's outstanding reputation ...",0.527215
1,what is rba,[Create the yellow gold of alfalfa honey from ...,0.146013
2,was ronald reagan a democrat,"[In his younger years, Ronald Reagan was a mem...",0.711876
3,was ronald reagan a democrat,[Proposing an amendment to the Constitution of...,0.219083
4,how long do you need for sydney and surroundin...,"[Sydney, New South Wales, Australia is located...",0.475201
5,how long do you need for sydney and surroundin...,[The 14 teams participating in New Zealand's I...,0.265859
6,price to install tile in shower,"[In regards to tile installation costs, consum...",0.877027
7,price to install tile in shower,[Basic concrete slab cost. I'll base my price ...,0.381413
8,why conversion observed in body,"[Conclusions: In adult body CT, dose to an org...",0.406962
9,why conversion observed in body,[Gas exchange is the delivery of oxygen from t...,0.294063


## Inputing your own query

In [95]:
my_query = input('My Question: ')
my_query_embedding = embedder.encode([my_query])

my_hits = util.semantic_search(my_query_embedding, passage_embeddings, top_k=3)

my_preview = []

for result in my_hits[0][:3]:
    corpus_id = result['corpus_id']
    passage = passages[corpus_id]
    my_preview.append({
        'query': my_query,
        'passage': passage,
        'score': result['score']
    })

df_my_preview = pd.DataFrame(my_preview)
df_my_preview.head(10)

My Question: anything about girls


Unnamed: 0,query,passage,score
0,anything about girls,"[In English, the name Gayla means-festive part...",0.125701
1,anything about girls,"[©Constant Contact, Inc. All rights reserved. ...",0.12503
2,anything about girls,[Assault on a Police Officer. Assault on a pol...,0.113609
