In [None]:
!pip install datasets

In [None]:
!pip install sentence_transformers

In [24]:
import random
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

from transformers import pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util, CrossEncoder

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Load MS_MARCO Dataset

In [25]:
dataset = load_dataset("ms_marco",  "v1.1", trust_remote_code=True)

In [26]:
train_data = dataset['train'][:100]

In [27]:
print(train_data.keys())

dict_keys(['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'])


In [28]:
queries = train_data['query']
passages = [entry['passage_text'] for entry in train_data['passages']]
answers = train_data['answers']

### Preview Data

In [29]:
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.colheader_justify', 'left')

df = pd.DataFrame({
    'query': queries,
    'passage': passages,
    'answer': answers,
})

print(df.head())

  query                                                   \
0                                            what is rba   
1                           was ronald reagan a democrat   
2  how long do you need for sydney and surrounding areas   
3                        price to install tile in shower   
4                        why conversion observed in body   

  passage                                                                                               \
0  [Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scanda...   
1  [In his younger years, Ronald Reagan was a member of the Democratic Party and campaigned for Dem...   
2  [Sydney, New South Wales, Australia is located in a coastal basin bordered by the Pacific Ocean ...   
3  [In regards to tile installation costs, consumers can expect to pay an average of $25 per square...   
4  [Conclusions: In adult body CT, dose to an organ fully encompassed by the primary radiation beam...   

  

## Query Preparation

In [30]:
def clean_query(query):
    # Remove special characters and digits, keep only letters and spaces
    cleaned_query = re.sub(r'[^a-zA-Z\s]', '', query)

    cleaned_query = cleaned_query.lower()

    return cleaned_query

## Answer Preparation

### Embedder

In [31]:
model_name = 'all-mpnet-base-v2'
embedder = SentenceTransformer(model_name)

##### Embed all passages

In [32]:
passage_embeddings = embedder.encode(passages, convert_to_tensor=True)

### Function to Extract the 3 most relevant sentences from the passage

In [33]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [34]:
def extract_relevant_sentence(query, passage, top_k=3):
    # Split passage into sentences
    sentences = sent_tokenize(passage)

    # Create sentence-query pairs
    sentence_query_pairs = [(query, sentence) for sentence in sentences]

    # Rank sentences using the Cross Encoder
    scores = cross_encoder.predict(sentence_query_pairs)

    # Get indices of sentences with top-k highest relevance (based on cross-encoder scores)
    top_sentence_indices = np.argsort(scores)[::-1][:top_k]

    # Get the most relevant sentences based on the top indices
    relevant_sentences = [sentences[i] for i in top_sentence_indices]

    return " ".join(relevant_sentences)

### Set up Summarizer

In [35]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Device set to use cpu


In [36]:
def summarize_text(text, max_length, min_length):
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False) # Deterministic not Random Output
    return summary[0]['summary_text']

## Testing the Model

### Query based on the queries that came with the dataset

In [37]:
queries_to_use = random.sample(queries, 5)
print(queries_to_use)

['How much will it cost to go to college to become a detective', 'cal water phone number torrance CA', 'where are the lungs located in the back', 'describe the pathway of a nerve impulse through a reflex arc', 'what is conduction']


In [38]:
preview = []
queries_embeddings = embedder.encode(queries_to_use, convert_to_tensor=True)
hit = util.semantic_search(queries_embeddings, passage_embeddings, top_k=1)

for i in range(len(queries_to_use)):
    query = clean_query(queries_to_use[i])

    # Get most relevant passage
    result = hit[i][0]
    corpus_id = result['corpus_id']  # Use corpus_id to find the passage
    passage = ' '.join(passages[corpus_id])

    # Extract the 3 most relevant sentences from the passage
    best_sentences = extract_relevant_sentence(query, passage, top_k=3)

    # Summarize the best sentences
    summary = summarizer(best_sentences, max_length=50, min_length=10)

    # Store the preview data for this query
    preview.append({
        "query": query,
        "best_sentences": best_sentences,
        "summary": summary,
        "query_passage_similarity_score": result["score"]
    })

In [39]:
# Remove key 'summary_text'
for entry in preview:
    entry['summary'] = entry['summary'][0]['summary_text']

In [40]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.colheader_justify', 'left')

df_preview = pd.DataFrame(preview)
df_preview_styled = df_preview.style.set_properties(**{'text-align': 'left'})

df_preview_styled

Unnamed: 0,query,best_sentences,summary,query_passage_similarity_score
0,how much will it cost to go to college to become a detective,"At $80 per community college credit, a criminal justice degree costs about $2,500 each year. A: The degree that you need to be a detective can be a minimum of a high school diploma, GED or, preferably, an associate's degree in a subject related to law, criminal justice, forensics, crime scene investigation or legal studies. They only plan to hire 55 If you choose to get your criminal justice degree at a community college, you’ll pay about $80 per credit.","At $80 per community college credit, a criminal justice degree costs about $2,500 each year. The degree that you need to be a detective can be a minimum of a high school diploma, GED or, preferably, an",0.603885
1,cal water phone number torrance ca,"Torrance Municipal Water Utility can be reached at 1-855-354-5623, and California Water Service Company can be contacted at 1-310-257-1400. For after-hours emergencies, please call Torrance Public Safety at 1-310-328-3456. Torrance is a city in the South Bay (southwestern) region of Los Angeles County, California, United States.","Torrance is a city in the South Bay (southwestern) region of Los Angeles County, California, United States. Torrance Municipal Water Utility can be reached at 1-855-354-5623.",0.581574
2,where are the lungs located in the back,"2 Where are the lungs located in the back. In humans, the lungs are located on either side of the heart in the chest, with the left lung sharing the left side of the space with the heart, which sits in an impression called the cardiac notch. The lungs are a pair of spongy, air-filled organs located on either side of the chest (thorax).","The lungs are a pair of spongy, air-filled organs located on either side of the chest (thorax) In humans, the lungs are located onEither side of a heart in the chest. The left lung shares",0.70743
3,describe the pathway of a nerve impulse through a reflex arc,"the circuit traveled by impulses producing a reflex action: from the receptor organ, through the afferent nerve, nerve center, efferent nerve, A reflex arc is a neural pathway that controls a reflex action. The nerve impulse travels through the reflex arc. Reflex arc is the path of impulse that travels from receptor (ex.",A reflex arc is a neural pathway that controls a reflex action. The nerve impulse travels through the reflex arc. Reflex arc is the path of impulse that travels from receptor to receptor.,0.825541
4,what is conduction,Conduction is the transfer of energy in the form of heat or electricity from one atom to another within an object by direct contact. Conduction is the transfer of heat from one molecule to another through a substance. Conduction is the transfer of energy through matter from particle to particle.,Conduction is the transfer of energy in the form of heat or electricity from one atom to another within an object by direct contact.,0.619968


### Input your own query

In [43]:
my_query = clean_query(input("My Question: "))
my_query_embedding = embedder.encode([my_query], convert_to_tensor=True)
my_hits = util.semantic_search(my_query_embedding, passage_embeddings, top_k=1)

my_preview = []

for result in my_hits[0]: # Most relevant passage
    corpus_id = result["corpus_id"]
    passage = " ".join(passages[corpus_id])

    # Extract the 3 most relevant sentences
    best_sentences = extract_relevant_sentence(my_query, passage, top_k=3)

    # Summarize the best sentences
    summary = summarize_text(best_sentences, max_length=50, min_length=10)

    my_preview.append({
        "query": my_query,
        "best_sentences": best_sentences,
        "summary": summary,
        "query_passage_similarity_score": result["score"]
    })

My Question: Should I get vaccinated?


In [44]:
df_my_preview = pd.DataFrame(my_preview)
df_my_preview_styled = df_my_preview.style.set_properties(**{'text-align': 'left'})

df_my_preview_styled

Unnamed: 0,query,best_sentences,summary,query_passage_similarity_score
0,should i get vaccinated,"For most patients, vaccination is a safe and effective way to prevent German measles (rubella). The German measles vaccine is typically combined with vaccines for the measles and mumps, as well as varicella, the virus that causes chicken pox. Quick Answer.","For most patients, vaccination is a safe and effective way to prevent German measles. The German measles vaccine is typically combined with vaccines for the measles and mumps, as well as varicella.",0.228706
