In [None]:
!pip install datasets

In [None]:
!pip install sentence_transformers

In [94]:
import random
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

from transformers import pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Load MS_MARCO Dataset

In [4]:
dataset = load_dataset("ms_marco",  "v1.1", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10047 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/82326 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9650 [00:00<?, ? examples/s]

In [142]:
train_data = dataset['train'][:100]

In [143]:
print(train_data.keys())

dict_keys(['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'])


In [156]:
queries = train_data['query']
passages = [entry['passage_text'] for entry in train_data['passages']]
answers = train_data['answers']

### Preview Data

In [169]:
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.colheader_justify', 'left')

df = pd.DataFrame({
    'query': queries,
    'passage': passages,
    'answer': answers,
})

print(df.head())

  query                                                   \
0                                            what is rba   
1                           was ronald reagan a democrat   
2  how long do you need for sydney and surrounding areas   
3                        price to install tile in shower   
4                        why conversion observed in body   

  passage                                                                                               \
0  [Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scanda...   
1  [In his younger years, Ronald Reagan was a member of the Democratic Party and campaigned for Dem...   
2  [Sydney, New South Wales, Australia is located in a coastal basin bordered by the Pacific Ocean ...   
3  [In regards to tile installation costs, consumers can expect to pay an average of $25 per square...   
4  [Conclusions: In adult body CT, dose to an organ fully encompassed by the primary radiation beam...   

  

## Embedder

In [146]:
model_name = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(model_name)

### Embed all passages

In [147]:
passage_embeddings = embedder.encode(passages, convert_to_tensor=True)

## Function to Extract the 3 most relevant sentences from the passage

In [181]:
def extract_relevant_sentence(query_embedding, passage, top_k=3):

    # Split passage into sentences
    sentences = sent_tokenize(passage)

    sentence_embeddings = embedder.encode(sentences)

    # Compute similarity
    similarities = util.semantic_search(query_embedding, sentence_embeddings)[0]

    # Get indices of sentence with top k highest relevance then get sentences
    top_sentence_indices = similarities.argsort(descending=True)[:top_k].tolist()
    relevant_sentences = [sentences[i] for i in top_sentence_indices]

    return " ".join(relevant_sentences)

## Set up Summarizer

In [149]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Device set to use cpu


In [150]:
def summarize_text(text, max_length, min_length):
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False) # Deterministic not Random Output
    return summary[0]['summary_text']

## Query based on the queries that came with the dataset

In [171]:
queries_to_use = queries[0:5]

In [175]:
queries_to_use = random.sample(queries, 5)

In [177]:
preview = []
queries_embeddings = embedder.encode(queries_to_use, convert_to_tensor=True)
hit = util.semantic_search(queries_embeddings, passage_embeddings, top_k=1)

for i in range(len(queries_to_use)):
    query = queries_to_use[i]
    query_embedding = queries_embeddings[i]  # Get the embedding for the current query

    # Get most relevant passage
    result = hit[i][0]
    corpus_id = result['corpus_id']  # Use corpus_id to find the  passage
    passage = ' '.join(passages[corpus_id])

    # Extract the 3 most relevant sentences from the passage
    best_sentences = extract_relevant_sentence(query_embedding, passage, top_k=3)

    # Summarize the best sentences
    summary = summarizer(best_sentences, max_length=50, min_length=10)

    # Store the preview data for this query
    preview.append({
        "query": query,
        "best_sentences": best_sentences,
        "summary": summary,
        "query_passage_similarity_score": result["score"]
    })

Your max_length is set to 50, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 50, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)


In [178]:
# Remove key 'summary_text'
for entry in preview:
    entry['summary'] = entry['summary'][0]['summary_text']

In [179]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.colheader_justify', 'left')

df_preview = pd.DataFrame(preview)
df_preview_styled = df_preview.style.set_properties(**{'text-align': 'left'})

df_preview_styled

Unnamed: 0,query,best_sentences,summary,query_passage_similarity_score
0,what is a goods receipt,"A Goods Receipt is a document issued to acknowledge the receipt of the items listed in it. A goods receipt is the physical inbound movement of goods or materials into the warehouse. Select language: Goods Receipt Use With the goods receipt (GR) you post the physical inward movement of goods from an external vendor or from production and then complete a goods movement, which leads to an increase in the warehouse stock.",A Goods Receipt is a document issued to acknowledge the receipt of the items listed in it. A goods receipt is the physical inbound movement of goods or materials into the warehouse.,0.710822
1,what is kuchen,Kuchen. Kuchen. Kuchen is a tasty dessert with a dough crust and custard filling.,Kuchen is a tasty dessert with a dough crust and custard filling.,0.668515
2,types of bacterial respiration,The respiration in bacteria is basically of two types aerobic and anaerobic which may be obligate or facultative. The two primary methods of bacterial respiration are aerobic respiration and anaerobic respiration. Some species of bacteria go through the process of cellular respiration.,The respiration in bacteria is basically of two types aerobic and anaerobic which may be obligate or facultative. Some species of bacteria go through the process of cellular respiration. The two primary methods of bacterial respiration are aerobic,0.736945
3,how long is german measles contagious,"A: The German measles are contagious for 7 days before to 7 days after the rash appears, as noted by the New York State Department of Health. The German measles are contagious for 7 days before to 7 days after the rash appears, as noted by the New York State Department of Health. German measles (rubella) is caused by a highly contagious virus.","The German measles are contagious for 7 days before to 7 days after the rash appears, as noted by the New York State Department of Health.",0.73925
4,gayla name meaning,"English Meaning: The name Gayla is an English baby name. Meaning of Gayla. In English, the name Gayla means-festive party.The name Gayla originated as an English name.","Gayla is an English baby name. In English, the name Gayla means-festive party.",0.886448


## Input your own query

In [180]:
my_query = input("My Question: ")
my_query_embedding = embedder.encode([my_query])

my_hits = util.semantic_search(my_query_embedding, passage_embeddings, top_k=1)

my_preview = []

for result in my_hits[0]: # Most relevant passage
    corpus_id = result["corpus_id"] # get index of the most relevant passage
    passage = " ".join(passages[corpus_id])

    # Extract the 3 most relevant sentences
    best_sentences = extract_relevant_sentence(my_query_embedding, passage, top_k=3)

    # Summarize
    summary = summarize_text(best_sentences, max_length=50, min_length=10)

    my_preview.append({
        "query": my_query,
        "best_sentences": best_sentences,
        "summary": summary,
        "query_passage_similarity_score": result["score"]
    })

My Question: How to be a good rainbow


In [184]:
df_my_preview = pd.DataFrame(my_preview)
df_my_preview_styled = df_my_preview.style.set_properties(**{'text-align': 'left'})

df_my_preview_styled

Unnamed: 0,query,best_sentences,summary,query_passage_similarity_score
0,How to be a good rainbow,"Add a tiny amount of ultramarine blue to darken the hue while maintaining a light overall orange gold tone. You could even try using yellowie gold and adding gold sparkles (found in many craft stores) to give it the glint of gold. To create gold in watercolor, use yellow ochre for the highlighted sections and mix the yellow with raw sienna and carmine for the shadows.",Use yellow ochre for highlighted sections and mix the yellow with raw sienna and carmine for the shadows. Add a tiny amount of ultramarine blue to darken the hue while maintaining a light overall orange gold tone.,0.354126
