In [None]:
!pip install datasets

In [None]:
!pip install sentence_transformers

In [123]:
import random
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

from transformers import pipeline
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util, CrossEncoder

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Load MS_MARCO Dataset

In [64]:
dataset = load_dataset("ms_marco",  "v1.1", trust_remote_code=True)

In [None]:
train_data = dataset['train'][:100]

In [None]:
print(train_data.keys())

dict_keys(['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'])


In [None]:
queries = train_data['query']
passages = [entry['passage_text'] for entry in train_data['passages']]
answers = train_data['answers']

### Preview Data

In [122]:
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.colheader_justify', 'left')

df = pd.DataFrame({
    'query': queries,
    'passage': passages,
    'answer': answers,
})

print(df.head())

  query                                                   \
0                                            what is rba   
1                           was ronald reagan a democrat   
2  how long do you need for sydney and surrounding areas   
3                        price to install tile in shower   
4                        why conversion observed in body   

  passage                                                                                               \
0  [Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scanda...   
1  [In his younger years, Ronald Reagan was a member of the Democratic Party and campaigned for Dem...   
2  [Sydney, New South Wales, Australia is located in a coastal basin bordered by the Pacific Ocean ...   
3  [In regards to tile installation costs, consumers can expect to pay an average of $25 per square...   
4  [Conclusions: In adult body CT, dose to an organ fully encompassed by the primary radiation beam...   

  

## Query Preparation

In [88]:
def clean_query(query):
    # Remove special characters and digits, keep only letters and spaces
    cleaned_query = re.sub(r'[^a-zA-Z\s]', '', query)

    cleaned_query = cleaned_query.lower()

    return cleaned_query

## Answer Preparation

### Embedder

In [None]:
model_name = 'all-mpnet-base-v2'
embedder = SentenceTransformer(model_name)

##### Embed all passages

In [60]:
passage_embeddings = embedder.encode(passages, convert_to_tensor=True)

### Function to Extract the 3 most relevant sentences from the passage

In [None]:
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [81]:
def extract_relevant_sentence(query, passage, top_k=3):
    # Split passage into sentences
    sentences = sent_tokenize(passage)

    # Create sentence-query pairs
    sentence_query_pairs = [(query, sentence) for sentence in sentences]

    # Rank sentences using the Cross Encoder
    scores = cross_encoder.predict(sentence_query_pairs)

    # Get indices of sentences with top-k highest relevance (based on cross-encoder scores)
    top_sentence_indices = np.argsort(scores)[::-1][:top_k]

    # Get the most relevant sentences based on the top indices
    relevant_sentences = [sentences[i] for i in top_sentence_indices]

    return " ".join(relevant_sentences)

### Set up Summarizer

In [None]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def summarize_text(text, max_length, min_length):
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False) # Deterministic not Random Output
    return summary[0]['summary_text']

## Testing the Model

### Query based on the queries that came with the dataset

In [92]:
queries_to_use = random.sample(queries, 5)
print(queries_to_use)

['what is the salary of a person with a biology degree', 'what are monocytes', 'what is slime', 'temperature of neptune in fahrenheit', 'gayla name meaning']


In [93]:
preview = []
queries_embeddings = embedder.encode(queries_to_use, convert_to_tensor=True)
hit = util.semantic_search(queries_embeddings, passage_embeddings, top_k=1)

for i in range(len(queries_to_use)):
    query = clean_query(queries_to_use[i])

    # Get most relevant passage
    result = hit[i][0]
    corpus_id = result['corpus_id']  # Use corpus_id to find the passage
    passage = ' '.join(passages[corpus_id])

    # Extract the 3 most relevant sentences from the passage
    best_sentences = extract_relevant_sentence(query, passage, top_k=3)

    # Summarize the best sentences
    summary = summarizer(best_sentences, max_length=50, min_length=10)

    # Store the preview data for this query
    preview.append({
        "query": query,
        "best_sentences": best_sentences,
        "summary": summary,
        "query_passage_similarity_score": result["score"]
    })

Your max_length is set to 50, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)


In [94]:
# Remove key 'summary_text'
for entry in preview:
    entry['summary'] = entry['summary'][0]['summary_text']

In [119]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.colheader_justify', 'left')

df_preview = pd.DataFrame(preview)
df_preview_styled = df_preview.style.set_properties(**{'text-align': 'left'})

df_preview_styled

Unnamed: 0,query,best_sentences,summary,query_passage_similarity_score
0,what is the salary of a person with a biology degree,"Biology majors who don’t attend a graduate program make a median salary of $51,000 per year, which is a little below the median salary for graduates from all other majors combined. The starting median pay for biology majors was $40,600 per year for those who specialized in microbiology, according to PayScale.com's 2010-2011 College Salary Report, while for those with a molecular biology focus it was $40,200 annually. Biology majors working as high school teachers made salaries between $36,801 and $48,519 annually.","Starting median pay for biology majors was $40,600 per year for those who specialized in microbiology. Biology majors working as high school teachers made salaries between $36,801 and $48,519 annually.",0.809996
1,what are monocytes,"Monocytes are a type of leukocyte or white blood cell which play a role in immune system function. Monocytes are a type of white blood cell, part of the human body 's immune system. Monocytes are a type of white blood cell that fights off bacteria, viruses and fungi.","Monocytes are a type of white blood cell that fights off bacteria, viruses and fungi. Monocytes are part of the human body 's immune system.",0.846941
2,what is slime,Slime is a substance that is slippery and sticky and commonly used as a toy. Slime is a unique play material composed of a cross-linked polymer. Slime is a (usually) green semi-viscous substance that has been synonymous with Nickelodeon since its introduction on You Can't Do That On Television.,Slime is a unique play material composed of a cross-linked polymer. It has been synonymous with Nickelodeon since its introduction on You Can't Do That On Television.,0.646297
3,temperature of neptune in fahrenheit,"The average temperature of Neptune is -200 degrees Celsius (C), which is -328 degrees Fahrenheit (F). However, Neptune's temperature can dip down to -218 degrees Celsius, wh … ich is -360 degrees Fahrenheit. The average temperature on Neptune is about minus 200 degrees Celsius (minus 392 degrees Fahrenheit).","The average temperature of Neptune is -200 degrees Celsius (C), which is -328 degrees Fahrenheit (F) However, Neptune's temperature can dip down to -218 degrees Celsius, wh … ich is -360 degrees Fahrenheit.",0.765028
4,gayla name meaning,In English the meaning of the name Gayla is: Festive party. In American the meaning of the name Gayla is: Festive party. American Meaning: The name Gayla is an American baby name.,Gayla is an American baby name. In English the meaning of the name Gayla is: Festive party.,0.901657


### Input your own query

In [96]:
my_query = clean_query(input("My Question: "))
my_query_embedding = embedder.encode([my_query], convert_to_tensor=True)
my_hits = util.semantic_search(my_query_embedding, passage_embeddings, top_k=1)

my_preview = []

for result in my_hits[0]: # Most relevant passage
    corpus_id = result["corpus_id"]
    passage = " ".join(passages[corpus_id])

    # Extract the 3 most relevant sentences
    best_sentences = extract_relevant_sentence(my_query, passage, top_k=3)

    # Summarize the best sentences
    summary = summarize_text(best_sentences, max_length=50, min_length=10)

    my_preview.append({
        "query": my_query,
        "best_sentences": best_sentences,
        "summary": summary,
        "query_passage_similarity_score": result["score"]
    })

My Question: How to be a rainbow?


In [120]:
df_my_preview = pd.DataFrame(my_preview)
df_my_preview_styled = df_my_preview.style.set_properties(**{'text-align': 'left'})

df_my_preview_styled

Unnamed: 0,query,best_sentences,summary,query_passage_similarity_score
0,how to be a rainbow,"Try adding brown and white to the yellow to get a gold kind of colour. Add a tiny amount of ultramarine blue to darken the hue while maintaining a light overall orange gold tone. Begin with a 2-to-1-to-1 ratio of yellow ochre, raw sienna and carmine.",Add a tiny amount of ultramarine blue to darken the hue while maintaining a light overall orange gold tone. Try adding brown and white to the yellow to get a gold kind of colour.,0.316554
