# Semantic Search Index

In [1]:
# !pip install datasets
# !pip install sentence-transformers

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import torch
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

* Fetching the Multi-News dataset and using its full test split (5,622 articles) for this model

In [11]:
# Load the test split of the "multi_news" dataset through the `datasets` library.
dataset = load_dataset("multi_news", split='test')
# Convert it to a pandas DataFrame so rows can be looked up by position later on.
df = dataset.to_pandas()

In [16]:
df.shape

(5622, 2)

* A standard Transformer produces a vector representation for each token of the input.
* A Sentence Transformer (SBERT) produces a single vector representation for the entire sentence.
* Sentence transformers are used in search engines to match queries with relevant documents, enabling semantic search that goes beyond simple keyword matching.
* Their embeddings are designed for comparing the similarity of sentences.

In [17]:
# Load the pretrained "all-MiniLM-L6-v2" sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Move the model to the selected device (GPU when available, otherwise CPU).
model = model.to(device)

In [18]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

### Generate Embeddings
* Here, we encode the article summaries into embeddings, transforming the textual information into a numerical format that's easier to analyze.

In [19]:
# Encode every summary in a single call. `model.encode` returns one 2-D NumPy
# array (one row per summary). Keeping it as an array, rather than wrapping it
# in `list(...)`, lets `util.cos_sim` convert it to a tensor in one step instead
# of building a tensor from thousands of separate arrays on every query, which
# is slow and triggers a PyTorch warning. Indexing (`passage_embeddings[0]`)
# and `len(passage_embeddings)` behave the same as with the list.
passage_embeddings = model.encode(df["summary"].to_list(), show_progress_bar=True)

Batches:   0%|          | 0/176 [00:00<?, ?it/s]

In [20]:
passage_embeddings[0].shape

(384,)

In [21]:
len(passage_embeddings)

5622

In [None]:
query = "Find me some articles about technology and Artificial Intelligence"

In [None]:
# Embed the query with the same model that produced the passage embeddings,
# so the two can be compared directly.
query_embedding = model.encode(query)

In [None]:
query_embedding.shape

In [None]:
# Cosine similarity between the query embedding and every passage embedding.
similarities = util.cos_sim(query_embedding, passage_embeddings)

In [None]:
# Pick the 3 highest similarity scores; the result carries both the scores and
# their positions (the positions are extracted via `.indices` in the next cell).
top_index = torch.topk(similarities.flatten(), k=3)
top_index

In [None]:
# Derive the indices straight from `similarities` so this cell is idempotent.
# Reassigning `top_index = top_index.indices` fails with AttributeError on a
# second run, because by then `top_index` is already a plain tensor of indices.
top_index = torch.topk(similarities.flatten(), k=3).indices
top_index

In [None]:
# Build a short preview of each top-ranked summary: its first 200 characters
# followed by an ellipsis.
top_relavent_passages = [
    df.iloc[row_pos]["summary"][:200] + "..." for row_pos in top_index.tolist()
]
top_relavent_passages

In [22]:
def find_relavent_news(query: str, top_k: int = 3, max_chars: int = 200):
    """Return preview snippets of the summaries most similar to ``query``.

    The query is embedded with the notebook-level ``model`` and compared, by
    cosine similarity, against the precomputed ``passage_embeddings``. The
    best-scoring positions are then looked up in ``df["summary"]``.

    Args:
        query: Free-text search query.
        top_k: Maximum number of passages to return (default 3). The value is
            clamped to the number of indexed passages, because ``torch.topk``
            raises when asked for more elements than exist.
        max_chars: Number of leading characters kept from each summary
            (default 200).

    Returns:
        A list of at most ``top_k`` strings, ordered from most to least
        similar. Each string is the first ``max_chars`` characters of a
        summary followed by ``"..."``. The list is empty when ``top_k`` is not
        positive or when no passages are indexed.
    """
    # Never request more results than there are passages to rank.
    k = min(top_k, len(passage_embeddings))
    if k <= 0:
        return []

    # Encode the query using the same model as the passages.
    query_embedding = model.encode(query)

    # Cosine similarity between the query and every passage embedding.
    similarities = util.cos_sim(query_embedding, passage_embeddings)

    # Positions of the k most similar passages, best match first.
    top_index = torch.topk(similarities.flatten(), k=k).indices

    # Truncate each matching summary to `max_chars` characters and mark the cut.
    return [
        df.iloc[row_pos]["summary"][:max_chars] + "..."
        for row_pos in top_index.tolist()
    ]

In [23]:
find_relavent_news("Natural Disasters")

['– Harvey is getting its proper attention in the US, but another devastating flood is unfolding in a different part of the world. Monsoon rains have triggered flooding and mudslides that have left more...',
 '– The tsunami that killed hundreds, possibly thousands of people after an earthquake in Indonesia on Friday was much bigger and more devastating than would normally be expected after that kind of quak...',
 '– A rare outbreak of winter tornadoes has killed at least seven people in Missouri and Arkansas and left a trail of destruction across the South and Midwest. Three people were killed by a tornado in a...']

In [None]:
find_relavent_news("Law enforcement and police")

In [None]:
find_relavent_news("Anime News")

In [None]:
find_relavent_news("Politics, diplomacy and nationalism in india")

In [None]:
# Interactive demo: `input()` waits for the user to type a query, so this cell
# will pause a "Run All" until something is entered.
query = input("Enter the query or topic you need to know the news: ")
# Show the best-matching summary previews for the typed query.
find_relavent_news(query)