In [None]:
# Installing the necessary libraries
!pip install datasets==2.14.0
!pip install torch[cpu]
!pip install sentence-transformers==2.2.2

Collecting datasets==2.14.0
  Downloading datasets-2.14.0-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.0)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets==2.14.0)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets==2.14.0)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determin

### 📚 Step 1: Import Libraries

First, let's import the necessary libraries that will empower us to manipulate datasets, process text, and perform mathematical operations.

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import torch
import os

### 🗂️ Step 2: Load the Dataset

We load the multi_news dataset, focusing on the 'test' split to efficiently manage our resources.

In [None]:
# Fetch the "multi_news" dataset and keep only its "test" split.
dataset = load_dataset("multi_news", split="test")
# Bare last expression: the notebook renders the dataset's summary repr.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/66.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Dataset({
    features: ['document', 'summary'],
    num_rows: 5622
})

### 📊 Step 3: Data Preparation

To ensure our analysis is manageable and efficient, we'll focus on a subset of 2000 random samples from our dataset.

In [None]:
# Convert the dataset to a pandas DataFrame and draw 2000 random rows.
# random_state=42 fixes the sample so re-running the cell picks the same rows.
df = dataset.to_pandas().sample(2000, random_state=42)
# Bare last expression: the notebook renders the sampled frame.
df

Unnamed: 0,document,summary
4830,Tweet with a location \n \n You can add locati...,– Denis Finley has taken to Twitter to call Po...
1255,CNN host Piers Morgan just called to discuss h...,– CNN's Piers Morgan thinks gun-rights propone...
80,White House communications director Anthony Sc...,– New White House communications director Anth...
3044,CLOSE Scientists say they've found archaeologi...,– Scientists say they have the first physical ...
4486,Click image above to view graphic \n \n Althou...,– Scientists are calling it a breakthrough and...
...,...,...
2157,"On Thursday afternoon, President-elect Donald ...","– He who pays the piper calls the tune, and it..."
3615,Donald Trump said Sunday that in the wake of t...,– In the wake of the Orlando shooting one week...
2751,Nashua police believe body found is that of mi...,"– Sad news out of Nashua, NH, after police say..."
622,The public school systems in New York and Los ...,"– Some 640,000 kids in the nation's second-lar..."


### 🧠 Step 4: Load the Model

Next, we load a pre-trained Sentence Transformer model. This model will help us convert textual data into dense vectors (embeddings) that capture the essence of our text.

In [None]:
# Load the pretrained "all-MiniLM-L6-v2" sentence-embedding model by name.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Bare last expression: the notebook renders the model's module layout.
model

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

### 🔍 Step 5: Generate Embeddings

Here, we encode the article summaries into embeddings, transforming the textual information into a numerical format that's easier to analyze.

In [None]:
# Encode every sampled summary in one call. The result is kept as the single
# array returned by model.encode instead of being split into a Python list of
# per-row arrays: util.cos_sim has to turn its inputs into tensors on every
# search, and converting one contiguous array is far cheaper than rebuilding a
# tensor from thousands of small arrays each time.
passage_embeddings = model.encode(df['summary'].to_list(), show_progress_bar=True)
# Indexing a row still yields one embedding vector, so this check is unchanged.
passage_embeddings[0].shape

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

(384,)

### 🎯 Step 6: Define a Query

Let's specify a query for which we want to find relevant articles. This will be our target for similarity searches.

In [None]:
# Free-text search request; the next step embeds it and compares it with the
# summary embeddings.
query = "Find me some recent articles about personal finance"

### 📏 Step 7: Find Relevant Articles

To find articles that match our query, we compute the cosine similarity between the query embedding and all article embeddings, retrieving the top 3 most relevant articles.

In [None]:
# Embed the query with the same model that produced the passage embeddings.
query_embedding = model.encode(query)

# Cosine similarity between the query and every passage embedding.
similarities = util.cos_sim(query_embedding, passage_embeddings)

# Rank the flattened scores and keep the positions of the three highest.
top_indices = torch.topk(similarities.flatten(), k=3).indices

# Look each position up in df and keep a 200-character preview of its summary.
top_relevant_passages = [
    df.iloc[position]['summary'][:200] + "..."
    for position in top_indices.tolist()
]
top_relevant_passages

["– It's that time of year when financial experts are asked to dust off their crystal balls and tea leaves and offer up their predictions for the economy in 2018. At Yahoo Finance, David Nelson warns th...",
 "– Banks loaned Ted Cruz as much as $1 million during his first Senate campaign in Texas back in 2012, but you wouldn't know it from campaign finance reports. While Cruz eventually disclosed the loans ...",
 "– Bogged down by some $465 million in debt, Reader's Digest publisher RDA Holdings has filed for bankruptcy—again. Last night's Chapter 11 filing was the publisher's second in 3 ½ years, the Wall Stre..."]

### 🛠️ Utility Function: Find Relevant News

To simplify finding relevant articles for any query, we wrap the steps above in a function. It takes a query, embeds it, and returns short previews (the first 160 characters of each summary) of the 3 most relevant articles.

In [None]:
def find_relevant_news(query, top_k=3, max_chars=160):
    """Return previews of the summaries most similar to ``query``.

    The query is embedded with the module-level ``model`` and scored by cosine
    similarity against the precomputed ``passage_embeddings``; the winning
    positions are then looked up in ``df``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of previews to return (default 3). Values larger
            than the number of passages are clamped; values <= 0 yield ``[]``.
        max_chars: Number of leading summary characters kept in each preview
            (default 160).

    Returns:
        A list of at most ``top_k`` strings, most similar first. A summary
        longer than ``max_chars`` is cut and suffixed with "..."; a shorter one
        is returned whole, without the ellipsis.
    """
    # Nothing to rank against, or nothing requested.
    if top_k <= 0 or len(passage_embeddings) == 0:
        return []

    # Encode the query using the same model as the passages.
    query_embedding = model.encode(query)

    # Cosine similarity between the query and every passage, as a 1-D tensor.
    similarities = util.cos_sim(query_embedding, passage_embeddings).flatten()

    # torch.topk raises when k exceeds the number of scores, so clamp it.
    k = min(top_k, similarities.numel())
    top_indices = torch.topk(similarities, k).indices.tolist()

    summaries = df["summary"]
    previews = []
    for position in top_indices:
        text = summaries.iloc[position]
        # Only mark the preview as truncated when characters were dropped.
        previews.append(text[:max_chars] + "..." if len(text) > max_chars else text)

    return previews

### Conclusion & Next Steps

This notebook illustrates the power of NLP in extracting relevant information from large text datasets using sentence embeddings and cosine similarity. Explore further by modifying the query or adjusting the function to suit different needs.

In [None]:
# Example queries to explore: run each one through the search helper and
# print its list of previews on its own line.
for example_query in (
    "Technology and AI",
    "Law enforcement and police",
    "Politics, diplomacy and nationalism",
):
    print(find_relevant_news(example_query))

['– The "tech surge" to fix HealthCare.gov includes some names from the industry\'s biggest players. Among them, per a Health department blog post, is Michael Dick...', '– Ford has made another big bet on the future of self-driving cars, investing $1 billion in previously unknown startup Argo AI. "The next decade will be defined...', '– Are you a "digital native" or a "digital immigrant," and does it make a difference? Research recently published in the Teaching and Teacher Education journal ...']
['– The war of words between Chicago and the federal government over "sanctuary cities" policy is heating up. Attorney General Jeff Sessions slammed the city\'s le...', '– Greg Barnes was in a hurry to get home on Friday, so when he saw police lights behind him on State Road 332 in Muncie, Indiana, "immediately I knew I was in t...', '– "We are not thugs. We are professionals," says the leader of a black policing group, addressing a speech in which President Trump urged officers to not be "to

In [None]:
def clear_screen():
    """Clear the terminal by running the platform's clear command.

    Runs ``cls`` on Windows (``os.name == "nt"``) and ``clear`` everywhere
    else. The command's exit status is ignored and nothing is returned.
    """
    # NOTE(review): this shells out to the terminal; inside a notebook
    # front-end it presumably leaves the cell's output area untouched —
    # confirm, and consider IPython's clear_output for that case.
    os.system("cls" if os.name == "nt" else "clear")

In [None]:
def interactive_search():
    """Run a prompt loop that searches the news summaries for typed topics.

    Each round reads one line from standard input, passes it to
    ``find_relevant_news`` and prints the returned previews, then waits for
    Enter and clears the screen via ``clear_screen``. The loop ends when the
    user types "exit" (any letter case) or when standard input is closed.
    Blank input is rejected with a hint instead of being searched.
    """
    print("Welcome to the Semantic News Search!\n")
    try:
        while True:
            print("Type in a topic you'd like to find articles about, and I'll do the searching! \n (Type 'exit' to quit)\n> ", end="")

            query = input().strip()

            if query.lower() == "exit":
                break

            if not query:
                # An empty string would still be embedded and "matched",
                # returning arbitrary articles; ask again instead.
                print("\nPlease type a topic before pressing Enter.\n")
                continue

            print("\n\tHere are 3 articles I found based on your query: \n")

            passages = find_relevant_news(query)
            for passage in passages:
                print("\n\t" + passage)

            input("\nPress Enter to continue searching...")
            clear_screen()
    except EOFError:
        # Standard input was closed: finish like a normal exit rather than
        # surfacing a traceback.
        pass

    print("\nThanks for using the Semantic News Search! Have a great day!")

In [None]:
# Start the interactive search.
# The call keeps prompting for topics until the user types 'exit'.
interactive_search()

Welcome to the Semantic News Search!

Type in a topic you'd like to find articles about, and I'll do the searching! (Type 'exit' to quit)
> 

	Here are 3 articles I found based on your query: 


	– ObamaCare needs those wild and crazy millennials to get on board if it's going to work, and a Colorado ad campaign that previously drew scorn over ads featuri...

	– Some Baltimore residents aren't standing by idly in the mess after yesterday's violence. Volunteers took to the streets today to clean up their "Comeback City...

	– It's springtime in Vacationland, and Mainers are celebrating the return of sunshine with a dose of their acerbic wit. As the Bangor Daily News reports, MaineD...
