<a href="https://colab.research.google.com/github/SulemanShahani/Similarity-Search-on-Reuters-news-Data-Set-from-Hugging-Face/blob/main/Similarity_Search_on_Reuters_news_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM

import faiss

# Load the Reuters news dataset from the Hugging Face hub
from langchain.document_loaders import HuggingFaceDatasetLoader

loader = HuggingFaceDatasetLoader("reuters21578", page_content_column="text", name="ModLewis")
data = loader.load()

# Splitting the data into chunks (only using top 500 articles for simplicity)
top_500_articles = data[:500]

# Initialize the RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=0)

# Split documents into chunks
docs = splitter.split_documents(top_500_articles)
text_list = [doc.page_content for doc in docs]

# Load the tokenizer and embedding model
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")

# Load the tokenizer and generation model
generation_tokenizer = AutoTokenizer.from_pretrained("t5-small")
generation_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Function to create embeddings
def create_embeddings(texts):
    embeddings = []
    for text in texts:
        tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**tokens)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())
    return np.vstack(embeddings)

# Create embeddings for documents
document_embeddings = create_embeddings(text_list)

# Create FAISS index
index = faiss.IndexFlatL2(document_embeddings.shape[1])
index.add(document_embeddings)

# Function to generate query embedding
def generate_query_embedding(query):
    tokens = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to retrieve documents
def retrieve_documents(query_embedding, k=5):
    distances, indices = index.search(query_embedding, k)
    return [text_list[i] for i in indices[0]]

# Function to generate response
def generate_response(query, retrieved_docs):
    context = " ".join(retrieved_docs)
    input_text = f"question: {query} context: {context}"
    input_ids = generation_tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    outputs = generation_model.generate(input_ids, max_length=100)
    return generation_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Interactive function
def interact():
    while True:
        query = input("Enter your question (or type 'quit' to exit): ")
        if query.lower() == "quit":
            break
        query_embedding = generate_query_embedding(query)
        retrieved_docs = retrieve_documents(query_embedding)
        response = generate_response(query, retrieved_docs)
        print(f"Response: {response}")

# Run interaction
interact()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

The repository for reuters21578 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/reuters21578.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/8.15M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/6188 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/13625 [00:00<?, ? examples/s]

Generating unused split:   0%|          | 0/722 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Enter your question (or type 'quit' to exit): 
Response: Neste Oy, Celanese CanadanInc, Hoechst Celanese Corporation and in the March 31 second quarter
Enter your question (or type 'quit' to exit): japnese
Response: Japan to the General Agreement onnTariffs and Trade last year
Enter your question (or type 'quit' to exit): Ukraine
Response: Ukraine
Enter your question (or type 'quit' to exit): trade
Response: imports and settle outstanding trade issues
Enter your question (or type 'quit' to exit): Japan
Response: Japan has until the next meeting of senior officials fromnthe two countries in May or June to come up with a Japan's electric powernin the fiscal year ended March 31, supplying an estimated 27npct on a kilowatt/hour basis
Enter your question (or type 'quit' to exit): 
Response: Neste Oy, Celanese CanadanInc, Hoechst Celanese Corporation and in the March 31 second quarter
Enter your question (or type 'quit' to exit): Holand gdp
Response: The report said Thailand benefited from t

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics.pairwise import cosine_similarity

# Function to evaluate system using quantitative metrics
def evaluate_system(test_queries, expected_responses):
    bleu_scores = []
    cosine_similarities = []

    for query, expected_response in zip(test_queries, expected_responses):
        query_embedding = generate_query_embedding(query)
        retrieved_docs = retrieve_documents(query_embedding)
        generated_response = generate_response(query, retrieved_docs)

        # Calculate BLEU score
        bleu_score = sentence_bleu([expected_response.split()], generated_response.split(), smoothing_function=SmoothingFunction().method1)
        bleu_scores.append(bleu_score)

        # Calculate cosine similarity between expected and generated responses
        expected_embedding = generate_query_embedding(expected_response)
        generated_embedding = generate_query_embedding(generated_response)
        cosine_sim = cosine_similarity(expected_embedding.reshape(1, -1), generated_embedding.reshape(1, -1))[0][0]
        cosine_similarities.append(cosine_sim)

        # Print the results for inspection
        print(f"Query: {query}")
        print(f"Generated Response: {generated_response}")
        print(f"Expected Response: {expected_response}")
        print(f"BLEU Score: {bleu_score}")
        print(f"Cosine Similarity: {cosine_sim}")
        print("=" * 50)

    # Print average scores
    print(f"Average BLEU Score: {np.mean(bleu_scores)}")
    print(f"Average Cosine Similarity: {np.mean(cosine_similarities)}")

# Test queries and expected responses for evaluation
test_queries = [
    "What are the latest developments in technology?",
    "How has the stock market performed this week?",
    "What are the implications of climate change on agriculture?",
    "Explain the impact of recent geopolitical events on global trade.",
    "How has the COVID-19 pandemic affected the travel industry?"
]

expected_responses = [
    "Latest developments in technology include advancements in AI and robotics.",
    "The stock market has shown fluctuations this week due to economic uncertainty.",
    "Climate change poses significant challenges for agriculture, affecting crop yields and water availability.",
    "Recent geopolitical events have led to fluctuations in global trade, impacting economies worldwide.",
    "The travel industry has been severely affected by the COVID-19 pandemic, leading to travel restrictions and reduced tourism."
]

# Evaluate the system
evaluate_system(test_queries, expected_responses)

# Run interaction
interact()

Query: What are the latest developments in technology?
Generated Response: television and audio equipment and computers
Expected Response: Latest developments in technology include advancements in AI and robotics.
BLEU Score: 0.020960166113993737
Cosine Similarity: 0.5676347017288208
Query: How has the stock market performed this week?
Generated Response: rose sharply
Expected Response: The stock market has shown fluctuations this week due to economic uncertainty.
BLEU Score: 0
Cosine Similarity: 0.5019999742507935
Query: What are the implications of climate change on agriculture?
Generated Response: a greater role innstimulating economic growth
Expected Response: Climate change poses significant challenges for agriculture, affecting crop yields and water availability.
BLEU Score: 0
Cosine Similarity: 0.5311249494552612
Query: Explain the impact of recent geopolitical events on global trade.
Generated Response: trade dispute between the United States and Japan
Expected Response: Recent

In [2]:
 #!pip install faiss-cpu
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.2.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.21.3-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensio

In [6]:

!pip install langchain
!pip install datasets

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.10-py3-none-any.whl (332 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m332.8/332.8 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.83-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.5/127.5 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.10->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting orjson<4.0.0,>=3.9.14 (from lang