In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sentence_transformers import SentenceTransformer

In [3]:
from sklearn.datasets import fetch_20newsgroups

newsgroup_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42)


In [4]:
df = pd.DataFrame({'text':newsgroup_train.data,'category': newsgroup_train.target})

In [5]:
df.head()

Unnamed: 0,text,category
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


In [6]:
df.shape

(11314, 2)

In [7]:
len(newsgroup_train.target_names)

20

In [8]:
print(f"TEXT: {df['text'][0]}\nCategory:\n\t{newsgroup_train.target_names[df['category'][0]]}")

TEXT: From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





Category:
	rec.autos


In [9]:
model = SentenceTransformer("BAAI/bge-base-en-v1.5")
embedding_vector = joblib.load('embeddings.joblib')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
def preprocess_text(text):
  text = text.strip()
  return text



def cosine_similar(v1, array_of_vector):
  v1 = np.array(v1)
  similarities = []

  if len(array_of_vector.shape) == 1:
    array_of_vector = [array_of_vector]

  for v2 in array_of_vector:
    v2 = np.array(v2)
    similarities.append(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

  return similarities




def top_k_greatest_indices(lst, k):
  indexed_list = list(enumerate(lst))
  sorted_by_value = sorted(indexed_list, key=lambda x: x[1], reverse=True)
  top_k_index = [index for index, value in sorted_by_value[:k]]
  return top_k_index




In [12]:
def retrieve_documents(query, embedings, model, top_k):
  query_clean = preprocess_text(query)
  query_embeding = model.encode(query_clean, convert_to_tensor=True)

  cosine_score = []
  for x in embedings:
    cosine_score.append(cosine_similar(query_embeding, x))
  top_result = top_k_greatest_indices(cosine_score, top_k)

  print(f"Query : {query}")
  for x in top_result:
    print(f"Document: {df.iloc[x]['text'][:200]}...")
    print(f"Category: {newsgroup_train.target_names[df.iloc[x]['category']]}...")
    print("\n\n")


example_query = "space exploration"
retrieve_documents(example_query, embedding_vector, model, 5)

Query : space exploration
Document: From: u1452@penelope.sdsc.edu (Jeff Bytof - SIO)
Subject: End of the Space Age?
Organization: San Diego Supercomputer Center @ UCSD
Lines: 16
Distribution: world
NNTP-Posting-Host: penelope.sdsc.edu

...
Category: sci.space...



Document: From: dennisn@ecs.comm.mot.com (Dennis Newkirk)
Subject: Space class for teachers near Chicago
Organization: Motorola
Distribution: usa
Nntp-Posting-Host: 145.1.146.43
Lines: 59

I am posting this for...
Category: sci.space...



Document: From: Wales.Larrison@ofa123.fidonet.org
Subject: Commercial Space News #22
X-Sender: newtout 0.08 Feb 23 1993
Lines: 666

COMMERCIAL SPACE NEWS/SPACE TECHNOLOGY INVESTOR NUMBER 22

   This is number t...
Category: sci.space...



Document: Subject: Space FAQ 01/15 - Introduction
From: leech@cs.unc.edu (Jon Leech)
Expires: 6 May 1993 19:53:44 GMT
Organization: University of North Carolina, Chapel Hill
Keywords: Frequently Asked Questions...
Category: sci.space...



Document: From

In [13]:
def precision(tp, tn, fp, fn):
  if tp<0 or tn<0 or fp<0 or fn<0:
    raise ValueError("All input values must be non-negative")

  if tp + fp == 0:
    return 0.0
  return tp / (tp + fp)


def recall(tp, tn, fp, fn):
  if tp<0 or tn<0 or fp<0 or fn<0:
    raise ValueError("All input values must be non-negative")

  if tp + fn == 0:
    return 0.0

  return tp / (tp + fn)


In [14]:
test_queries = [
    {"query": "advancements in space exploration technology", "desired_category": "sci.space"},
    {"query": "real-time rendering techniques in computer graphics", "desired_category": "comp.graphics"},
    {"query": "latest findings in cardiovascular medical research", "desired_category": "sci.med"},
    {"query": "NHL playoffs and team performance statistics", "desired_category": "rec.sport.hockey"},
    {"query": "impacts of cryptography in online security", "desired_category": "sci.crypt"},
    {"query": "the role of electronics in modern computing devices", "desired_category": "sci.electronics"},
    {"query": "motorcycles maintenance tips for enthusiasts", "desired_category": "rec.motorcycles"},
    {"query": "high-performance baseball tactics for championships", "desired_category": "rec.sport.baseball"},
    {"query": "historical influence of politics on society", "desired_category": "talk.politics.misc"},
    {"query": "latest technology trends in the Windows operating system", "desired_category": "comp.os.ms-windows.misc"}

]

In [15]:
def compute_metrics(queries, embeddings, model, top_k=5):
  result = []

  for item in queries:
    query = item['query']
    desired_category = item['desired_category']


    query_clean = preprocess_text(query)
    query_embedding = model.encode(query_clean, convert_to_tensor=True)

    cosine_score = []

    for x in embeddings:
      cosine_score.append(cosine_similar(query_embedding, x))

    top_results = top_k_greatest_indices(cosine_score, top_k)

    retrieved_categories = [
            newsgroup_train.target_names[df.iloc[idx]["category"]] for idx in top_results
    ]

    true_positive = sum(1 for cat in retrieved_categories if cat == desired_category)
    false_positive = top_k - true_positive

    false_negatives = sum(
            newsgroup_train.target_names[df.iloc[idx]["category"]] == desired_category
            for idx in top_results
        ) - true_positive

    true_negative = 0

    p = precision(true_positive, true_negative, false_positive, false_negatives)
    r = recall(true_positive, true_negative, false_positive, false_negatives)

    result.append({
        "query": query,
        "precision": p,
        "recall": r
    })

  return result


In [16]:
result = compute_metrics(test_queries, embedding_vector, model, top_k=5)
print("Results: ")
for result in result:
  print(f"Query: {result['query']}")
  print(f"Precision: {result['precision']}")
  print(f"Recall: {result['recall']}")
  print("\n")

Results: 
Query: advancements in space exploration technology
Precision: 1.0
Recall: 1.0


Query: real-time rendering techniques in computer graphics
Precision: 1.0
Recall: 1.0


Query: latest findings in cardiovascular medical research
Precision: 1.0
Recall: 1.0


Query: NHL playoffs and team performance statistics
Precision: 1.0
Recall: 1.0


Query: impacts of cryptography in online security
Precision: 1.0
Recall: 1.0


Query: the role of electronics in modern computing devices
Precision: 1.0
Recall: 1.0


Query: motorcycles maintenance tips for enthusiasts
Precision: 1.0
Recall: 1.0


Query: high-performance baseball tactics for championships
Precision: 1.0
Recall: 1.0


Query: historical influence of politics on society
Precision: 0.4
Recall: 1.0


Query: latest technology trends in the Windows operating system
Precision: 0.8
Recall: 1.0


