In [1]:
# TF-IDF + BERT EMBEDDINGS


In [2]:
!pip install sentence-transformers




In [3]:
# IMPORT LIBRARIES

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore")

In [4]:
# LOAD DATASET

# Sections of each post that are passed to `remove=` when fetching the corpus.
stripped_sections = ('headers', 'footers', 'quotes')
dataset = fetch_20newsgroups(remove=stripped_sections)

# `documents` holds the raw text entries used by every later cell.
documents = dataset.data

document_count = len(documents)
print("Total Documents Loaded:", document_count)


Total Documents Loaded: 11314


In [5]:
# PART 1 — TF-IDF BASED SIMILARITY

print("\nBuilding TF-IDF Model...")

# Vocabulary filters: English stop words are dropped, and document-frequency
# bounds (max_df / min_df) discard terms that are too common or too rare.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.9,
    min_df=5,
)

# Fit on the corpus and vectorise it in one pass; the resulting matrix is
# what find_similar_tfidf compares rows of.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

def find_similar_tfidf(article_index, top_n=3):
    """Return the indices of the articles most similar to one article under TF-IDF.

    Similarity is the cosine similarity between row ``article_index`` of the
    module-level ``tfidf_matrix`` and every row of that matrix.

    Args:
        article_index: Row of ``tfidf_matrix`` to use as the query.
        top_n: Maximum number of neighbours to return.

    Returns:
        A NumPy array of row indices ordered from most to least similar. The
        query row is never included, so at most ``n_rows - 1`` indices are
        returned; the array is empty when ``top_n`` is zero or negative.
    """
    cosine_sim = cosine_similarity(tfidf_matrix[article_index], tfidf_matrix).flatten()

    # Mask the query with -inf rather than 0: other rows can legitimately score
    # 0 (no shared terms), and a 0 sentinel would let the query tie with them
    # and leak back into the results.
    cosine_sim[article_index] = -np.inf

    # Clamp the request. Without this, top_n=0 would slice [-0:], which is the
    # whole array, and an oversized top_n would reach the masked query row.
    top_n = max(0, min(top_n, cosine_sim.size - 1))

    # Descending order; the masked query row is guaranteed to sort last.
    ranked = cosine_sim.argsort()[::-1]
    return ranked[:top_n]


Building TF-IDF Model...


In [6]:
# PART 2 — BERT EMBEDDING BASED SIMILARITY

print("Loading BERT Model... (may take 1-2 minutes first time)")

# Pretrained sentence-embedding model, looked up by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every document once up front; find_similar_bert then only has to
# compare the stored vectors.
bert_embeddings = model.encode(
    documents,
    show_progress_bar=True,
)

def find_similar_bert(article_index, top_n=3):
    """Return the indices of the articles most similar to one article by embedding.

    Similarity is the cosine similarity between entry ``article_index`` of the
    module-level ``bert_embeddings`` and every entry of ``bert_embeddings``.

    Args:
        article_index: Position in ``bert_embeddings`` to use as the query.
        top_n: Maximum number of neighbours to return.

    Returns:
        A NumPy array of indices ordered from most to least similar. The query
        itself is never included, so at most ``n_rows - 1`` indices are
        returned; the array is empty when ``top_n`` is zero or negative.
    """
    cosine_sim = cosine_similarity(
        [bert_embeddings[article_index]],
        bert_embeddings
    ).flatten()

    # Mask the query with -inf rather than 0: cosine similarity between dense
    # embeddings can be negative, so a 0 sentinel would rank the query above
    # every negatively-scored article.
    cosine_sim[article_index] = -np.inf

    # Clamp the request. Without this, top_n=0 would slice [-0:], which is the
    # whole array, and an oversized top_n would reach the masked query entry.
    top_n = max(0, min(top_n, cosine_sim.size - 1))

    # Descending order; the masked query entry is guaranteed to sort last.
    ranked = cosine_sim.argsort()[::-1]
    return ranked[:top_n]

Loading BERT Model... (may take 1-2 minutes first time)




Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Batches:   0%|          | 0/354 [00:00<?, ?it/s]

In [7]:

# TEST EXAMPLE

index = 10   # You can change this

# Banner followed by the first 500 characters of the query article.
print(
    "\n===================================================",
    "INPUT ARTICLE:",
    "===================================================\n",
    documents[index][:500],
    sep="\n",
)

# -------------------------
# TF-IDF Results
# -------------------------

tfidf_results = find_similar_tfidf(index)

print(
    "\n\n==============================",
    "TOP 3 Similar Articles (TF-IDF)",
    "==============================",
    sep="\n",
)

# One header plus a 400-character preview per retrieved article.
for match_idx in tfidf_results:
    print(
        f"\n--- Article Index {match_idx} ---\n",
        documents[match_idx][:400],
        sep="\n",
    )


INPUT ARTICLE:

I have a line on a Ducati 900GTS 1978 model with 17k on the clock.  Runs
very well, paint is the bronze/brown/orange faded out, leaks a bit of oil
and pops out of 1st with hard accel.  The shop will fix trans and oil 
leak.  They sold the bike to the 1 and only owner.  They want $3495, and
I am thinking more like $3K.  Any opinions out there?  Please email me.
Thanks.  It would be a nice stable mate to the Beemer.  Then I'll get
a jap bike and call myself Axis Motors!

-- 
----------------------


TOP 3 Similar Articles (TF-IDF)

--- Article Index 3543 ---


Now you know why I am just a DOD member.  I like bikes and clubs but
the politics and other b*llsh*t is a real turn-off.
-- 
-----------------------------------------------------------------------
"Tuba" (Irwin)      "I honk therefore I am"     CompuTrac-Richardson,Tx
irwin@cmptrc.lonestar.org    DoD #0826          (R75/6)

--- Article Index 9171 ---


More like those who use their backs instead of their minds to m

In [8]:
# BERT Results

# Reuses `index`, the query article chosen in the TF-IDF example cell.
bert_results = find_similar_bert(index)

print(
    "\n\n==============================",
    "TOP 3 Similar Articles (BERT)",
    "==============================",
    sep="\n",
)

# One header plus a 400-character preview per retrieved article.
for match_idx in bert_results:
    print(
        f"\n--- Article Index {match_idx} ---\n",
        documents[match_idx][:400],
        sep="\n",
    )




TOP 3 Similar Articles (BERT)

--- Article Index 10085 ---

I spoke to a sales dweeb in 3X, a Ducati dealer here in Blighty, and he had
nothing good to say about them... it appears they are waaaay underpowered,
(basically, it's the 750/900 with a 400cc engine), and there have been some
quality problems (rusty _frame_ !!).  Save your pennies... buy the 900 :)


--- Article Index 9090 ---

FOR SALE (RELUCTANTLY)
                  ---- Classic Bike -----
                 1972 YAMAHA XS-2 650 TWIN
 
<6000 Original miles. Always stored inside. 1979 front end with
aftermarket tapered steering head bearings. Racer's supply rear
bronze swingarm bushings, Tsubaki chain, Pirrhana 1/4 fairing
with headlight cutout, one-up Carrera racing seat, superbike bars,
velo stacks on twin carbs. Also h

--- Article Index 388 ---

Hi boys and girls.  I just bought a Beemer R80GS and realized abruptly that 
I am a grad student.  I first sold my truck yesterday but I need to sell my 
Zephyr too.

If I can se