In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the 20 Newsgroups posts. The `remove` option strips newsgroup headers,
# signature-like footers and quoted reply lines, so similarity is driven by
# the body text of each post rather than by metadata.
dataset = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"),
)

# Work on the first 1000 posts only, to keep the notebook fast.
documents = dataset.data[:1000]

print("Total documents:", len(documents))


Total documents: 1000


In [3]:
# Learn the vocabulary and IDF weights from the corpus (English stop words
# are dropped) and encode every document as one row of a TF-IDF matrix.
# `vectorizer` stays fitted so later cells can project new text into the
# same feature space.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(documents)

# Shape is (number of documents, vocabulary size).
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (1000, 24138)


In [4]:
def get_similar_articles(query, top_n=3):
    """Print the corpus documents most similar to ``query``.

    The query text is vectorized with the module-level, already fitted
    ``vectorizer`` and scored against every row of ``tfidf_matrix`` using
    cosine similarity. Matches are printed from most to least similar, each
    truncated to its first 500 characters (as is the echoed query).

    Parameters
    ----------
    query : str
        Raw text to search with.
    top_n : int, default 3
        Maximum number of results to print. Zero or a negative value prints
        no results.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    Documents with a similarity of exactly zero are skipped, so fewer than
    ``top_n`` results (or a "no similar articles" message) may be printed.
    When ``query`` is itself taken from ``documents`` it will normally come
    back as the first result, since it matches itself.
    """
    query_vector = vectorizer.transform([query])
    # cosine_similarity returns one row per query; there is a single query,
    # so keep just that row of per-document scores.
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Clamp before slicing: a `[-top_n:]` slice with top_n == 0 is `[-0:]`,
    # which selects *every* document, and a negative top_n would discard the
    # best matches instead of limiting the count.
    limit = max(top_n, 0)
    ranked_indices = similarity_scores.argsort()[::-1][:limit]

    # TF-IDF weights are non-negative, so a score of zero means the document
    # shares no vocabulary term with the query; showing it as "similar"
    # would be misleading (e.g. for an empty or out-of-vocabulary query).
    top_indices = [
        index for index in ranked_indices if similarity_scores[index] > 0
    ]

    print("\nInput Article:\n")
    print(query[:500])

    print("\nTop Similar Articles:\n")
    if not top_indices:
        print("No similar articles found.")
        return

    for i, index in enumerate(top_indices):
        print(f"\nResult {i+1}:")
        print(documents[index][:500])



In [5]:
# Sanity check: use a post that is already in the corpus as the query.
# Because the query is one of the indexed documents, it is expected to come
# back as its own top match.
sample_query = documents[10]
get_similar_articles(query=sample_query)



Input Article:

I have a line on a Ducati 900GTS 1978 model with 17k on the clock.  Runs
very well, paint is the bronze/brown/orange faded out, leaks a bit of oil
and pops out of 1st with hard accel.  The shop will fix trans and oil 
leak.  They sold the bike to the 1 and only owner.  They want $3495, and
I am thinking more like $3K.  Any opinions out there?  Please email me.
Thanks.  It would be a nice stable mate to the Beemer.  Then I'll get
a jap bike and call myself Axis Motors!

-- 
----------------------

Top Similar Articles:


Result 1:
I have a line on a Ducati 900GTS 1978 model with 17k on the clock.  Runs
very well, paint is the bronze/brown/orange faded out, leaks a bit of oil
and pops out of 1st with hard accel.  The shop will fix trans and oil 
leak.  They sold the bike to the 1 and only owner.  They want $3495, and
I am thinking more like $3K.  Any opinions out there?  Please email me.
Thanks.  It would be a nice stable mate to the Beemer.  Then I'll get
a jap bike and

In [6]:
# Interactive lookup: paste any article text and list the closest posts.
# NOTE: `input()` blocks until text is entered, so this cell needs an
# interactive kernel and will stall an unattended "Run All".
custom_query = input("Enter a news article: ")

# A blank entry has no terms to match on, so the search would only surface
# arbitrary documents; skip it rather than print misleading results.
if custom_query.strip():
    get_similar_articles(custom_query)
else:
    print("No article text entered; skipping similarity search.")


Enter a news article:  custom_query = input("Enter a news article: ") get_similar_articles(custom_query)



Input Article:

custom_query = input("Enter a news article: ") get_similar_articles(custom_query)

Top Similar Articles:


Result 1:

 There was an article on clari.news.religion in the last few days about a
Polish tribunal decision. It said that crucifixes and religious classes in
public schools were okay; and that children who did not want to take religion
class could not be forced to take an ethics class as a substitute.

Result 2:
Hello,
     I am looking to add voice input capability to a user interface I am
developing on an HP730 (UNIX) workstation.  I would greatly appreciate 
information anyone would care to offer about voice input systems that are 
easily accessible from the UNIX environment. 

     The names or adresses of applicable vendors, as well as any 
experiences you have had with specific systems, would be very helpful.

     Please respond via email; I will post a summary if there is 
sufficient interest.



Result 3:
Some weeks ago, someone posted an article tellin