In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
data = {
    "id": [1, 2, 3, 4, 5],
    "title": [
        "Stock market rises",
        "New smartphone launched",
        "Football team wins final",
        "Inflation eases slightly",
        "AI tool boosts productivity"
    ],
    "content": [
        "Stocks climbed today as investors reacted to earnings and lower inflation.",
        "The company launched a new phone featuring an upgraded camera and faster chip.",
        "The team won the championship after a dramatic penalty shootout.",
        "Consumer prices rose at a slower pace this month, raising hopes of rate cuts.",
        "A new AI assistant helps automate emails, summaries, and scheduling for teams."
    ]
}

df = pd.DataFrame(data)
df

Unnamed: 0,id,title,content
0,1,Stock market rises,Stocks climbed today as investors reacted to e...
1,2,New smartphone launched,The company launched a new phone featuring an ...
2,3,Football team wins final,The team won the championship after a dramatic...
3,4,Inflation eases slightly,Consumer prices rose at a slower pace this mon...
4,5,AI tool boosts productivity,"A new AI assistant helps automate emails, summ..."


In [4]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"\s+", " ", text)
    return text

df["combined"] = (df["title"] + " " + df["content"]).apply(clean_text)
df[["id", "combined"]].head()

Unnamed: 0,id,combined
0,1,stock market rises stocks climbed today as inv...
1,2,new smartphone launched the company launched a...
2,3,football team wins final the team won the cham...
3,4,inflation eases slightly consumer prices rose ...
4,5,ai tool boosts productivity a new ai assistant...


In [5]:
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df["combined"])

tfidf_matrix.shape

(5, 53)

In [6]:
def get_top_similar_articles(query, df, vectorizer, tfidf_matrix, top_k=3):
    query_clean = clean_text(query)
    query_vec = vectorizer.transform([query_clean])
    
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = similarities.argsort()[::-1][:top_k]
    
    results = df.iloc[top_indices].copy()
    results["similarity"] = similarities[top_indices]
    
    return results[["title", "similarity", "content"]]

In [7]:
query_article = """
Stocks rose sharply today after major companies reported strong quarterly earnings.
Investors are optimistic that inflation is slowing down.
"""

results = get_top_similar_articles(query_article, df, vectorizer, tfidf_matrix)
results

Unnamed: 0,title,similarity,content
0,Stock market rises,0.599495,Stocks climbed today as investors reacted to e...
3,Inflation eases slightly,0.195256,Consumer prices rose at a slower pace this mon...
4,AI tool boosts productivity,0.0,"A new AI assistant helps automate emails, summ..."


In [8]:
for i, row in results.iterrows():
    print(f"Top {i}")
    print("Title:", row["title"])
    print("Similarity:", round(row["similarity"], 3))
    print("Content:", row["content"])
    print("-" * 50)

Top 0
Title: Stock market rises
Similarity: 0.599
Content: Stocks climbed today as investors reacted to earnings and lower inflation.
--------------------------------------------------
Top 3
Title: Inflation eases slightly
Similarity: 0.195
Content: Consumer prices rose at a slower pace this month, raising hopes of rate cuts.
--------------------------------------------------
Top 4
Title: AI tool boosts productivity
Similarity: 0.0
Content: A new AI assistant helps automate emails, summaries, and scheduling for teams.
--------------------------------------------------
