# Semantic Search in Articles (NLP) — End‑to‑End Pipeline
- Preprocessing articles
- Baseline keyword extraction using TF‑IDF
- Baseline search with TF‑IDF (term matching)
- Embeddings with Sentence‑BERT (semantic search)
- Speed-up with FAISS



## Setup

In [None]:
!pip install pandas numpy scikit-learn nltk sentence-transformers keybert rake-nltk
!pip install faiss-cpu

## Imports & NLTK setup

In [19]:
import os
import re
import ast
import json
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on; nltk.download skips
# packages that are already up to date, and quiet=True silences its log output.
for _resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, quiet=True)

print('Setup complete.')


Setup complete.


## Load Dataset

In [3]:

# Location of the BBC news CSV. The default is the original Colab upload path;
# set the BBC_NEWS_CSV environment variable to run the notebook elsewhere
# without editing this cell.
DATA_PATH = os.environ.get("BBC_NEWS_CSV", "/content/bbc_news_20220307_20240703.csv")

df = pd.read_csv(DATA_PATH)

# Fail fast if the file does not have the columns the rest of the notebook reads.
assert {"title", "description"} <= set(df.columns), "expected 'title' and 'description' columns"

df.head()


Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [20]:
# NOTE(review): the saved output of this cell already lists 'news_clean' and
# 'title_clean', which are only created in the Preprocessing section further down,
# so it was last executed after that section. After Restart & Run All it is
# expected to show just the columns read from the CSV.
df.columns

Index(['title', 'pubDate', 'guid', 'link', 'description', 'news_clean',
       'title_clean'],
      dtype='object')

## Preprocessing
We apply basic text normalization: lowercasing, stripping HTML tags, removing non-letters, dropping stopwords and tokens shorter than three characters, and lemmatization.

In [5]:
# Extra tokenizer resource, fetched quietly (presumably required by
# nltk.word_tokenize, which clean_text calls — confirm for the installed NLTK version).
nltk.download('punkt_tab', quiet=True)

# Module-level helpers built once and shared by every clean_text call.
_lemmatizer = WordNetLemmatizer()
_stopwords = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, replace HTML tags with a space, replace every
    character other than ``a-z`` and whitespace with a space, tokenize with
    ``nltk.word_tokenize``, drop English stopwords and tokens of one or two
    characters, and lemmatize each remaining token with the shared
    ``WordNetLemmatizer`` (default settings).

    Args:
        text: Input text. ``None`` and float NaN (the usual marker for a
            missing cell) are treated as empty; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when nothing
        survives the filters.
    """
    # Missing values must not be stringified: str(None) == "None" and
    # str(float("nan")) == "nan" both pass the filters below and would end up
    # in the corpus as real tokens. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r"<[^>]+>", " ", text)          # remove HTML tags
    text = re.sub(r"[^a-z\s]", " ", text)        # keep letters and spaces

    # tokenization
    tokens = nltk.word_tokenize(text)

    # remove stopwords and very short tokens (one or two characters)
    tokens = [t for t in tokens if t not in _stopwords and len(t) > 2]

    # lemmatization
    lemmas = [_lemmatizer.lemmatize(t) for t in tokens]

    return " ".join(lemmas)

# Replace missing cells with "" before casting: astype(str) would otherwise turn
# NaN into the literal string "nan", which passes every filter in clean_text and
# would show up as a genuine token in the cleaned text.
df['news_clean'] = df['description'].fillna('').astype(str).apply(clean_text)
df['title_clean'] = df['title'].fillna('').astype(str).apply(clean_text)

In [7]:
# Side-by-side check of the raw titles and their cleaned versions (first five rows).
df[['title','title_clean']].head()

Unnamed: 0,title,title_clean
0,Ukraine: Angry Zelensky vows to punish Russian...,ukraine angry zelensky vow punish russian atro...
1,War in Ukraine: Taking cover in a town under a...,war ukraine taking cover town attack
2,Ukraine war 'catastrophic for global food',ukraine war catastrophic global food
3,Manchester Arena bombing: Saffie Roussos's par...,manchester arena bombing saffie roussos parent...
4,Ukraine conflict: Oil price soars to highest l...,ukraine conflict oil price soar highest level ...


## Baseline: TF‑IDF Keyword Extraction
We build a TF‑IDF vectorizer and extract top terms per document.

In [9]:

# TF-IDF over unigrams and bigrams of the cleaned descriptions, limited to 5000
# features.
# NOTE(review): 'news_clean' already had NLTK stopwords removed in clean_text;
# stop_words='english' applies scikit-learn's own English list on top of that —
# confirm the double filtering is intended.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=5000,
    min_df=1,
    stop_words='english'
)

# Learn the vocabulary and vectorize the corpus in one pass. The search and
# keyword helpers below address rows of X_tfidf by position.
X_tfidf = tfidf_vectorizer.fit_transform(df['news_clean'])

# Kept as a NumPy array so terms can be looked up by feature (column) index.
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

def top_tfidf_terms_for_doc(row_index: int, top_n: int = 10, matrix=None, names=None):
    """Return the highest-weighted TF-IDF terms of one document.

    Only terms that actually occur in the document (stored, non-zero entries of
    its row) are considered, so a document with fewer than ``top_n`` terms
    yields a shorter list instead of being padded with zero-score vocabulary
    entries it does not contain.

    Args:
        row_index: Positional row of the document in the TF-IDF matrix.
        top_n: Maximum number of terms to return; values <= 0 give ``[]``.
        matrix: Sparse document-term matrix supporting ``getrow``. Defaults to
            the notebook-level ``X_tfidf``.
        names: Array mapping column index to term. Defaults to the
            notebook-level ``feature_names``.

    Returns:
        A list of ``(term, score)`` tuples sorted by descending score; ties
        keep the order in which the entries are stored in the row.
    """
    if matrix is None:
        matrix = X_tfidf
    if names is None:
        names = feature_names
    if top_n <= 0:
        return []

    row = matrix.getrow(row_index).tocsr()
    if row.nnz == 0:
        return []

    # Rank the stored entries directly: no dense copy of the whole vocabulary row,
    # and no per-element sparse indexing. Negating the scores with a stable sort
    # gives a descending order with deterministic tie-breaking.
    order = np.argsort(-row.data, kind='stable')[:top_n]
    return [(names[row.indices[j]], row.data[j]) for j in order]

# Example: show top terms for the first document.
# Rows of the TF-IDF matrix are addressed by position, so the title is looked up
# by position too (.iloc); a label lookup (.loc) only agrees while df keeps its
# default 0..n-1 index.
for i in range(1):
    print(f"\nDoc {i} — {df['title'].iloc[i]}")
    print([w for w,_ in top_tfidf_terms_for_doc(i, top_n=8)])



Doc 0 — Ukraine: Angry Zelensky vows to punish Russian atrocities
['ukrainian president', 'president say', 'civilian', 'murder', 'ukrainian', 'country', 'president', 'say']


## Baseline: TF‑IDF Semantic Search
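We vectorize the corpus with TF‑IDF, transform the cleaned query with the same vectorizer, and rank documents by cosine similarity. Because TF‑IDF only matches shared terms, this is a lexical baseline for the embedding‑based search below.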

In [22]:

def tfidf_search(query: str, top_k: int = 5):
    """Rank corpus documents against a query by TF-IDF cosine similarity.

    The query is passed through ``clean_text`` and the fitted
    ``tfidf_vectorizer`` so it is represented the same way as the corpus.

    Args:
        query: Free-text search query.
        top_k: Maximum number of hits to return; values <= 0 give an empty frame.

    Returns:
        A copy of the matching rows of ``df`` (column ``news_clean``) with an
        added ``score`` column, sorted by descending similarity. Documents with
        zero similarity are omitted, so fewer than ``top_k`` rows (possibly
        none) come back when the query shares little or no vocabulary with the
        corpus.
    """
    q_clean = clean_text(query)
    q_vec = tfidf_vectorizer.transform([q_clean])
    sims = cosine_similarity(q_vec, X_tfidf)[0]

    # Stable sort on the negated scores: descending similarity, ties broken by
    # ascending row position so repeated runs return the same ordering.
    ranked = np.argsort(-sims, kind='stable')[:max(top_k, 0)]

    # A zero score means no shared terms; returning such rows would present
    # arbitrary documents as if they were matches.
    top_idx = ranked[sims[ranked] > 0]

    results = df.iloc[top_idx][['news_clean']].copy()
    results['score'] = sims[top_idx]
    return results

# Quick demo: the three best TF-IDF matches for a sample query.
tfidf_search("higher energy costs", top_k=3)


Unnamed: 0,news_clean,score
7234,people turning renewable energy way cut energy...,0.640383
4,consumer feeling impact higher energy cost fue...,0.579986
62,consumer feeling impact higher energy cost fue...,0.579986


## Sentence‑BERT Embeddings for Semantic Search

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained Sentence-Transformers model to load.
embed_model_name = 'all-MiniLM-L6-v2'
# Single shared encoder; the cells below use it for both the corpus and the queries.
embedder = SentenceTransformer(embed_model_name)



In [13]:
# Embed every cleaned description in batches of 32. normalize_embeddings=True
# requests unit-length vectors, which is what lets the FAISS section treat an
# inner product as cosine similarity.
# NOTE(review): the corpus is embedded from 'news_clean' (stopwords removed,
# lemmatized) while sbert_search / faiss_search embed the raw query string —
# confirm this asymmetry is intended.
corpus_embeddings = embedder.encode(df['news_clean'].tolist(), batch_size=32, convert_to_numpy=True, normalize_embeddings=True)

# Use NearestNeighbors with cosine distance (1 - cosine_similarity).
# n_neighbors=10 is only the default; sbert_search passes its own top_k per query.
nn = NearestNeighbors(n_neighbors=10, metric='cosine')
nn.fit(corpus_embeddings)


In [15]:
def sbert_search(query: str, top_k: int = 5):
    """Find the documents whose embeddings are closest to the query embedding.

    The query is encoded with the shared ``embedder`` (normalized) and looked
    up in the fitted ``nn`` index, which uses cosine distance.

    Args:
        query: Free-text search query (encoded as given, without ``clean_text``).
        top_k: Maximum number of hits to return. Values larger than the number
            of indexed documents are clamped; values <= 0 give an empty frame.

    Returns:
        A copy of the matching rows of ``df`` (columns ``title`` and
        ``news_clean``) with an added ``score`` column holding the cosine
        similarity (``1 - distance``), best match first.
    """
    # kneighbors raises ValueError when asked for fewer than one neighbour or for
    # more neighbours than there are fitted samples, so bound the request first.
    k = min(top_k, nn.n_samples_fit_)

    if k <= 0:
        indices = np.empty(0, dtype=int)
        sims = np.empty(0, dtype=float)
    else:
        q_vec = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
        distances, indices = nn.kneighbors(q_vec, n_neighbors=k)
        indices = indices[0]
        # Convert cosine distance to similarity
        sims = 1 - distances[0]

    results = df.iloc[indices][['title','news_clean']].copy()
    results['score'] = sims
    return results

# Same sample query as the TF-IDF demo, for a side-by-side comparison.
sbert_search("higher energy costs", top_k=3)

Unnamed: 0,title,news_clean,score
7891,Rich to get twice as much cost-of-living suppo...,resolution foundation say limit energy cost ne...,0.694362
10089,"Schools to cut staff in budget squeeze, union ...",funding increasing cost pay energy rising fast,0.647827
11654,What is the UK inflation rate and why is the c...,cost living increasing fastest rate year due r...,0.620408


## Speed-up with FAISS

In [23]:
import faiss
# Embedding dimensionality = number of columns of the corpus embedding matrix.
dim = corpus_embeddings.shape[1]

index = faiss.IndexFlatIP(dim)  # inner product on normalized vectors ≈ cosine similarity
# Cast explicitly to float32 before adding (FAISS indexes take float32 input).
index.add(corpus_embeddings.astype('float32'))

def faiss_search(query: str, top_k: int = 5):
    """Search the FAISS inner-product index for the documents closest to a query.

    The query is encoded with the shared ``embedder`` (normalized, float32) so
    that the inner product returned by the index equals cosine similarity.

    Args:
        query: Free-text search query (encoded as given, without ``clean_text``).
        top_k: Maximum number of hits to return. Values larger than the number
            of indexed vectors are clamped; values <= 0 give an empty frame.

    Returns:
        A copy of the matching rows of ``df`` (columns ``title`` and
        ``news_clean``) with an added ``score`` column, best match first.
    """
    # Never ask for more neighbours than the index holds: FAISS pads the missing
    # slots with label -1, and df.iloc[-1] would silently return the LAST row.
    k = min(top_k, index.ntotal)

    if k <= 0:
        hits = np.empty(0, dtype=int)
        hit_scores = np.empty(0, dtype='float32')
    else:
        q_vec = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype('float32')
        scores, idx = index.search(q_vec, k)
        # Defensive: drop any padded (-1) labels before positional indexing.
        found = idx[0] >= 0
        hits = idx[0][found]
        hit_scores = scores[0][found]

    results = df.iloc[hits][['title','news_clean']].copy()
    results['score'] = hit_scores
    return results


# Same sample query again; the scores should match the NearestNeighbors search above.
faiss_search("higher energy costs", top_k=3)


Unnamed: 0,title,news_clean,score
7891,Rich to get twice as much cost-of-living suppo...,resolution foundation say limit energy cost ne...,0.694362
10089,"Schools to cut staff in budget squeeze, union ...",funding increasing cost pay energy rising fast,0.647827
9950,What is the UK inflation rate and why is the c...,cost living increasing fastest rate year due r...,0.620408
