In [2]:
import feedparser
import json


In [9]:
# Fetch the Daily Star business feed and preview its first few entries.
rss_url = "https://www.thedailystar.net/business/rss.xml"
feed = feedparser.parse(rss_url)

print("Entries Found:", len(feed.entries))

# (label, entry attribute) pairs, printed in this order for each entry.
preview_fields = (
    ("Title:", "title"),
    ("Link:", "link"),
    ("Published:", "published"),
    ("Summary:", "summary"),
)

for entry in feed.entries[:5]:  # only the first 5 entries
    for label, attr_name in preview_fields:
        print(label, getattr(entry, attr_name))
    print("-" * 40)


Entries Found: 20
Title: ICMAB honours excellence in accounting and finance research
Link: https://www.thedailystar.net/business/organisation-news/news/icmab-honours-excellence-accounting-and-finance-research-4059981
Published: Tue, 16 Dec 2025 20:02:38 +0600
Summary: Quality research in accounting and finance is key to strengthening governance, transparency and accountability, said Hossain Zillur Rahman, chairperson and senior trustee of BRAC..He made the remarks while speaking as the chief guest at an award-giving ceremony, titled ‚ÄúICMAB Research Exce
----------------------------------------
Title: Al-Arafah Islami Bank donates Tk 8 lakh to Dhaka National Medical College
Link: https://www.thedailystar.net/business/banking/news/al-arafah-islami-bank-donates-tk-8-lakh-dhaka-national-medical-college-4059991
Published: Tue, 16 Dec 2025 19:54:39 +0600
Summary: Al-Arafah Islami Bank PLC has donated Tk 8 lakh to Dhaka National Medical College under its corporate social responsibility (CSR

In [None]:
# Convert every entry of the Daily Star business feed into the project's
# document schema and save the list as JSON.
rss_url = "https://www.thedailystar.net/business/rss.xml"
feed = feedparser.parse(rss_url)

docs = []

for i, entry in enumerate(feed.entries):
    # Any of these attributes may be missing on an entry; default to "".
    title = getattr(entry, 'title', '')
    link = getattr(entry, 'link', '')
    summary = getattr(entry, 'summary', '')
    date = getattr(entry, 'published', '')

    doc = {
        "doc_id": f"en_{i:04d}",
        "title": title,
        "body": summary,
        "url": link,
        "date": date,
        "language": "en",
        # Whitespace-separated word count of the summary.
        "token_count": len(summary.split())
    }
    docs.append(doc)

saving_path = r"C:\Users\RAZER\Documents\Python Scripts\CLIR_Project\data"

# "\\" is an explicit backslash. The previous "\document_en.json" relied on
# the invalid escape sequence "\d", which newer Python versions warn about.
# The resulting path is unchanged.
with open(saving_path + "\\document_en.json", "w", encoding="utf-8") as f:
    json.dump(docs, f, ensure_ascii=False, indent=2)

print("Document data saved to document_en.json")


Document data saved to document_en.json


In [16]:
from tqdm import tqdm

def collect_from_rss_feeds(rss_feeds, max_docs=200):
    """Collect de-duplicated documents from several RSS feeds.

    Args:
        rss_feeds: iterable of RSS feed URLs, processed in order.
        max_docs: maximum number of documents to return. ``None`` disables
            the cap; zero or a negative value returns an empty list.

    Returns:
        A list of dicts with the keys ``doc_id``, ``title``, ``body``,
        ``url``, ``date``, ``language`` and ``token_count``. Entries without
        a link, or whose link was already seen, are skipped.
    """
    # The in-loop cap check only runs after an append, so without this guard
    # max_docs=0 would still return one document.
    if max_docs is not None and max_docs <= 0:
        return []

    docs = []
    seen_urls = set()

    for rss_url in tqdm(rss_feeds, desc="Processing RSS feeds"):
        feed = feedparser.parse(rss_url)

        for entry in feed.entries:
            url = getattr(entry, "link", "").strip()
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)

            title = getattr(entry, "title", "").strip()
            summary = getattr(entry, "summary", "").strip()
            date = getattr(entry, "published", "")

            docs.append({
                # The running document count doubles as the id counter.
                "doc_id": f"en_{len(docs):06d}",
                "title": title,
                "body": summary,
                "url": url,
                "date": date,
                "language": "en",
                "token_count": len(summary.split())
            })

            if max_docs is not None and len(docs) >= max_docs:
                return docs

    return docs


In [None]:
# Daily Star section feeds that make up the larger English corpus.
rss_feeds = [
    "https://www.thedailystar.net/frontpage/rss.xml",
    "https://www.thedailystar.net/news/bangladesh/rss.xml",
    "https://www.thedailystar.net/business/rss.xml",
    "https://www.thedailystar.net/sports/rss.xml",
    "https://www.thedailystar.net/entertainment/rss.xml",
]

docs_big = collect_from_rss_feeds(rss_feeds)
saving_path = r"C:\Users\RAZER\Documents\Python Scripts\CLIR_Project\data"

# "\\" is an explicit backslash. The previous "\document_en_big.json" relied
# on the invalid escape sequence "\d", which newer Python versions warn about.
# The resulting path is unchanged.
with open(saving_path + "\\document_en_big.json", "w", encoding="utf-8") as f:
    json.dump(docs_big, f, ensure_ascii=False, indent=2)

print("Document data saved to document_en_big.json")


Processing RSS feeds: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:04<00:00,  1.10it/s]

Document data saved to document_en_big.json





# PREPROCESSING

## Step 2.1a — English filtering (simple code)

In [15]:
import json

# Load the raw English documents.
with open("C:\\Users\\RAZER\\Documents\\Python Scripts\\CLIR_Project\\data\\document_en.json", "r", encoding="utf-8") as raw_file:
    docs_en = json.load(raw_file)

print("Before filtering:", len(docs_en))

# Keep only documents whose stored token_count is at least 20.
docs_en_clean = [doc for doc in docs_en if doc["token_count"] >= 20]

print("After filtering:", len(docs_en_clean))

# Write the filtered documents to the "clean" file.
with open("C:\\Users\\RAZER\\Documents\\Python Scripts\\CLIR_Project\\data\\documents_en_clean.json", "w", encoding="utf-8") as clean_file:
    json.dump(docs_en_clean, clean_file, ensure_ascii=False, indent=2)


Before filtering: 190
After filtering: 128


## Step 2.1b — Bangla filtering (same idea)

In [8]:
import json

# Load the raw Bangla documents.
with open("C:\\Users\\RAZER\\Documents\\Python Scripts\\CLIR_Project\\data\\document_ba.json", "r", encoding="utf-8") as raw_file:
    docs_bn = json.load(raw_file)

print("Before filtering:", len(docs_bn))

# Keep only documents whose stored token_count is at least 20.
docs_bn_clean = [doc for doc in docs_bn if doc["token_count"] >= 20]

print("After filtering:", len(docs_bn_clean))

# Write the filtered documents to the "clean" file.
with open("C:\\Users\\RAZER\\Documents\\Python Scripts\\CLIR_Project\\data\\documents_bn_clean.json", "w", encoding="utf-8") as clean_file:
    json.dump(docs_bn_clean, clean_file, ensure_ascii=False, indent=2)


Before filtering: 200
After filtering: 133


## Step 2.2 — Normalize English text (VERY LIGHT)

Without normalization, `Bangladesh` ≠ `bangladesh`: the two spellings would be treated as different words.

In [16]:
def normalize_english(text):
    """Lower-case `text` and collapse every run of whitespace to one space."""
    words = text.lower().split()  # split() also drops leading/trailing whitespace
    return " ".join(words)

# Normalize the searchable text fields of every cleaned document in place.
for doc in docs_en_clean:
    for field in ("title", "body"):
        doc[field] = normalize_english(doc[field])

# Overwrite the cleaned file with the normalized documents.
with open("C:\\Users\\RAZER\\Documents\\Python Scripts\\CLIR_Project\\data\\documents_en_clean.json", "w", encoding="utf-8") as clean_file:
    json.dump(docs_en_clean, clean_file, ensure_ascii=False, indent=2)

print("English text normalized")


English text normalized


In [17]:
# Demo: lower-casing alone is not enough; whitespace must be collapsed too.
text = "Python   Is   Awesome"
query = "python is awesome"

lowered_text, lowered_query = text.lower(), query.lower()

# Extra spaces still make the strings differ after lower-casing.
print(lowered_text == lowered_query)
# Collapsing whitespace as well makes them equal.
print(" ".join(lowered_text.split()) == " ".join(lowered_query.split()))


False
True


## Step 2.3 — Quick sanity check (important)

In [20]:
# Spot-check one cleaned document: its title and the first 150 body characters.
sample_doc = docs_en_clean[17]
print(sample_doc["title"])
print(sample_doc["body"][:150])


‚Äòyou cannot remain silent‚Äô
the high court yesterday came down hard on bangladesh railway for mismanagement, ticket scalping and carrying passengers on the train roofs.


# STEP 3: INDEXING

Indexing means preparing your documents so the computer can search them fast.

--------------------------------------------------------------------------------------------

Without indexing:

the computer would read every article for every query ❌

that is slow and impractical

With indexing:

the computer knows where each word appears

search becomes fast

--------------------------------------------------------------------------------------------------------------
Each document has:

doc_id

title

body

We index mainly:
👉 title + body

--------------------------------------------------------------------------------
Types of indexing (high-level)

Your assignment expects at least one, preferably two:

🔹 A) Lexical Indexing (Keyword-based)

Examples:

TF-IDF

BM25 ✅ (most popular, recommended)

This answers:

“Does this document contain the query words?”

🔹 B) Semantic Indexing (Meaning-based)

Examples:

sentence embeddings

multilingual embeddings

This answers:

“Does this document mean the same thing as the query?”

We will do A first, then B later.

----------------------------------------------------------------------------
5️⃣ BM25 — explained very simply

BM25 is a smart keyword search.

It considers:
1️⃣ How many times a word appears
2️⃣ How rare the word is
3️⃣ How long the document is

------------------------------------------
Example

Query: Bangladesh cricket


Document A:

Bangladesh cricket team won the match


→ very relevant ✅

Document B:

Bangladesh economy is improving


→ not relevant ❌

BM25 gives higher score to A.

6️⃣ What BM25 produces

After indexing, you get:

Query → [ (doc_id, score), (doc_id, score), ... ]


Example:

("en_00123", 4.56)
("en_00087", 3.91)
("en_00210", 2.88)


Higher score = more relevant.

7️⃣ Where indexing fits in the pipeline
Clean JSON
   ↓
Index (BM25)
   ↓
Fast Search
   ↓
Ranked Results


Indexing does NOT change your stored data; it only builds a separate lookup structure on top of it.

## 3.1 — BM25

In [None]:
!pip -q install rank-bm25


In [None]:
! pip install scipy pandas scikit-learn



In [25]:
import json
import re
from rank_bm25 import BM25Okapi


# Location of the raw (unfiltered) English document file.
DATA_PATH = "C:\\Users\\RAZER\\Documents\\Python Scripts\\CLIR_Project\\data\\document_en.json"

with open(DATA_PATH, "r", encoding="utf-8") as data_file:
    docs = json.load(data_file)

# Quick look at the size and schema of what was loaded.
first_doc = docs[0]
print("Loaded docs:", len(docs))
print("Example keys:", list(first_doc.keys()))
print("Example title:", first_doc["title"])


Loaded docs: 190
Example keys: ['doc_id', 'title', 'body', 'url', 'date', 'language', 'token_count']
Example title: AusChamp


## 3.2 — Simple tokenizer (beginner-safe)

In [27]:
def tokenize_en(text):
    """Split English text into lower-case ASCII alphanumeric tokens.

    Every character other than a-z, 0-9 or whitespace becomes a separator.
    """
    lowered = text.lower()
    separators_blanked = re.sub(r"[^a-z0-9\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", separators_blanked)
    return single_spaced.strip().split()


## 3.3 Build BM25 index For English

In [36]:
# Document ids in corpus order, so a BM25 score position maps back to a doc.
doc_ids = [doc.get("doc_id", "") for doc in docs]

# The title is repeated three times before the body so that title words
# carry extra weight in the term statistics.
corpus_tokens = [
    tokenize_en((doc.get("title", "") + " ") * 3 + doc.get("body", ""))
    for doc in docs
]

bm25 = BM25Okapi(corpus_tokens)

print("BM25 index built.")
print("Documents indexed:", len(doc_ids))


BM25 index built.
Documents indexed: 190


## 3.4 Search function

In [37]:
def bm25_search(query, top_k=5):
    """Rank the indexed English documents against `query`.

    Returns a list of up to `top_k` dicts with the keys ``doc_id``,
    ``score``, ``title`` and ``url``, ordered by descending BM25 score.
    """
    scores = bm25.get_scores(tokenize_en(query))

    # Positions of the highest-scoring documents, best first.
    best_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]

    return [
        {
            "doc_id": doc_ids[pos],
            "score": float(scores[pos]),
            "title": docs[pos].get("title", ""),
            "url": docs[pos].get("url", ""),
        }
        for pos in best_positions
    ]


In [30]:
# Try the English index with a sample query.
results = bm25_search("Bangladesh cricket", top_k=5)

for hit in results:
    print(*(hit[key] for key in ("score", "doc_id", "title")), sep=" | ")
    print("  ", hit["url"])


5.490425641809947 | en_000067 | BPL Sylhet phase tickets to go on sale Sunday
   https://www.thedailystar.net/sports/cricket/news/bpl-sylhet-phase-tickets-go-sale-sunday-4062621
5.266652460927968 | en_000180 | Bangladesh cricket: Is the old culture of interference back?
   https://www.thedailystar.net/star-multimedia/sports-multimedia/news/bangladesh-cricket-the-old-culture-interference-back-4027566
4.899999449249533 | en_000068 | BCB expresses solidarity with The Daily Star, Prothom Alo
   https://www.thedailystar.net/sports/cricket/news/bcb-expresses-solidarity-the-daily-star-prothom-alo-4062611
4.843796995241667 | en_000139 | Sohan captain for Zim T20Is
   https://www.thedailystar.net/sports/cricket/news/sohan-captain-zim-t20is-3077041
4.677107173860765 | en_000178 | When will women athletes get safe space in Bangladesh?
   https://www.thedailystar.net/star-multimedia/sports-multimedia/news/when-will-women-athletes-get-safe-space-bangladesh-4032581


## 3.5 Save the index to reuse later

### 3.5.1 Save BM25 index

In [32]:
import pickle

# Bundle the index with the ids and documents it was built from, so one file
# is enough to restore search later.
index_bundle = {"bm25": bm25, "doc_ids": doc_ids, "docs": docs}

with open("bm25_en.pkl", "wb") as pkl_file:
    pickle.dump(index_bundle, pkl_file)

print("Saved bm25_en.pkl")


Saved bm25_en.pkl


### 3.5.2 Load BM25 index later

In [33]:
import pickle

# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# this notebook wrote itself.
with open("bm25_en.pkl", "rb") as pkl_file:
    pack = pickle.load(pkl_file)

# Restore the three pieces saved in the bundle.
bm25, doc_ids, docs = pack["bm25"], pack["doc_ids"], pack["docs"]

print("Loaded BM25 index with docs:", len(docs))


Loaded BM25 index with docs: 190


 ## 3.6 Build BM25 index For Bangla

In [34]:
# Location of the raw (unfiltered) Bangla document file.
DATA_PATH_BN = "C:\\Users\\RAZER\\Documents\\Python Scripts\\CLIR_Project\\data\\document_ba.json"

with open(DATA_PATH_BN, "r", encoding="utf-8") as bn_file:
    docs_bn = json.loads(bn_file.read())

def tokenize_bn(text):
    """Split Bangla text on whitespace only; punctuation stays attached."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip().split()

# Tokenize "title body" for every Bangla document (no title weighting here).
corpus_tokens_bn = [
    tokenize_bn((doc.get("title","") + " " + doc.get("body","")).strip())
    for doc in docs_bn
]
# Ids in the same order as the corpus, to map score positions back to docs.
doc_ids_bn = [doc.get("doc_id","") for doc in docs_bn]

bm25_bn = BM25Okapi(corpus_tokens_bn)

print("BM25 (Bangla) index built. Docs:", len(doc_ids_bn))


BM25 (Bangla) index built. Docs: 200


## 3.7 Bangla search

In [35]:
def bm25_search_bn(query, top_k=5):
    """Rank the indexed Bangla documents against `query`.

    Returns a list of up to `top_k` dicts with the keys ``doc_id``,
    ``score``, ``title`` and ``url``, ordered by descending BM25 score.
    """
    scores = bm25_bn.get_scores(tokenize_bn(query))

    # Positions of the highest-scoring documents, best first.
    best_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]

    return [
        {
            "doc_id": doc_ids_bn[pos],
            "score": float(scores[pos]),
            "title": docs_bn[pos].get("title", ""),
            "url": docs_bn[pos].get("url", ""),
        }
        for pos in best_positions
    ]

# Try the Bangla index with the query "Bangladesh cricket". The literal is
# real Bangla text; the mis-encoded form contained no Bangla characters.
results_bn = bm25_search_bn("বাংলাদেশ ক্রিকেট", top_k=5)

for r in results_bn:
    print(r["score"], "|", r["doc_id"], "|", r["title"])
    print("  ", r["url"])


7.5651130716395185 | ba_000131 | ‡¶ú‡¶ø‡¶Æ‡ßç‡¶¨‡¶æ‡¶¨‡ßÅ‡ßü‡ßá‡¶∞ ‡¶ü‡ßá‡¶∏‡ßç‡¶ü ‡¶ì ‡¶ì‡ßü‡¶æ‡¶®‡¶°‡ßá ‡¶®‡ßá‡¶§‡ßÉ‡¶§‡ßç‡¶¨‡ßá ‡¶™‡¶∞‡¶ø‡¶¨‡¶∞‡ßç‡¶§‡¶®, ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶™‡ßá‡¶≤‡ßá‡¶® ‡¶ï‡ßá?
   https://www.jagonews24.com/sports/cricket/1077727
6.555988274863213 | ba_000053 | ‡¶ü‡¶ø‡¶≠‡¶ø‡¶§‡ßá ‡¶Ü‡¶ú‡¶ï‡ßá‡¶∞ ‡¶ñ‡ßá‡¶≤‡¶æ
   https://www.risingbd.com/sports/news/632882
5.862020324656238 | ba_000021 | ‡¶ï‡¶®‡¶ì‡ßü‡ßá-‡¶≤‡ßç‡¶Ø‡¶æ‡¶•‡¶æ‡¶Æ‡ßá‡¶∞ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶∞‡ßá‡¶ï‡¶∞‡ßç‡¶° ‡¶Ü‡¶∞ ‡¶â‡¶á‡¶®‡ßç‡¶°‡¶ø‡¶ú‡ßá‡¶∞ ‡¶∏‡¶æ‡¶Æ‡¶®‡ßá ‡¶Ö‡¶∏‡¶Æ‡ßç‡¶≠‡¶¨‡ßá‡¶∞ ‡¶™‡¶æ‡¶π‡¶æ‡ßú
   https://www.risingbd.com/sports/news/632914
2.9889713948885794 | ba_000129 | ‡¶¢‡¶æ‡¶ï‡¶æ‡ßü ‡¶™‡ßá‡ßó‡¶Å‡¶õ‡ßá‡¶õ‡ßá ‡¶∂‡¶π‡ßÄ‡¶¶ ‡¶∂‡¶∞‡¶ø‡¶´ ‡¶ì‡¶∏‡¶Æ‡¶æ‡¶® ‡¶π‡¶æ‡¶¶‡¶ø‡¶∞ ‡¶Æ‡¶∞‡¶¶‡ßá‡¶π
   https://bangladeshdiplomat.com/11050/latest/%e0%a6%a2%e0%a6%be%e0%a6%95%e0%a6%be%e0%a6%af%e0%a6%bc-%e0%a6%aa%e0%a7%8c%e0%a6%81%e0%a6%9b%e0%a7%87%e0%a6%9b%e0%a7%87-%e0%a6%b6%e0%a6%b9%e0%a7%80%e0%a6%a6-%e0%a6%b6%e0%a6%b

# Step 3: Build & Save BM25 Indexes

In [None]:
!pip -q install rank-bm25
!pip install scipy pandas scikit-learn


## 3.1: Config

In [50]:

import json
import re
import pickle
from rank_bm25 import BM25Okapi


# Both cleaned document files live in the same project data folder.
_DATA_DIR = "C:\\Users\\RAZER\\Documents\\Python Scripts\\CLIR_Project\\data"

EN_PATH = _DATA_DIR + "\\documents_en_clean.json"
BN_PATH = _DATA_DIR + "\\documents_bn_clean.json"

# Output files for the pickled BM25 indexes (relative to the working directory).
EN_INDEX_OUT, BN_INDEX_OUT = "bm25_en.pkl", "bm25_bn.pkl"


## 3.2: Tokenizers (simple + stable)

### 3.2.1 English tokenizer (lowercase + remove punctuation)

In [51]:
def tokenize_en(text):
    """Split English text into lower-case ASCII alphanumeric tokens.

    Every character other than a-z, 0-9 or whitespace becomes a separator.
    """
    lowered = text.lower()
    separators_blanked = re.sub(r"[^a-z0-9\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", separators_blanked)
    return single_spaced.strip().split()


### 3.2.2 Bangla tokenizer (whitespace only; keep Bangla characters)

In [52]:
def tokenize_bn(text):
    """Split Bangla text on whitespace only; punctuation stays attached."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip().split()


## 3.3 Build + Save English BM25 index (CLEAN dataset)

In [53]:
with open(EN_PATH, "r", encoding="utf-8") as en_file:
    docs_en = json.load(en_file)

# Ids in corpus order, so a BM25 score position maps back to a document.
doc_ids_en = [doc.get("doc_id", "") for doc in docs_en]

# The title is repeated three times before the body so that title words
# carry extra weight in the term statistics.
corpus_en = [
    tokenize_en((doc.get("title", "") + " ") * 3 + doc.get("body", ""))
    for doc in docs_en
]

bm25_en = BM25Okapi(corpus_en)

# Store the index together with the ids and documents it was built from.
en_bundle = {"bm25": bm25_en, "doc_ids": doc_ids_en, "docs": docs_en}
with open(EN_INDEX_OUT, "wb") as pkl_file:
    pickle.dump(en_bundle, pkl_file)

print("English docs indexed:", len(doc_ids_en))
print("Saved:", EN_INDEX_OUT)


English docs indexed: 128
Saved: bm25_en.pkl


## 3.4 Cell 5 — Build + Save Bangla BM25 index (CLEAN dataset)

In [54]:
with open(BN_PATH, "r", encoding="utf-8") as bn_file:
    docs_bn = json.load(bn_file)

# Ids in corpus order, so a BM25 score position maps back to a document.
doc_ids_bn = [doc.get("doc_id", "") for doc in docs_bn]

# The title is repeated three times before the body so that title words
# carry extra weight in the term statistics.
corpus_bn = [
    tokenize_bn((doc.get("title", "") + " ") * 3 + doc.get("body", ""))
    for doc in docs_bn
]

bm25_bn = BM25Okapi(corpus_bn)

# Store the index together with the ids and documents it was built from.
bn_bundle = {"bm25": bm25_bn, "doc_ids": doc_ids_bn, "docs": docs_bn}
with open(BN_INDEX_OUT, "wb") as pkl_file:
    pickle.dump(bn_bundle, pkl_file)

print("Bangla docs indexed:", len(doc_ids_bn))
print("Saved:", BN_INDEX_OUT)


Bangla docs indexed: 133
Saved: bm25_bn.pkl


## 3.5 Load indexes + Search functions (deployment usage)

In [55]:
def load_index(path):
    """Return the object unpickled from the file at `path`.

    NOTE: unpickling can execute arbitrary code; only use this on pickle
    files from a trusted source.
    """
    with open(path, "rb") as index_file:
        bundle = pickle.load(index_file)
    return bundle

# Reload both saved bundles and unpack their three components.
en_pack = load_index(EN_INDEX_OUT)
bn_pack = load_index(BN_INDEX_OUT)

bm25_en, doc_ids_en, docs_en = (en_pack[key] for key in ("bm25", "doc_ids", "docs"))
bm25_bn, doc_ids_bn, docs_bn = (bn_pack[key] for key in ("bm25", "doc_ids", "docs"))

def search_en(query, top_k=5):
    """Rank the English index against `query`.

    Returns up to `top_k` dicts (``doc_id``, ``score``, ``title``, ``url``)
    ordered by descending BM25 score.
    """
    scores = bm25_en.get_scores(tokenize_en(query))
    best_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]

    hits = []
    for pos in best_positions:
        doc = docs_en[pos]
        hits.append({
            "doc_id": doc_ids_en[pos],
            "score": float(scores[pos]),
            "title": doc.get("title",""),
            "url": doc.get("url",""),
        })
    return hits

def search_bn(query, top_k=5):
    """Rank the Bangla index against `query`.

    Returns up to `top_k` dicts (``doc_id``, ``score``, ``title``, ``url``)
    ordered by descending BM25 score.
    """
    scores = bm25_bn.get_scores(tokenize_bn(query))
    best_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]

    hits = []
    for pos in best_positions:
        doc = docs_bn[pos]
        hits.append({
            "doc_id": doc_ids_bn[pos],
            "score": float(scores[pos]),
            "title": doc.get("title",""),
            "url": doc.get("url",""),
        })
    return hits


## 3.6 Quick test (just to confirm deployment works)

In [56]:
# Smoke test of the reloaded English index.
for hit in search_en("Bangladesh cricket", top_k=5):
    print(*(hit[key] for key in ("score", "doc_id", "title")), sep=" | ")
    print("  ", hit["url"])


7.716028235618145 | en_000180 | bangladesh cricket: is the old culture of interference back?
   https://www.thedailystar.net/star-multimedia/sports-multimedia/news/bangladesh-cricket-the-old-culture-interference-back-4027566
5.178472903032165 | en_000067 | bpl sylhet phase tickets to go on sale sunday
   https://www.thedailystar.net/sports/cricket/news/bpl-sylhet-phase-tickets-go-sale-sunday-4062621
5.067799875247182 | en_000178 | when will women athletes get safe space in bangladesh?
   https://www.thedailystar.net/star-multimedia/sports-multimedia/news/when-will-women-athletes-get-safe-space-bangladesh-4032581
4.897754626590153 | en_000139 | sohan captain for zim t20is
   https://www.thedailystar.net/sports/cricket/news/sohan-captain-zim-t20is-3077041
4.6381378954469 | en_000068 | bcb expresses solidarity with the daily star, prothom alo
   https://www.thedailystar.net/sports/cricket/news/bcb-expresses-solidarity-the-daily-star-prothom-alo-4062611


In [57]:
# Smoke test of the reloaded Bangla index with the query "Bangladesh cricket".
# The literal is real Bangla text; the mis-encoded form contained no Bangla
# characters.
for r in search_bn("বাংলাদেশ ক্রিকেট", top_k=5):
    print(r["score"], "|", r["doc_id"], "|", r["title"])
    print("  ", r["url"])


8.072128349257582 | ba_000131 | ‡¶ú‡¶ø‡¶Æ‡ßç‡¶¨‡¶æ‡¶¨‡ßÅ‡ßü‡ßá‡¶∞ ‡¶ü‡ßá‡¶∏‡ßç‡¶ü ‡¶ì ‡¶ì‡ßü‡¶æ‡¶®‡¶°‡ßá ‡¶®‡ßá‡¶§‡ßÉ‡¶§‡ßç‡¶¨‡ßá ‡¶™‡¶∞‡¶ø‡¶¨‡¶∞‡ßç‡¶§‡¶®, ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶™‡ßá‡¶≤‡ßá‡¶® ‡¶ï‡ßá?
   https://www.jagonews24.com/sports/cricket/1077727
5.89117003360505 | ba_000021 | ‡¶ï‡¶®‡¶ì‡ßü‡ßá-‡¶≤‡ßç‡¶Ø‡¶æ‡¶•‡¶æ‡¶Æ‡ßá‡¶∞ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶∞‡ßá‡¶ï‡¶∞‡ßç‡¶° ‡¶Ü‡¶∞ ‡¶â‡¶á‡¶®‡ßç‡¶°‡¶ø‡¶ú‡ßá‡¶∞ ‡¶∏‡¶æ‡¶Æ‡¶®‡ßá ‡¶Ö‡¶∏‡¶Æ‡ßç‡¶≠‡¶¨‡ßá‡¶∞ ‡¶™‡¶æ‡¶π‡¶æ‡ßú
   https://www.risingbd.com/sports/news/632914
2.426167855952932 | ba_000078 | ‡¶ï‡¶≤‡¶ï‡¶æ‡¶§‡¶æ‡ßü ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶â‡¶™-‡¶¶‡ßÇ‡¶§‡¶æ‡¶¨‡¶æ‡¶∏‡ßá‡¶∞ ‡¶∏‡¶æ‡¶Æ‡¶®‡ßá ‡¶¨‡¶ø‡¶ï‡ßç‡¶∑‡ßã‡¶≠
   https://www.risingbd.com/international/news/632857
2.416695470321029 | ba_000001 | ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶π‡¶æ‡¶á‡¶ï‡¶Æ‡¶ø‡¶∂‡¶®‡ßá‡¶∞ ‡¶¨‡¶æ‡¶á‡¶∞‡ßá ‡¶¨‡ßç‡¶Ø‡¶æ‡¶∞‡¶ø‡¶ï‡ßá‡¶° ‡¶≠‡¶æ‡¶ô‡¶æ‡¶∞ ‡¶ö‡ßá‡¶∑‡ßç‡¶ü‡¶æ ‡¶π‡ßü‡¶®‡¶ø: ‡¶≠‡¶æ‡¶∞‡¶§
   https://www.risingbd.com/international/news/632934
2.263514804100391 | ba_000129 | ‡¶¢‡¶æ‡¶ï‡¶

# Step 4 (Actual Deployment): BN↔EN CLIR using Translation + BM25

In [58]:
!pip -q install transformers sentencepiece sacremoses


In [59]:
!pip -q install torch


# 4.1 Load saved BM25 indexes (use Step 3 files)

In [1]:
import pickle
import re

def load_index(path):
    """Return the object unpickled from the file at `path`.

    NOTE: unpickling can execute arbitrary code; only use this on pickle
    files from a trusted source.
    """
    with open(path, "rb") as index_file:
        bundle = pickle.load(index_file)
    return bundle

# Load the bundles written in Step 3 and unpack their three components.
en_pack = load_index("bm25_en.pkl")
bn_pack = load_index("bm25_bn.pkl")

bm25_en, doc_ids_en, docs_en = (en_pack[key] for key in ("bm25", "doc_ids", "docs"))
bm25_bn, doc_ids_bn, docs_bn = (bn_pack[key] for key in ("bm25", "doc_ids", "docs"))

def tokenize_en(text):
    """Split English text into lower-case ASCII alphanumeric tokens.

    Every character other than a-z, 0-9 or whitespace becomes a separator.
    """
    lowered = text.lower()
    separators_blanked = re.sub(r"[^a-z0-9\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", separators_blanked)
    return single_spaced.strip().split()

def tokenize_bn(text):
    """Split Bangla text on whitespace only; punctuation stays attached."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip().split()

def search_en(query, top_k=5):
    """Rank the English index against `query`.

    Returns up to `top_k` dicts (``doc_id``, ``score``, ``title``, ``url``)
    ordered by descending BM25 score.
    """
    scores = bm25_en.get_scores(tokenize_en(query))
    best_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]

    hits = []
    for pos in best_positions:
        doc = docs_en[pos]
        hits.append({
            "doc_id": doc_ids_en[pos],
            "score": float(scores[pos]),
            "title": doc.get("title",""),
            "url": doc.get("url",""),
        })
    return hits

def search_bn(query, top_k=5):
    """Rank the Bangla index against `query`.

    Returns up to `top_k` dicts (``doc_id``, ``score``, ``title``, ``url``)
    ordered by descending BM25 score.
    """
    scores = bm25_bn.get_scores(tokenize_bn(query))
    best_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]

    hits = []
    for pos in best_positions:
        doc = docs_bn[pos]
        hits.append({
            "doc_id": doc_ids_bn[pos],
            "score": float(scores[pos]),
            "title": doc.get("title",""),
            "url": doc.get("url",""),
        })
    return hits

# Report how many documents each loaded index covers.
print(f"Indexes loaded: {len(docs_en)} EN docs | {len(docs_bn)} BN docs")


Indexes loaded: 128 EN docs | 133 BN docs


## 4.2 Language detection (Bangla vs English) (simple + reliable)

In [3]:
def is_bangla(text: str) -> bool:
    """Return True if `text` has at least one code point in U+0980–U+09FF.

    U+0980–U+09FF is the Bengali Unicode block. One such character is enough
    to treat the whole text as Bangla.
    """
    return any(0x0980 <= ord(ch) <= 0x09FF for ch in text)


## 4.3 Load BN↔EN translation models (MarianMT / OPUS-MT)

In [4]:
from transformers import MarianMTModel, MarianTokenizer

# Checkpoint names for the two translation directions.
BN_EN_NAME = "Helsinki-NLP/opus-mt-bn-en"
EN_BN_NAME = "shhossain/opus-mt-en-to-bn"


def _load_marian(checkpoint):
    """Return the (tokenizer, model) pair loaded from one MarianMT checkpoint."""
    return (
        MarianTokenizer.from_pretrained(checkpoint),
        MarianMTModel.from_pretrained(checkpoint),
    )


tok_bn_en, mod_bn_en = _load_marian(BN_EN_NAME)
tok_en_bn, mod_en_bn = _load_marian(EN_BN_NAME)

def translate_bn_to_en(text: str) -> str:
    """Translate one Bangla string to English with the BN→EN Marian model.

    Generation is capped at 128 new tokens, and the tokenizer is called with
    truncation enabled.
    """
    encoded = tok_bn_en([text], return_tensors="pt", padding=True, truncation=True)
    generated = mod_bn_en.generate(**encoded, max_new_tokens=128)
    decoded = tok_bn_en.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]

def translate_en_to_bn(text: str) -> str:
    """Translate one English string to Bangla with the EN→BN Marian model.

    Generation is capped at 128 new tokens, and the tokenizer is called with
    truncation enabled.
    """
    encoded = tok_en_bn([text], return_tensors="pt", padding=True, truncation=True)
    generated = mod_en_bn.generate(**encoded, max_new_tokens=128)
    decoded = tok_en_bn.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]

# Printed once the cell has finished, i.e. after both models were loaded.
print("Translation models loaded.")


  from .autonotebook import tqdm as notebook_tqdm


Translation models loaded.


In [6]:
# One call per direction. The Bangla literal ("Bangladesh cricket") is real
# Bangla text; the mis-encoded form was not valid input for the BN→EN model.
print(translate_bn_to_en("বাংলাদেশ ক্রিকেট"))
print(translate_en_to_bn("My name is Sadia"))


Bangladesh Cricket
‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶®‡¶æ‡¶Æ ‡¶∏‡¶æ‡¶¶‡¶ø‡¶Ø‡¶º‡¶æ


## 4.4 CLIR search function (this is Step 4)

In [7]:
def clir_search(query: str, top_k=5):
    """Cross-lingual search: translate the query, then search the other language.

    A query containing Bangla characters is translated to English and run
    against the English index. Any other query is translated to Bangla and
    run against the Bangla index.

    Returns a dict with the keys ``query_original``, ``query_language``,
    ``query_translated``, ``target_language`` and ``results``.
    """
    if is_bangla(query):
        source_lang, target_lang = "bn", "en"
        translated = translate_bn_to_en(query)
        hits = search_en(translated, top_k=top_k)
    else:
        source_lang, target_lang = "en", "bn"
        translated = translate_en_to_bn(query)
        hits = search_bn(translated, top_k=top_k)

    return {
        "query_original": query,
        "query_language": source_lang,
        "query_translated": translated,
        "target_language": target_lang,
        "results": hits,
    }


In [14]:
def _confidence_from_scores(results, k=10):
    if not results:
        return 0.0
    top = results[0].get("score", 0.0)
    tail = results[min(k-1, len(results)-1)].get("score", 0.0)
    if top <= 0:
        return 0.0
    conf = (top - tail) / (abs(top) + 1e-9)
    if conf < 0:
        conf = 0.0
    if conf > 1:
        conf = 1.0
    return float(conf)

def clir_search_with_conf(query: str, top_k=10):
    """Run `clir_search` and add a confidence score to its result.

    Adds the keys ``confidence`` (from `_confidence_from_scores`) and
    ``warning`` ("LOW CONFIDENCE" when the confidence is below 0.15,
    otherwise an empty string).
    """
    response = clir_search(query, top_k=top_k)
    confidence = _confidence_from_scores(response["results"], k=top_k)
    response.update(
        confidence=confidence,
        warning="" if not confidence < 0.15 else "LOW CONFIDENCE",
    )
    return response


# Demo with the Bangla query "Bangladesh cricket". The literal is real Bangla
# text; the mis-encoded form contained no Bangla characters, so is_bangla()
# would have routed it as English.
out = clir_search_with_conf("বাংলাদেশ ক্রিকেট", top_k=10)
print(out["query_original"])
print(out["query_translated"])
print("confidence:", out["confidence"], out["warning"])
print(out["results"][0]["title"] if out["results"] else "NO RESULTS")



‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü
Bangladesh Cricket
confidence: 0.7920799444355129 
bangladesh cricket: is the old culture of interference back?


## 4.5 Run real CLIR tests (one BN→EN and one EN→BN)

### 4.5.1 Bangla query retrieving English docs

In [8]:
# BN→EN test with the Bangla query "Bangladesh cricket". The literal is real
# Bangla text; the mis-encoded form contained no Bangla characters, so
# is_bangla() would have routed it as English.
out = clir_search("বাংলাদেশ ক্রিকেট", top_k=5)
print("Original:", out["query_original"])
print("Translated:", out["query_translated"])
print("Target:", out["target_language"])
print("-" * 80)
for r in out["results"]:
    print(r["score"], "|", r["doc_id"], "|", r["title"])
    print(" ", r["url"])


Original: ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü
Translated: Bangladesh Cricket
Target: en
--------------------------------------------------------------------------------
7.716028235618145 | en_000180 | bangladesh cricket: is the old culture of interference back?
  https://www.thedailystar.net/star-multimedia/sports-multimedia/news/bangladesh-cricket-the-old-culture-interference-back-4027566
5.178472903032165 | en_000067 | bpl sylhet phase tickets to go on sale sunday
  https://www.thedailystar.net/sports/cricket/news/bpl-sylhet-phase-tickets-go-sale-sunday-4062621
5.067799875247182 | en_000178 | when will women athletes get safe space in bangladesh?
  https://www.thedailystar.net/star-multimedia/sports-multimedia/news/when-will-women-athletes-get-safe-space-bangladesh-4032581
4.897754626590153 | en_000139 | sohan captain for zim t20is
  https://www.thedailystar.net/sports/cricket/news/sohan-captain-zim-t20is-3077041
4.6381378954469 | en_000068 | bcb expresses solidarity with 

### 4.5.2 English query retrieving Bangla docs

In [9]:
# EN→BN test: an English query retrieving Bangla documents.
out = clir_search("Bangladesh economy", top_k=5)

for label, key in (("Original:", "query_original"),
                   ("Translated:", "query_translated"),
                   ("Target:", "target_language")):
    print(label, out[key])
print("-" * 80)

for hit in out["results"]:
    print(*(hit[key] for key in ("score", "doc_id", "title")), sep=" | ")
    print(" ", hit["url"])


Original: Bangladesh economy
Translated: ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ö‡¶∞‡ßç‡¶•‡¶®‡ßÄ‡¶§‡¶ø
Target: bn
--------------------------------------------------------------------------------
2.426167855952932 | ba_000078 | ‡¶ï‡¶≤‡¶ï‡¶æ‡¶§‡¶æ‡ßü ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶â‡¶™-‡¶¶‡ßÇ‡¶§‡¶æ‡¶¨‡¶æ‡¶∏‡ßá‡¶∞ ‡¶∏‡¶æ‡¶Æ‡¶®‡ßá ‡¶¨‡¶ø‡¶ï‡ßç‡¶∑‡ßã‡¶≠
  https://www.risingbd.com/international/news/632857
2.416695470321029 | ba_000001 | ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶π‡¶æ‡¶á‡¶ï‡¶Æ‡¶ø‡¶∂‡¶®‡ßá‡¶∞ ‡¶¨‡¶æ‡¶á‡¶∞‡ßá ‡¶¨‡ßç‡¶Ø‡¶æ‡¶∞‡¶ø‡¶ï‡ßá‡¶° ‡¶≠‡¶æ‡¶ô‡¶æ‡¶∞ ‡¶ö‡ßá‡¶∑‡ßç‡¶ü‡¶æ ‡¶π‡ßü‡¶®‡¶ø: ‡¶≠‡¶æ‡¶∞‡¶§
  https://www.risingbd.com/international/news/632934
2.263514804100391 | ba_000129 | ‡¶¢‡¶æ‡¶ï‡¶æ‡ßü ‡¶™‡ßá‡ßó‡¶Å‡¶õ‡ßá‡¶õ‡ßá ‡¶∂‡¶π‡ßÄ‡¶¶ ‡¶∂‡¶∞‡¶ø‡¶´ ‡¶ì‡¶∏‡¶Æ‡¶æ‡¶® ‡¶π‡¶æ‡¶¶‡¶ø‡¶∞ ‡¶Æ‡¶∞‡¶¶‡ßá‡¶π
  https://bangladeshdiplomat.com/11050/latest/%e0%a6%a2%e0%a6%be%e0%a6%95%e0%a6%be%e0%a6%af%e0%a6%bc-%e0%a6%aa%e0%a7%8c%e0%a6%81%e0%a6%9b%e0%a7%87%e0%a6%9b%e0%a7%87-%e0%a6%b6%e0%a6%b9%e0%a7%80%e0%a6%a6-%e0%a6%b6%e0%a6%

# Step 5

In [10]:
import json
import csv
import math
import pandas as pd


## 5.1 Define query set (manual list, easiest & reliable)

In [11]:
# Evaluation queries. The "bn" entries are real Bangla text (the mis-encoded
# forms contained no Bangla characters, so language detection would have
# treated them as English):
#   q001 "Bangladesh cricket", q002 "price has risen, rice",
#   q005 "Chittagong University".
queries = [
    {"qid": "q001", "query": "বাংলাদেশ ক্রিকেট", "source_lang": "bn"},
    {"qid": "q002", "query": "দাম বেড়েছে চাল", "source_lang": "bn"},
    {"qid": "q003", "query": "Rohingya crisis", "source_lang": "en"},
    {"qid": "q004", "query": "Bangladesh economy", "source_lang": "en"},
    {"qid": "q005", "query": "চট্টগ্রাম বিশ্ববিদ্যালয়", "source_lang": "bn"},
    {"qid": "q006", "query": "climate change Bangladesh", "source_lang": "en"},
]

# ensure_ascii=False keeps the Bangla text readable in the JSON file.
with open("queries.json", "w", encoding="utf-8") as f:
    json.dump(queries, f, ensure_ascii=False, indent=2)

print("Saved queries.json with", len(queries), "queries")


Saved queries.json with 6 queries


## 5.2 Load queries.json

In [12]:
# Read the query set back from disk.
with open("queries.json", "r", encoding="utf-8") as queries_file:
    queries = json.loads(queries_file.read())

print("Loaded queries:", len(queries))
print("Example:", queries[0])


Loaded queries: 6
Example: {'qid': 'q001', 'query': '‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü', 'source_lang': 'bn'}


## 5.3 Run your CLIR system and export

In [17]:
import pandas as pd

RUN_PATH = "run_top10.csv"
TOP_K = 10

# Fixed column order for the run file. Passing it to DataFrame keeps the
# columns present even if no query returns a result; otherwise the frame
# would have no columns and selecting them later would raise KeyError.
RUN_COLUMNS = [
    "qid", "query", "source_lang", "translated_query", "target_lang",
    "confidence", "rank", "doc_id", "score", "title", "url",
]

rows = []

for q in queries:
    qid = q["qid"]
    query_text = q["query"]

    out = clir_search_with_conf(query_text, top_k=TOP_K)

    # One row per retrieved document; rank is 1-based.
    for rank, r in enumerate(out["results"], start=1):
        rows.append({
            "qid": qid,
            "query": out["query_original"],
            # Language as detected by the search, not the label in queries.json.
            "source_lang": out["query_language"],
            "translated_query": out["query_translated"],
            "target_lang": out["target_language"],
            "confidence": out["confidence"],
            "rank": rank,
            "doc_id": r.get("doc_id",""),
            "score": r.get("score", 0.0),
            "title": r.get("title",""),
            "url": r.get("url",""),
        })

df_run = pd.DataFrame(rows, columns=RUN_COLUMNS)
# "utf-8-sig" writes a BOM at the start of the file.
df_run.to_csv(RUN_PATH, index=False, encoding="utf-8-sig")

print("Saved:", RUN_PATH)
df_run.head(5)


Saved: run_top10.csv


Unnamed: 0,qid,query,source_lang,translated_query,target_lang,confidence,rank,doc_id,score,title,url
0,q001,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü,bn,Bangladesh Cricket,en,0.79208,1,en_000180,7.716028,bangladesh cricket: is the old culture of inte...,https://www.thedailystar.net/star-multimedia/s...
1,q001,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü,bn,Bangladesh Cricket,en,0.79208,2,en_000067,5.178473,bpl sylhet phase tickets to go on sale sunday,https://www.thedailystar.net/sports/cricket/ne...
2,q001,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü,bn,Bangladesh Cricket,en,0.79208,3,en_000178,5.0678,when will women athletes get safe space in ban...,https://www.thedailystar.net/star-multimedia/s...
3,q001,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü,bn,Bangladesh Cricket,en,0.79208,4,en_000139,4.897755,sohan captain for zim t20is,https://www.thedailystar.net/sports/cricket/ne...
4,q001,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü,bn,Bangladesh Cricket,en,0.79208,5,en_000068,4.638138,"bcb expresses solidarity with the daily star, ...",https://www.thedailystar.net/sports/cricket/ne...


## 5.4 Create a labeling template

In [18]:
QRELS_TEMPLATE = "qrels_template.csv"

# Keep only the columns a human judge needs and add an empty "relevance"
# column to be filled in by hand (1 or 0).
judging_columns = ["qid", "doc_id", "rank", "title", "url"]
df_qrels = df_run.loc[:, judging_columns].assign(relevance="")

df_qrels.to_csv(QRELS_TEMPLATE, index=False, encoding="utf-8-sig")

print("Saved:", QRELS_TEMPLATE)
print("Fill the 'relevance' column with 1 (relevant) or 0 (not relevant).")
df_qrels.head(5)


Saved: qrels_template.csv
Fill the 'relevance' column with 1 (relevant) or 0 (not relevant).


Unnamed: 0,qid,doc_id,rank,title,url,relevance
0,q001,en_000180,1,bangladesh cricket: is the old culture of inte...,https://www.thedailystar.net/star-multimedia/s...,
1,q001,en_000067,2,bpl sylhet phase tickets to go on sale sunday,https://www.thedailystar.net/sports/cricket/ne...,
2,q001,en_000178,3,when will women athletes get safe space in ban...,https://www.thedailystar.net/star-multimedia/s...,
3,q001,en_000139,4,sohan captain for zim t20is,https://www.thedailystar.net/sports/cricket/ne...,
4,q001,en_000068,5,"bcb expresses solidarity with the daily star, ...",https://www.thedailystar.net/sports/cricket/ne...,
