In [230]:
import fitz  # PyMuPDF
import re
from langdetect import detect
from typing import List,Tuple
import pandas as pd
from IPython.display import display

In [231]:
# Configuration
PDF_PATH = "../data/HSC26-Bangla1st-Paper.pdf"     #PDF Path
COLLECTION_NAME = "bangla_book"
CHUNK_SIZE = 300  # words
CHUNK_OVERLAP = 100  # words
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

print(f"PDF Path: {PDF_PATH}")
print(f"Chunk Size: {CHUNK_SIZE} words")
print(f"Overlap: {CHUNK_OVERLAP} words")
print(f"Model: {MODEL_NAME}")

PDF Path: ../data/HSC26-Bangla1st-Paper.pdf
Chunk Size: 300 words
Overlap: 100 words
Model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


In [232]:
# Text cleaning functions
def clean_extracted_text(text: str) -> str:
    """Clean and normalize extracted text"""
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    
    # Remove page numbers and headers/footers
    lines = text.split('\n')
    cleaned_lines = []
    
    for line in lines:
        line = line.strip()
        # Skip very short lines that might be page numbers or artifacts
        if len(line) > 3 and not line.isdigit():
            cleaned_lines.append(line)
    
    text = ' '.join(cleaned_lines)
    
    # Normalize Bangla punctuation
    text = re.sub(r'[‡•§]{2,}', '‡•§', text)
    
    return text.strip()      

In [233]:
def detect_language_segments(text: str) -> List[Tuple[str, str]]:
    """Detect language segments in mixed Bangla-English text"""
    # Split by sentences using both Bangla and English sentence endings
    sentences = re.split(r'[‡•§.!?]+', text)
    segments = []
    
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) < 5:  # Skip very short segments
            continue
            
        try:
            # Detect language
            lang = detect(sentence)
            # Map language codes
            if lang == 'bn':
                lang = 'bangla'
            elif lang == 'en':
                lang = 'english'
            else:
                lang = 'mixed'
                
            segments.append((sentence, lang))
        except:
            # If detection fails, mark as mixed
            segments.append((sentence, 'mixed'))
    
    return segments


In [234]:
# Extract text from PDF
print(f"Extracting text from: {PDF_PATH}")

doc = fitz.open(PDF_PATH)
pages_data = []

for page_num in range(len(doc)):
    page = doc.load_page(page_num)
    text = page.get_text()
    
    # Clean extracted text
    text = clean_extracted_text(text)
    
    if text.strip():  # Only add non-empty pages
        pages_data.append({
            'page_number': page_num + 1,
            'text': text,
            'word_count': len(text.split())
        })

doc.close()

print(f"Extracted text from {len(pages_data)} pages")

Extracting text from: ../data/HSC26-Bangla1st-Paper.pdf
Extracted text from 49 pages


In [235]:
# Display sample page
if pages_data:
    sample_page = pages_data[0]
    print(f"\nSample from Page {sample_page['page_number']} ({sample_page['word_count']} words):")
    print(f"'{sample_page['text'][:200]}...'")


Sample from Page 1 (14 words):
'‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡¶∏‡¶Æ‡ßç‡¶™‡¶∞‡ßç‡¶ï‡¶ø‡¶§ ‡¶Ø‡ßá‡¶ï‡¶ï‡¶æ‡¶ï‡¶®‡¶æ ‡¶ú‡¶ø‡¶ú‡ßç‡¶û‡¶æ‡¶æ‡¶∏‡¶æ , ‡¶Ö‡¶™‡¶∞‡¶ø‡¶∞‡¶ø‡¶§‡¶æ ‡¶Ü‡¶≤ ‡¶æ‡¶ø‡¶Ø ‡¶∞‡¶ø‡¶∑‡ßü ‡¶ø‡¶æ‡¶æ‡¶Ç ‡¶æ ‡ßß‡¶Æ ‡¶™‡¶§‡ßç‡¶∞...'


In [236]:
# Analyze extracted content
total_words = sum(page['word_count'] for page in pages_data)
avg_words_per_page = total_words / len(pages_data) if pages_data else 0

print(f"Content Analysis:")
print(f"   Total Pages: {len(pages_data)}")
print(f"   Total Words: {total_words}")
print(f"   Average Words per Page: {avg_words_per_page:.1f}")

# Create a quick visualization
page_stats = pd.DataFrame(pages_data)
display(page_stats.head(10))

Content Analysis:
   Total Pages: 49
   Total Words: 7261
   Average Words per Page: 148.2


Unnamed: 0,page_number,text,word_count
0,1,"‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡¶∏‡¶Æ‡ßç‡¶™‡¶∞‡ßç‡¶ï‡¶ø‡¶§ ‡¶Ø‡ßá‡¶ï‡¶ï‡¶æ‡¶ï‡¶®‡¶æ ‡¶ú‡¶ø‡¶ú‡ßç‡¶û‡¶æ‡¶æ‡¶∏‡¶æ , ‡¶Ö‡¶™...",14
1,2,‡ßß‡•§ ‡¶Ö‡¶®‡ßÅ‡¶™‡¶≤‡ßá‡¶ø ‡¶ø‡¶æ‡¶ø‡¶æ ‡¶ï‡ßÄ ‡¶ï‡¶≤‡¶ø ‡¶ú‡ßÄ‡¶∞‡¶ø‡¶ï‡¶æ ‡¶∞‡¶®‡¶ø‡¶¨‡¶æ‡¶π ‡¶ï‡¶ø‡¶≤‡¶§‡¶®? ‡¶ï)...,202
2,3,‡¶∂‡¶¨‡ßç‡¶¶‡¶æ‡¶∞‡ßç‡¶¨ ‡¶ì ‡¶ü‡ßÄ‡¶ï‡¶æ ‡ßá‡ßÇ ‡¶∂‡¶¨‡ßç‡¶¶ ‡¶∂‡¶≤‡¶¨‡ßç‡¶¶‡¶ø ‡¶Ö‡¶∞‡ßç‡¶¨ ‡¶ì ‡¶ø‡¶Ø‡¶æ‡¶ñ‡ßç‡¶Ø‡¶æ ...,271
3,4,‡¶∂‡¶¨‡ßç‡¶¶‡¶æ‡¶∞‡ßç‡¶¨ ‡¶ì ‡¶ü‡ßÄ‡¶ï‡¶æ ‡ßá‡ßÇ ‡¶∂‡¶¨‡ßç‡¶¶ ‡¶∂‡¶≤‡¶¨‡ßç‡¶¶‡¶ø ‡¶Ö‡¶∞‡ßç‡¶¨ ‡¶ì ‡¶ø‡¶Ø‡¶æ‡¶ñ‡ßç‡¶Ø‡¶æ ...,210
4,5,‡¶∂‡¶¨‡ßç‡¶¶‡¶æ‡¶∞‡ßç‡¶¨ ‡¶ì ‡¶ü‡ßÄ‡¶ï‡¶æ ‡ßá‡ßÇ ‡¶∂‡¶¨‡ßç‡¶¶ ‡¶∂‡¶≤‡¶¨‡ßç‡¶¶‡¶ø ‡¶Ö‡¶∞‡ßç‡¶¨ ‡¶ì ‡¶ø‡¶Ø‡¶æ‡¶ñ‡ßç‡¶Ø‡¶æ ...,72
5,6,‡ßá‡ßÇ ‡¶ó‡ßç‡ßá ‡¶Ü‡¶ø‡¶Ü‡¶Æ‡¶æ‡¶ø‡¶¨‡ßç ‡¶∏‡¶∏‡¶æ‡¶§‡¶æ‡¶ø‡¶Æ‡¶æ‡¶§‡ßç‡¶∞‡•§‡¶è‡¶ø‡ßÄ‡¶¨‡ßç‡¶®‡¶ü‡¶æ‡¶®‡¶æ‡¶¶‡¶¶‡¶ï‡¶ò‡¶ø‡¶Ø‡¶ø‡¶∞...,112
6,7,‡¶Ü‡¶Æ‡¶æ‡¶ø‡¶π‡¶∞‡ßç‡¶ø‡¶ø‡¶ï‡¶æ‡¶®‡¶™‡ßÅ‡¶ï‡¶ø‡¶ï‡¶æ‡¶ø‡¶ï‡¶ï‡¶ø‡•§‡¶Ø‡¶∏‡¶ø‡ßÅ‡¶ü‡¶ø‡¶ï‡¶§‡¶ï‡¶ú‡¶≤‡¶ï‡¶æ‡¶§‡¶æ ‡¶Ü‡¶∞‡ßç‡¶∏ ‡¶æ‡¶Ü...,120
7,8,‚Äú‡¶Æ‡¶®‡ßç‡¶¶‡¶® ‡¶Ø‡¶π! ‡¶ñ‡¶æ‡¶ü‡¶ø‡¶Ø‡¶∏‡¶æ‡¶®‡¶æ ‡¶¨‡ßç‡¶ï‡¶ü!‚Äù ‡¶∞‡ßç‡¶¨‡ßç‡¶®‡ßÅ‡¶¶‡¶æ‡¶¶‡¶æ‡¶ø‡¶≠‡¶æ‡¶∑‡¶æ‡¶ü‡¶æ‡¶Ö...,143
8,9,‡¶Æ‡¶æ‡¶Æ‡¶æ‡¶∞‡ßç‡¶¨‡ßç‡¶¨‡ßç‡¶æ‡¶π-‡¶¨‡ßç‡¶æ‡¶∞‡ßç‡¶º‡ßá‡¶ï‡¶§ ‡ßÅ‡¶∞‡ßç‡¶ï ‡¶æ‡¶ñ‡ßÅ‡¶∞‡ßç‡¶ø‡¶π‡¶á‡¶ï‡¶≤‡¶®‡¶®‡¶æ‡•§‡¶è‡¶ï‡¶ï‡¶Ø...,121
9,10,‡¶è‡¶á‡¶¨‡ßç‡¶ú‡¶≤ ‡¶æ‡¶Ø‡ßá‡¶Æ‡¶ï‡¶ø‡¶Æ‡ßÅ‡¶ñ‡¶æ‡¶Ø‡¶Æ‡¶æ‡¶ü‡¶æ‡¶è‡¶ï‡¶ñ‡¶æ‡¶®‡¶æ‡¶¨‡ßç‡¶æ‡¶≤‡¶æ ‡¶è‡¶ï‡¶ü‡ßÅ‡¶ö‡¶æ‡¶™‡¶∞‡ßç‡¶¶ ‡¶æ...,104


In [237]:
pages_data[1]['text']


"‡ßß‡•§ ‡¶Ö‡¶®‡ßÅ‡¶™‡¶≤‡ßá‡¶ø ‡¶ø‡¶æ‡¶ø‡¶æ ‡¶ï‡ßÄ ‡¶ï‡¶≤‡¶ø ‡¶ú‡ßÄ‡¶∞‡¶ø‡¶ï‡¶æ ‡¶∞‡¶®‡¶ø‡¶¨‡¶æ‡¶π ‡¶ï‡¶ø‡¶≤‡¶§‡¶®? ‡¶ï) ‡¶°‡¶æ‡¶ï‡ßç‡¶§‡¶æ‡¶∞‡ßç‡¶ø ‡¶ñ) ‡¶ì‡¶ï‡¶æ‡¶≤‡¶∞‡ßç‡¶§ ‡¶ó) ‡¶Æ‡¶æ‡¶∏‡ßç‡¶ü‡¶æ‡¶∞‡ßç‡¶ø ‡¶ò) ‡¶¨‡ßç‡¶Ø‡¶¨‡ßç‡¶∏‡¶æ ‡ß®‡•§ ‡ßá‡¶æ‡ßá‡¶æ‡¶≤‡¶ï ‡¶≠‡¶æ‡¶ó‡ßç‡¶Ø ‡¶¶‡ßá‡¶ø‡¶§‡¶æ‡¶ø ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶è‡¶≤‡¶ú‡¶®‡ßç‡¶ü ‡¶ø ‡¶æ‡¶ø ‡¶ï‡¶æ‡¶ø‡¶£, ‡¶§‡¶æ‡¶ø- ‡¶ï) ‡¶™‡ßç‡¶∞‡¶∞‡ßç‡¶§‡¶™‡¶ú‡¶ø ‡¶ñ) ‡¶™‡ßç‡¶∞‡¶≠‡¶æ‡¶¨‡ßç ‡¶ó) ‡¶∞‡ßç‡¶¨‡ßç‡¶ö‡¶ï‡ßç‡¶∑‡¶£‡¶§‡¶æ ‡¶ò) ‡¶ï‡ßÇ‡¶ü ‡¶¨‡ßç‡ßÅ‡¶∞‡ßç‡¶ø ‡¶∞‡ßç‡¶®‡¶ï‡¶ö‡¶ø ‡¶Ö‡¶®‡ßÅ‡¶ï‡ßá‡¶¶‡¶ü‡¶ø ‡¶™‡¶ï‡¶º‡ßá ‡ß© ‡¶ì ‡ß™ ‡¶∏‡¶Ç‡¶ñ‡¶Ø‡¶ï ‡¶™‡ßç‡¶∞‡¶ï‡ßá‡¶ø ‡¶â‡¶ø‡¶ø ‡¶¶‡¶æ‡¶ì‡•§ ‡¶∞‡ßç‡¶™‡¶§‡ßÉ‡¶π‡ßÄ‡¶® ‡¶¶‡ßÄ‡¶™‡ßÅ‡¶ø ‡¶ö‡¶æ‡¶ö‡¶æ‡¶á ‡¶∞‡ßç‡¶ø‡¶ï‡¶≤‡¶® ‡¶™‡¶∞‡ßç‡¶ø‡¶¨‡ßç‡¶æ‡¶ï‡¶ø‡¶ø ‡¶ï‡¶§‡¶ø‡¶æ‡•§ ‡¶¶‡ßÄ‡¶™‡ßÅ ‡¶∞‡ßç‡¶ø‡¶ú‡¶ï‡ßç‡¶∑‡¶§ ‡¶π‡¶ï‡¶≤‡¶ì ‡¶§‡¶æ‡¶ø ‡¶∞‡ßç‡¶∏‡¶ø‡¶æ‡¶®‡ßç‡¶§ ‡¶Ø‡¶®‡¶ì ‡¶æ‡¶ø ‡¶ï‡ßç‡¶∑‡¶Æ‡¶§‡¶æ ‡¶∞‡ßç‡¶ø‡¶≤ ‡¶®‡¶æ‡•§ ‡¶ö‡¶æ‡¶ö‡¶æ ‡¶§‡¶æ‡¶ø ‡¶∞‡ßç‡¶¨‡ßç‡¶ï ‡¶ø ‡¶â‡¶ï‡¶¶‡¶Ø‡¶æ‡¶ó ‡¶∞‡ßç‡¶®‡¶ï‡¶≤‡¶ì ‡¶Ø‡ßá‡ßå‡¶§‡ßÅ‡¶ï ‡¶

Extracted text is severly corrupted and PyMUDF library fails to recognize the texts correctly. So, better approach is using ocr and extract using Pytesseract

In [238]:
# Import libraries
from pdf2image import convert_from_path
import cv2
import numpy as np
import re
import pandas as pd
from IPython.display import display
import pytesseract

In [239]:
TESS_LANG = "ben"
DPI = 300

In [240]:
# OCR-based text extraction functions
def preprocess_image_for_ocr(image):
    """Preprocess image for better OCR results"""
    # Convert PIL image to numpy array
    img = np.array(image)
    
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    
    # Apply slight Gaussian blur to reduce noise
    blurred = cv2.GaussianBlur(gray, (1, 1), 0)
    
    # Increase contrast
    contrast = cv2.convertScaleAbs(blurred, alpha=1.2, beta=10)
    
    return contrast


In [241]:
def extract_text_tesseract(image: Image.Image) -> str:
    """Extract Bangla text using Tesseract"""
    return pytesseract.image_to_string(image, lang=TESS_LANG)

In [242]:
def clean_bangla_text(text: str) -> str:
    """Clean Bangla OCR output"""
    text = re.sub(r'[^\u0980-\u09FF\s‡•§,!?]', '', text)  # Keep Bangla and punctuation
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [243]:
try:
    # Convert PDF pages to images
    images = convert_from_path(PDF_PATH, dpi=DPI) 
    print(f"Converted {len(images)} pages to images")
except Exception as e:
    print(f"Error converting PDF: {e}")
    raise

Converted 49 pages to images


In [244]:
# OCR all pages
pages_data = []
for i, image in enumerate(images):
    print(f"üîç Processing page {i+1}/{len(images)}...", end=" ")
    try:
        raw_text = extract_text_tesseract(image)
        cleaned_text = clean_bangla_text(raw_text)
        if cleaned_text:
            pages_data.append({
                'page_number': i + 1,
                'text': cleaned_text,
                'word_count': len(cleaned_text.split()),
                'raw_text': raw_text[:300] 
            })
            print(f"‚úÖ ({len(cleaned_text.split())} words)")
        else:
            print("‚ö†Ô∏è No text found")
    except Exception as e:
        print(f"‚ùå Error: {e}")

print(f"\n‚úÖ Finished OCR on {len(pages_data)} pages")

üîç Processing page 1/49... ‚úÖ (13 words)
üîç Processing page 2/49... ‚úÖ (214 words)
üîç Processing page 3/49... ‚úÖ (163 words)
üîç Processing page 4/49... ‚úÖ (145 words)
üîç Processing page 5/49... ‚úÖ (42 words)
üîç Processing page 6/49... ‚úÖ (318 words)
üîç Processing page 7/49... ‚úÖ (294 words)
üîç Processing page 8/49... ‚úÖ (404 words)
üîç Processing page 9/49... ‚úÖ (264 words)
üîç Processing page 10/49... ‚úÖ (211 words)
üîç Processing page 11/49... ‚úÖ (225 words)
üîç Processing page 12/49... ‚úÖ (372 words)
üîç Processing page 13/49... ‚úÖ (291 words)
üîç Processing page 14/49... ‚úÖ (304 words)
üîç Processing page 15/49... ‚úÖ (347 words)
üîç Processing page 16/49... ‚úÖ (148 words)
üîç Processing page 17/49... ‚úÖ (207 words)
üîç Processing page 18/49... ‚úÖ (219 words)
üîç Processing page 19/49... ‚úÖ (229 words)
üîç Processing page 20/49... ‚úÖ (195 words)
üîç Processing page 21/49... ‚úÖ (326 words)
üîç Processing page 22/49... ‚úÖ (243 words)

In [245]:
# Summary Stats
total_words = sum(page['word_count'] for page in pages_data)
avg_words = total_words / len(pages_data) if pages_data else 0
print(f"\n Summary:")
print(f"   Pages processed: {len(pages_data)}")
print(f"   Total words: {total_words}")
print(f"   Average words per page: {avg_words:.1f}")


 Summary:
   Pages processed: 49
   Total words: 11908
   Average words per page: 243.0


In [246]:
# Display sample page
if pages_data:
    sample = pages_data[0]
    print(f"\n Sample Page {sample['page_number']}:")
    print(f"Raw OCR: {sample['raw_text'][:200]}")
    print(f"Cleaned: {sample['text'][:200]}...")



 Sample Page 1:
Raw OCR: 10940759

‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‚Äù

‡¶π‡¶ø
‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ * ‡¶á‡¶Ç‡¶∞‡ßá‡¶ú‡¶ø * ‡¶Ü‡¶á‡¶∏‡¶ø‡¶ü‡¶ø

‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡¶∏‡¶Æ‡ßç‡¶™‡¶∞‡ßç‡¶ï‡¶ø‡¶§ ‡¶Ø‡ßá‡¶ï‡ßã‡¶®‡ßã ‡¶ú‡¶ø‡¶ú‡ßç‡¶û‡¶æ‡¶∏‡¶æ‡¶Ø‡¶º,

‡¶ï‡¶≤‡¶ï‡¶∞‡ßã ‡ß¨‡ß¨ 76919

Cleaned: ‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü ‡¶π‡¶ø ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶á‡¶Ç‡¶∞‡ßá‡¶ú‡¶ø ‡¶Ü‡¶á‡¶∏‡¶ø‡¶ü‡¶ø ‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡¶∏‡¶Æ‡ßç‡¶™‡¶∞‡ßç‡¶ï‡¶ø‡¶§ ‡¶Ø‡ßá‡¶ï‡ßã‡¶®‡ßã ‡¶ú‡¶ø‡¶ú‡ßç‡¶û‡¶æ‡¶∏‡¶æ‡¶Ø‡¶º, ‡¶ï‡¶≤‡¶ï‡¶∞‡ßã ‡ß¨‡ß¨...


In [247]:
# Create DataFrame for analysis
df = pd.DataFrame([{
    'page_number': p['page_number'],
    'word_count': p['word_count'],
    'sample_text': p['text'][:100] + "..." if len(p['text']) > 100 else p['text']
} for p in pages_data])

In [248]:
# Display table
print("\n First 10 Pages:")
display(df.head(10))


 First 10 Pages:


Unnamed: 0,page_number,word_count,sample_text
0,1,13,‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü ‡¶π‡¶ø ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶á‡¶Ç‡¶∞‡ßá‡¶ú‡¶ø ‡¶Ü‡¶á‡¶∏‡¶ø‡¶ü‡¶ø ‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø...
1,2,214,‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡¶®‡¶ø‡¶Æ‡ßç‡¶®‡¶¨‡¶ø‡¶§‡ßç‡¶§ ‡¶¨‡ßç‡¶Ø‡¶ï‡ßç‡¶§‡¶ø‡¶∞ ‡¶π‡¶†‡¶æ‡ßé ...
2,3,163,‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡¶ó‡¶≤‡ßç‡¶™‡ßá‡¶∞ ‡¶ï‡¶•‡¶ï ‡¶ö‡¶∞‡¶ø‡¶§‡ßç‡¶∞ ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞...
3,4,145,‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡¶¨‡¶ø‡¶ß‡¶æ‡¶®‡¶ï‡¶∞‡ßç‡¶§‡¶æ ‡¶¨‡¶æ ‡¶∂‡¶æ‡¶∏‡ßç‡¶§‡ßç‡¶∞‡¶™‡ßç‡¶∞‡¶£...
4,5,42,‡¶≤‡¶® ‡ßØ ‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡¶Æ‡¶æ‡¶ü‡¶ø‡¶∞ ‡¶ñ‡ßã‡¶≤‡ßá‡¶∞ ‡¶¶‡ßÅ‡¶™‡¶æ‡¶∂‡ßá ‡¶ö‡¶æ‡¶Æ‡¶°‡¶º‡¶æ ‡¶≤‡¶æ...
5,6,318,‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡¶Æ‡ßÇ‡¶≤ ‡¶Ü‡¶≤‡ßã‡¶ö‡ßç‡¶Ø ‡¶¨‡¶ø‡¶∑‡¶Ø‡¶º ‡¶Æ‡ßÇ‡¶≤ ‡¶ó‡¶≤‡ßç‡¶™...
6,7,294,‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶π‡¶∞‡¶ø‡¶∂ ‡¶ï‡¶æ‡¶®‡¶™‡ßÅ‡¶∞‡ßá ‡¶ï‡¶æ‡¶ú ‡¶ï‡¶∞‡ßá‡•§ ‡¶∏‡ßá ‡¶õ‡ßÅ‡¶ü‡¶ø‡¶§‡ßá ‡¶ï‡¶≤‡¶ø‡¶ï‡¶æ‡¶§‡¶æ‡¶Ø‡¶º...
7,8,404,‡¶Æ‡¶®‡ßç‡¶¶ ‡¶®‡¶Ø‡¶º ‡¶π‡ßá! ‡¶ñ‡¶æ‡¶ü‡¶ø ‡¶∏‡ßã‡¶®‡¶æ ‡¶¨‡¶ü‡ßá! ‡¶¨‡¶ø‡¶®‡ßÅ‡¶¶‡¶æ‡¶¶‡¶æ‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶ü‡¶æ ‡¶Ö...
8,9,264,‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡¶Æ‡¶æ‡¶Æ‡¶æ ‡¶¨‡¶ø‡¶¨‡¶æ‡¶π‡¶¨‡¶æ‡¶°‡¶º‡¶ø‡¶§‡ßá ‡¶¢‡ßÅ‡¶ï‡¶ø‡¶Ø‡¶º‡¶æ...
9,10,211,", ‡¶è‡¶á ‡¶¨‡¶≤‡¶ø‡¶Ø‡¶º‡¶æ ‡¶Ø‡ßá ‡¶Æ‡¶ï‡¶∞‡¶Æ‡ßÅ‡¶ñ‡¶æ ‡¶Æ‡ßã‡¶ü‡¶æ ‡¶è‡¶ï‡¶ñ‡¶æ‡¶®‡¶æ ‡¶¨‡¶æ‡¶≤‡¶æ‡¶Ø‡¶º ‡¶è‡¶ï‡¶ü‡ßÅ..."


In [249]:
# Language Classification
bangla_pages = 0
english_pages = 0
mixed_pages = 0

for page in pages_data:
    text = page['text']
    bangla_chars = len(re.findall(r'[‡¶Ö-‡¶î‡¶ã‡¶å‡¶è‡¶ê‡¶ì‡¶î‡¶ï-‡¶π‡ßé‡ßú‡ßù‡ßü‡ß¶-‡ßØ]', text))
    english_chars = len(re.findall(r'[a-zA-Z]', text))
    total_chars = len(text)

    if total_chars > 0:
        bangla_ratio = bangla_chars / total_chars
        english_ratio = english_chars / total_chars
        if bangla_ratio > 0.6:
            bangla_pages += 1
        elif english_ratio > 0.6:
            english_pages += 1
        else:
            mixed_pages += 1

print(f"\n Language Classification:")
print(f"   Primarily Bangla pages: {bangla_pages}")
print(f"   Primarily English pages: {english_pages}")
print(f"   Mixed content pages: {mixed_pages}")


 Language Classification:
   Primarily Bangla pages: 0
   Primarily English pages: 0
   Mixed content pages: 49


In [250]:
import cohere
import numpy as np
import pandas as pd
from tqdm import tqdm
import os 
import faiss 


In [301]:
from dotenv import load_dotenv

load_dotenv()

# Access the variables
hf_token = os.getenv("HF_TOKEN")
cohere_token = os.getenv("cohere_api_key")

print("HuggingFace Token:", hf_token[:5] + "..." if hf_token else "Not found")
print("Cohere Token:", cohere_token[:5] + "..." if cohere_token else "Not found")


HuggingFace Token: hf_al...
Cohere Token: d2R8c...


In [252]:
chunks = []
for page in pages_data:
    words = page['text'].split()
    for i in range(0, len(words), CHUNK_SIZE - CHUNK_OVERLAP):
        chunk_words = words[i : i + CHUNK_SIZE]
        chunk_text = " ".join(chunk_words)
        chunks.append({
            'text': chunk_text,
            'page_number': page['page_number']
        })

In [253]:
len(chunks)

87

In [254]:

co = cohere.Client(api_key=cohere_token)
# Choose the multilingual model best for Bangla + English
EMBED_MODEL = "embed-multilingual-v3.0"

In [255]:
def embed_texts(texts, input_type="search_document"):
    response = co.embed(texts=texts, model=EMBED_MODEL, input_type=input_type)
    return np.array(response.embeddings) 

In [256]:
# Embedding chunk texts
chunk_texts = [c['text'] for c in chunks]
chunk_embeddings = embed_texts(chunk_texts, input_type="search_document")


In [257]:
print(f"Embedded {len(chunk_texts)} chunks with shape: {chunk_embeddings.shape}")

Embedded 87 chunks with shape: (87, 1024)


In [258]:
chunk_embeds = np.array(chunk_embeddings) 
print(chunk_embeds.shape)

(87, 1024)


In [259]:
dim = chunk_embeds.shape[1]
index = faiss.IndexFlatL2(dim) 
print(index.is_trained)
index.add(np.float32(chunk_embeds))  # Ensure embeddings are float32

True


In [260]:
def search(query, top_k=5):
    query_emb = embed_texts([query], input_type="search_document")[0]
    D, I = index.search(np.float32([query_emb]), top_k)
    texts_np = np.array(chunk_texts)
    results = pd.DataFrame(data ={
        'texts' : texts_np[I[0]],
        'distances': D[0]
        })
    print(f"Search results for query '{query}':")
    return results

In [261]:
query = "‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?"
results = search(query, top_k=5)
results 

Search results for query '‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?':


Unnamed: 0,texts,distances
0,‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡ß©‡ß¶‡•§ ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶Æ‡¶§‡ßã ‡¶Ö‡¶ï‡ßç‡¶∑‡¶Æ ‡¶¶‡ßÅ‡¶®‡¶ø‡¶Ø‡¶º...,0.87447
1,‡¶ú‡¶ø‡¶ú‡ßç‡¶û‡¶æ‡¶∏‡¶æ ‡¶ï‡¶∞‡¶≤? ‡¶ï ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ ‡¶ñ ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶Æ‡¶æ ‡¶ó ‡¶ú‡ßá‡¶®‡¶æ‡¶∞‡ßá‡¶≤ ‡¶ò...,0.874944
2,‡¶≤‡ßÅ‡¶≤ ‡¶Ü‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡¶∏‡ßÉ‡¶ú‡¶®‡¶∂‡ßÄ‡¶≤ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶® ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶® ‡ßß ‡¶ï‡¶®...,0.890538
3,‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡ß¨‡ßØ‡•§ ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶ô‡ßç‡¶ó‡ßá ‡¶™‡¶û‡ßç‡¶ö‡¶∂‡¶∞‡ßá‡¶∞ ‡¶¨‡¶ø...,0.895162
4,‡¶¶‡ßá‡¶Ø‡¶º‡•§ ‡¶ö‡ßã‡¶ñ‡ßá‡¶∞ ‡¶∏‡¶æ‡¶Æ‡¶®‡ßá ‡¶Ö‡¶®‡ßç‡¶Ø‡¶æ‡¶Ø‡¶º ‡¶¶‡ßá‡¶ñ‡¶≤‡ßá ‡¶á‡¶ö‡ßç‡¶õ‡¶æ ‡¶•‡¶æ‡¶ï‡¶æ ‡¶∏‡¶§‡ßç...,0.915209


In [262]:
query = "‡¶ï‡¶æ‡¶ï‡ßá ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶ó‡ßç‡¶Ø ‡¶¶‡ßá‡¶¨‡¶§‡¶æ ‡¶¨‡¶≤‡ßá ‡¶â‡¶≤‡ßç‡¶≤‡ßá‡¶ñ ‡¶ï‡¶∞‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?"
results = search(query, top_k=5)
results 

Search results for query '‡¶ï‡¶æ‡¶ï‡ßá ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶ó‡ßç‡¶Ø ‡¶¶‡ßá‡¶¨‡¶§‡¶æ ‡¶¨‡¶≤‡ßá ‡¶â‡¶≤‡ßç‡¶≤‡ßá‡¶ñ ‡¶ï‡¶∞‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?':


Unnamed: 0,texts,distances
0,‡¶Ü‡¶®‡¶≤ ‡¶™‡¶æ‡¶†‡ßç‡¶Ø‡¶™‡ßÅ‡¶∏‡ßç‡¶§‡¶ï‡ßá‡¶∞ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶® ‡¶¨‡¶π‡ßÅ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶®‡ßÄ ‡ßß‡•§ ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ...,0.70699
1,‡¶Ø‡ßå‡¶§‡ßÅ‡¶ï‡¶≤‡ßã‡¶≠‡ßÄ ‡¶ö‡¶∞‡¶ø‡¶§‡ßç‡¶∞‡•§ ‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶¨‡¶ø‡¶Ø‡¶º‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶è‡¶ï‡¶ü...,0.881034
2,‡¶≤‡¶® ‡ßØ ‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ß®‡ß™‡•§ ‡¶Ö‡¶™‡¶∞‡¶ø‡¶ö‡¶ø‡¶§‡¶æ ‡¶ó‡¶≤‡ßç‡¶™‡ßá ‡¶ï‡ßã‡¶® ‡¶¶‡ßç‡¶¨‡ßÄ‡¶™...,0.889718
3,‡¶ï ‡¶™‡¶£‡ßá‡¶∞ ‡¶Ö‡¶ô‡ßç‡¶ï ‡¶∏‡¶æ‡¶Æ‡¶æ‡¶®‡ßç‡¶Ø ‡¶¨‡¶≤‡ßá ‡¶ñ ‡¶Æ‡ßá‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ ‡¶ï‡¶Æ ‡¶¨‡¶≤‡ßá...,0.906082
4,‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡¶ó‡¶≤‡ßç‡¶™‡ßá‡¶∞ ‡¶ï‡¶•‡¶ï ‡¶ö‡¶∞‡¶ø‡¶§‡ßç‡¶∞ ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞...,0.918622


In [263]:
query = "‡¶¨‡¶ø‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∏‡¶Æ‡¶Ø‡¶º ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶ï‡¶§ ‡¶õ‡¶ø‡¶≤?"
results = search(query, top_k=5)
results 

Search results for query '‡¶¨‡¶ø‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∏‡¶Æ‡¶Ø‡¶º ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶ï‡¶§ ‡¶õ‡¶ø‡¶≤?':


Unnamed: 0,texts,distances
0,‡¶¨‡¶ø‡¶®‡ßÅ‡¶¶‡¶æ‡¶∞ ‡¶ó ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡ß´‡ß©‡•§ ‡¶è‡¶ï‡¶¨‡¶æ‡¶∞ ‡¶Æ‡¶æ‡¶Æ‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶õ‡ßá ‡¶ï‡¶•‡¶æ‡¶ü‡¶æ ‡¶™...,0.493609
1,‡¶≤‡¶® ‡ßØ ‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ß©‡ßØ‡•§ ‡¶è‡¶ñ‡¶æ‡¶®‡ßá ‡¶ú‡¶æ‡¶Ø‡¶º‡¶ó‡¶æ ‡¶Ü‡¶õ‡ßá ‡¶â‡¶ï‡ßç‡¶§‡¶ø‡¶ü‡¶ø...,0.712034
2,‡¶ì ‡ß´ ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶ï‡¶§ ‡¶¨‡¶õ‡¶∞? ‡¶ï ‡¶™‡¶Å‡¶ö‡¶ø‡¶∂ ‡¶ñ ‡¶õ‡¶æ‡¶¨‡¶ø‡¶¨‡¶ø‡¶∂ ‡¶ó ‡¶∏...,0.816628
3,‡¶Æ‡ßá‡¶Ø‡¶º‡ßá‡¶∞ ‡¶ú‡ßÄ‡¶¨‡¶® ‡¶¨‡¶æ ‡¶≠‡¶¨‡¶ø‡¶∑‡ßç‡¶Ø‡ßé ‡¶∂‡¶ô‡ßç‡¶ï‡¶æ‡¶Æ‡ßÅ‡¶ï‡ßç‡¶§ ‡¶∞‡¶æ‡¶ñ‡¶æ‡¶∞ ‡¶®‡¶ø‡¶Æ‡¶ø‡¶§‡ßç...,0.845232
4,‡¶Æ‡¶®‡ßç‡¶¶ ‡¶®‡¶Ø‡¶º ‡¶π‡ßá! ‡¶ñ‡¶æ‡¶ü‡¶ø ‡¶∏‡ßã‡¶®‡¶æ ‡¶¨‡¶ü‡ßá! ‡¶¨‡¶ø‡¶®‡ßÅ‡¶¶‡¶æ‡¶¶‡¶æ‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶ü‡¶æ ‡¶Ö...,0.860326


In [264]:
from rank_bm25 import BM25Okapi 
from sklearn.feature_extraction import _stop_words
import string 

In [265]:
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.split():
        token = token.strip(string.punctuation)
        if token and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token.lower())
    return tokenized_doc

In [266]:
tokenized_corpus = []
for passage in tqdm(chunk_texts, desc="Tokenizing corpus"):
    tokenized_corpus.append(bm25_tokenizer(passage))

bm25 = BM25Okapi(tokenized_corpus)

Tokenizing corpus: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 87/87 [00:00<00:00, 24281.64it/s]


In [267]:
def keyword_search(query, top_k=5,num_candidates=15):
    print("Input Query:", query)
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
    bm25_hits = [{ 'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    print(f"Top {top_k} results for query '{query}':")
    for hit in bm25_hits[:top_k]:
        print(f"  - ID: {hit['corpus_id']}, Score: {hit['score']:.4f}, Text: {chunk_texts[hit['corpus_id']][:100]}...")

In [268]:
keyword_search("‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?", top_k=5, num_candidates=15)
keyword_search("‡¶ï‡¶æ‡¶ï‡ßá ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶ó‡ßç‡¶Ø ‡¶¶‡ßá‡¶¨‡¶§‡¶æ ‡¶¨‡¶≤‡ßá ‡¶â‡¶≤‡ßç‡¶≤‡ßá‡¶ñ ‡¶ï‡¶∞‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?", top_k=5, num_candidates=15)
keyword_search("‡¶¨‡¶ø‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∏‡¶Æ‡¶Ø‡¶º ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶ï‡¶§ ‡¶õ‡¶ø‡¶≤?", top_k=5, num_candidates=15)

Input Query: ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?
Top 5 results for query '‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?':
  - ID: 61, Score: 5.7928, Text: ‡¶ó‡¶∞‡ßÅ‡¶∞ ‡¶ó‡¶æ‡¶°‡¶º‡¶ø ‡¶ó ‡¶Æ‡ßã‡¶ü‡¶∞ ‡¶ó‡¶æ‡¶°‡¶º‡¶ø ‡ßØ ‡¶ò ‡¶ò‡ßã‡¶°‡¶º‡¶æ‡¶∞ ‡¶ó‡¶æ‡¶°‡¶º‡¶ø ‡ß©‡ßÆ‡•§ ‡¶Ö‡¶®‡ßç‡¶®‡¶™‡ßÇ‡¶∞‡ßç‡¶£‡¶æ‡¶∞ ‡¶ï‡ßã‡¶≤‡ßá ‡¶ó‡¶ú‡¶æ‡¶®‡¶®‡ßá‡¶∞ ‡¶õ‡ßã‡¶ü ‡¶≠‡¶æ‡¶á‡¶ü‡¶ø ‡¶è‡¶ñ‡¶æ‡¶®‡ßá ‡¶õ‡ßã‡¶ü ‡¶≠‡¶æ‡¶á‡¶ü‡¶ø ‡¶¨‡¶≤‡¶§‡ßá...
  - ID: 60, Score: 4.9513, Text: ‡¶≤‡¶® ‡ßØ ‡¶Ö‡¶®‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ß®‡ß™‡•§ ‡¶Ö‡¶™‡¶∞‡¶ø‡¶ö‡¶ø‡¶§‡¶æ ‡¶ó‡¶≤‡ßç‡¶™‡ßá ‡¶ï‡ßã‡¶® ‡¶¶‡ßç‡¶¨‡ßÄ‡¶™‡ßá‡¶∞ ‡¶â‡¶≤‡ßç‡¶≤‡ßá‡¶ñ ‡¶Ü‡¶õ‡ßá? ‡¶ï ‡¶Ü‡¶®‡ßç‡¶¶‡¶æ‡¶Æ‡¶æ‡¶® ‡¶¶‡ßç‡¶¨‡ßÄ‡¶™ ‡¶ñ ‡¶π‡¶æ‡¶á‡¶ï‡ßÅ ‡¶¶‡ßç‡¶¨‡ßÄ‡¶™ ‡¶ó ‡¶ï‡ßç‡¶Ø‡¶æ‡¶∞‡¶ø...
  - ID: 66, Score: 4.7661, Text: ‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡ß¨‡ßØ‡•§ ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶ô‡ßç‡¶ó‡ßá ‡¶™‡¶û‡ßç‡¶ö‡¶∂‡¶∞‡ßá‡¶∞ ‡

In [269]:
query = "‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?"
results = co.rerank(query=query, documents=chunk_texts, top_n=3, return_documents=True)
print(f"Rerank results for query '{query}':")
for idx, result in enumerate(results.results):
    print(idx, result.relevance_score, result.document.text)

Rerank results for query '‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?':
0 0.71185225 ‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡ß¨‡ßØ‡•§ ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶ô‡ßç‡¶ó‡ßá ‡¶™‡¶û‡ßç‡¶ö‡¶∂‡¶∞‡ßá‡¶∞ ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß ‡¶®‡ßá‡¶á ‡¶¨‡¶≤‡ßá ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶Æ‡¶®‡ßá ‡¶π‡¶≤‡ßã? ‡¶ï ‡¶ó‡¶ú‡¶æ‡¶®‡¶®‡ßá‡¶∞ ‡¶ñ ‡¶ï‡¶æ‡¶∞‡ßç‡¶§‡¶ø‡¶ï‡ßá‡¶∞ ‡¶ó ‡¶™‡ßç‡¶∞‡¶ú‡¶æ‡¶™‡¶§‡¶ø‡¶∞ ‡¶ò ‡¶Ö‡¶®‡ßç‡¶®‡¶™‡ßÇ‡¶∞‡ßç‡¶£‡¶æ ‡ß≠‡ß¶‡•§ ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶¨‡¶ü‡ßá ‡¶ï‡ßá? ‡¶ï ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ ‡¶ñ ‡¶π‡¶∞‡¶ø‡¶∂ ‡¶ó ‡¶Æ‡¶æ‡¶Æ‡¶æ ‡¶ò ‡¶∂‡¶∏‡ßç‡¶§‡ßÅ‡¶®‡¶æ‡¶• ‡ß≠‡ßß‡•§ ‡¶ö‡ßÅ‡¶≤ ‡¶ï‡¶æ‡¶ö‡¶æ ‡¶ó‡ßã‡¶Å‡¶´ ‡¶™‡¶æ‡¶ï ‡¶ß‡¶∞‡ßá‡¶õ‡ßá ‡¶ï‡¶æ‡¶∞? ‡¶ï ‡¶Æ‡¶æ‡¶Æ‡¶æ‡¶∞ ‡¶ñ ‡¶∂‡¶∏‡ßç‡¶§‡ßÅ‡¶®‡¶æ‡¶•‡ßá‡¶∞ ‡¶ó ‡¶¨‡¶ø‡¶®‡ßÅ‡¶¶‡¶æ‡¶¶‡¶æ‡¶∞ ‡¶ò ‡¶π‡¶∞‡¶ø‡¶∂‡ßá‡¶∞ ‡ß≠‡ß®‡•§ ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ ‡¶ï‡ßã‡¶® ‡¶∏‡ßç‡¶ü‡ßá‡¶∂‡¶® ‡¶®‡ßá‡¶Æ‡ßá ‡¶ó‡ßá‡¶≤? ‡¶ï ‡¶ï‡ßã‡¶®‡ßç‡¶®‡¶ó‡¶∞ ‡¶ñ ‡¶ï‡¶≤‡¶ø‡¶ï‡¶æ‡¶§‡¶æ ‡¶ó ‡¶ï‡¶æ‡¶®‡¶™‡ßÅ‡¶∞ ‡¶ò ‡¶π‡¶æ‡¶ì‡¶°‡¶º‡¶æ ‡ß≠‡ß©‡•§ ‡¶õ‡ßã‡¶ü‡¶¨‡ßá‡

In [270]:
def keyword_and_rerank_search(query, top_k=5, num_candidates=15):
    print("Input Query:", query)

    # Step 1: Keyword Search using BM25
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -num_candidates)[-num_candidates:]
    bm25_hits = [{
        'corpus_id': idx,
        'score': bm25_scores[idx],
        'text': chunk_texts[idx]
    } for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    
    candidate_texts = [hit['text'] for hit in bm25_hits]

    # Step 2: Reranking using Cohere
    results = co.rerank(query=query, documents=candidate_texts, top_n=top_k, return_documents=True)

    # Step 3: Print results
    print(f"\nTop {top_k} reranked results for query '{query}':")
    for idx, result in enumerate(results.results):
        print(f"{idx + 1}. Relevance Score: {result.relevance_score:.4f}")
        print(f"   Text: {result.document.text[:150]}...\n")

    return [(result.document.text, result.relevance_score) for result in results.results]


In [271]:
query = "‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?"
top_docs = keyword_and_rerank_search(query, top_k=3, num_candidates=15)


Input Query: ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?

Top 3 reranked results for query '‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?':
1. Relevance Score: 0.7119
   Text: ‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡ß¨‡ßØ‡•§ ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶ô‡ßç‡¶ó‡ßá ‡¶™‡¶û‡ßç‡¶ö‡¶∂‡¶∞‡ßá‡¶∞ ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß ‡¶®‡ßá‡¶á ‡¶¨‡¶≤‡ßá ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶Æ‡¶®‡ßá ‡¶π‡¶≤‡ßã? ‡¶ï ‡¶ó‡¶ú‡¶æ‡¶®‡¶®‡ßá‡¶∞ ‡¶ñ ‡¶ï‡¶æ‡¶∞‡ßç‡¶§‡¶ø‡¶ï‡ßá‡¶∞ ‡¶ó ‡¶™‡ßç‡¶∞‡¶ú‡¶æ‡¶™‡¶§‡¶ø‡¶∞ ‡¶ò ‡¶Ö‡¶®‡ßç‡¶®‡¶™‡ßÇ‡¶∞‡ßç‡¶£‡¶æ ‡ß≠‡ß¶‡•§ ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶¨‡¶ü‡ßá ‡¶ï‡ßá? ‡¶ï ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ ...

2. Relevance Score: 0.5646
   Text: ‡¶≤‡ßÅ‡¶≤ ‡¶ú‡¶Ü‡¶≤‡¶≤‡¶æ‡¶á‡¶® ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ö ‡ßß? ‡ß©‡ß¶‡•§ ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶Æ‡¶§‡ßã ‡¶Ö‡¶ï‡ßç‡¶∑‡¶Æ ‡¶¶‡ßÅ‡¶®‡¶ø‡¶Ø‡¶º‡¶æ‡¶Ø‡¶º ‡¶®‡¶æ‡¶á‡•§ ‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶è‡¶á ‡¶â‡¶ï‡ßç‡¶§‡¶ø‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá ‡¶ï‡ßÄ ‡¶™‡ßç‡¶∞‡¶ï‡¶æ‡¶∂ ‡¶™‡ßá‡¶Ø‡¶º‡ßá‡¶õ‡ßá? ‡¶ï‡

In [299]:
from langchain.vectorstores import FAISS
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.chat_models import ChatCohere
from langchain.schema import HumanMessage
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
from langchain.schema import Document

In [289]:
# Use MPS device for HuggingFace embeddings (Apple Silicon GPU)
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

device = "mps"  # Apple Silicon GPU

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={"device": device}
)

docs = [Document(page_content=chunk['text']) for chunk in chunks]
faiss_store = FAISS.from_documents(docs, embedding=embedding_model)
faiss_store.save_local("../faiss_index")
print("FAISS index saved to 'faiss_index/'")

FAISS index saved to 'faiss_index/'


In [290]:
#Faiss load local
faiss_store = FAISS.load_local(
    "../faiss_index",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True
)


In [318]:
template = """‡¶™‡ßç‡¶∞‡¶∏‡¶ô‡ßç‡¶ó:
‡¶®‡¶ø‡¶ö‡ßá ‡¶è‡¶ï‡¶ü‡¶ø ‡¶Ö‡¶ú‡¶æ‡¶®‡¶æ ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶§‡¶•‡ßç‡¶Ø‡¶∏‡ßÇ‡¶§‡ßç‡¶∞ (context) ‡¶™‡ßç‡¶∞‡¶¶‡¶æ‡¶® ‡¶ï‡¶∞‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá‡•§ ‡¶è‡¶ü‡¶ø ‡¶è‡¶ï‡¶ü‡¶ø ‡¶ó‡¶≤‡ßç‡¶™, ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßã‡¶§‡ßç‡¶§‡¶∞, ‡¶¨‡¶π‡ßÅ ‡¶®‡¶ø‡¶∞‡ßç‡¶¨‡¶æ‡¶ö‡¶®‡ßÄ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®, ‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶ú‡ßç‡¶û‡¶æ‡¶®, ‡¶Ü‡¶≤‡ßã‡¶ö‡¶®‡¶æ ‡¶Ö‡¶•‡¶¨‡¶æ ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶Ø‡ßá‡¶ï‡ßã‡¶®‡ßã ‡¶ß‡¶∞‡¶£‡ßá‡¶∞ ‡¶≤‡ßá‡¶ñ‡¶æ ‡¶π‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡•§ 
‡¶è‡¶á ‡¶§‡¶•‡ßç‡¶Ø‡¶∏‡ßÇ‡¶§‡ßç‡¶∞‡ßá‡¶∞ ‡¶≠‡¶ø‡¶§‡¶∞‡ßá ‡¶¨‡¶æ ‡¶∂‡ßá‡¶∑‡ßá ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßá‡¶∞ ‡¶∏‡¶†‡¶ø‡¶ï ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶∏‡¶∞‡¶æ‡¶∏‡¶∞‡¶ø ‡¶¨‡¶æ ‡¶™‡¶∞‡ßã‡¶ï‡ßç‡¶∑‡¶≠‡¶æ‡¶¨‡ßá ‡¶â‡¶™‡¶∏‡ßç‡¶•‡¶ø‡¶§ ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá ‚Äî ‡¶Ø‡ßá‡¶Æ‡¶® ‡¶∂‡ßá‡¶∑‡ßá ‡¶∏‡¶†‡¶ø‡¶ï ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶§‡¶æ‡¶≤‡¶ø‡¶ï‡¶æ‡¶¨‡¶¶‡ßç‡¶ß ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá (‡¶Ø‡ßá‡¶Æ‡¶®: "Correct Answers", "Answer Key" ‡¶á‡¶§‡ßç‡¶Ø‡¶æ‡¶¶‡¶ø), ‡¶Ö‡¶•‡¶¨‡¶æ ‡¶â‡¶§‡ßç‡¶§‡¶∞‡¶ó‡ßÅ‡¶≤‡ßã ‡¶≤‡ßá‡¶ñ‡¶æ‡¶∞ ‡¶≠‡¶ø‡¶§‡¶∞‡ßá ‡¶õ‡¶°‡¶º‡¶ø‡¶Ø‡¶º‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡•§

‡¶Ü‡¶™‡¶®‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ú:
- ‡¶™‡ßÅ‡¶∞‡ßã ‡¶§‡¶•‡ßç‡¶Ø‡¶∏‡ßÇ‡¶§‡ßç‡¶∞ (context) ‡¶Æ‡¶®‡ßã‡¶Ø‡ßã‡¶ó ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá ‡¶™‡¶°‡¶º‡ßÅ‡¶®‡•§
- ‡¶™‡ßç‡¶∞‡¶¶‡¶§‡ßç‡¶§ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßá‡¶∞ ‡¶∏‡¶†‡¶ø‡¶ï ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶§‡¶•‡ßç‡¶Ø‡¶∏‡ßÇ‡¶§‡ßç‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶ñ‡ßÅ‡¶Å‡¶ú‡ßá ‡¶¨‡ßá‡¶∞ ‡¶ï‡¶∞‡ßÅ‡¶®‡•§
- ‡¶Ø‡¶¶‡¶ø ‡¶∏‡¶†‡¶ø‡¶ï ‡¶â‡¶§‡ßç‡¶§‡¶∞ context-‡¶è‡¶∞ ‡¶∂‡ßá‡¶∑‡ßá ‡¶Ü‡¶≤‡¶æ‡¶¶‡¶æ‡¶≠‡¶æ‡¶¨‡ßá ‡¶§‡¶æ‡¶≤‡¶ø‡¶ï‡¶æ‡¶≠‡ßÅ‡¶ï‡ßç‡¶§ ‡¶•‡¶æ‡¶ï‡ßá, ‡¶§‡¶æ‡¶π‡¶≤‡ßá ‡¶∏‡ßá‡¶ñ‡¶æ‡¶® ‡¶•‡ßá‡¶ï‡ßá‡¶ì ‡¶Æ‡¶ø‡¶≤‡¶ø‡¶Ø‡¶º‡ßá ‡¶¶‡ßá‡¶ñ‡ßÅ‡¶®‡•§
- ‡¶∏‡¶†‡¶ø‡¶ï ‡¶â‡¶§‡ßç‡¶§‡¶∞‡¶ü‡¶ø ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§‡¶≠‡¶æ‡¶¨‡ßá ‡¶è‡¶¨‡¶Ç ‡¶∏‡ßç‡¶™‡¶∑‡ßç‡¶ü‡¶≠‡¶æ‡¶¨‡ßá ‡¶è‡¶ï ‡¶≤‡¶æ‡¶á‡¶®‡ßá ‡¶™‡ßç‡¶∞‡¶¶‡¶æ‡¶® ‡¶ï‡¶∞‡ßÅ‡¶®‡•§

‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶≤‡ßá‡¶ñ‡¶æ‡¶∞ ‡¶®‡¶ø‡¶Ø‡¶º‡¶Æ:
- ‡¶Ü‡¶™‡¶®‡¶æ‡¶∞ ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶Ö‡¶¨‡¶∂‡ßç‡¶Ø‡¶á ‡¶§‡¶•‡ßç‡¶Ø‡¶∏‡ßÇ‡¶§‡ßç‡¶∞‡ßá‡¶∞ ‡¶â‡¶™‡¶∞ ‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø ‡¶ï‡¶∞‡ßá ‡¶π‡¶§‡ßá ‡¶π‡¶¨‡ßá‡•§
- ‡¶Ö‡¶™‡ßç‡¶∞‡¶æ‡¶∏‡¶ô‡ßç‡¶ó‡¶ø‡¶ï ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ñ‡ßç‡¶Ø‡¶æ ‡¶¨‡¶æ ‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡¶¨‡ßá‡¶® ‡¶®‡¶æ‡•§
- ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶® ‡¶Ø‡ßá‡¶≠‡¶æ‡¶¨‡ßá‡¶á ‡¶π‡ßã‡¶ï, ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡ßá‡¶¨‡¶≤‡¶Æ‡¶æ‡¶§‡ßç‡¶∞ ‡¶∏‡¶Ç‡¶∂‡ßç‡¶≤‡¶ø‡¶∑‡ßç‡¶ü ‡¶ì ‡¶∏‡¶†‡¶ø‡¶ï ‡¶â‡¶§‡ßç‡¶§‡¶∞‡¶ü‡¶ø ‡¶è‡¶ï ‡¶≤‡¶æ‡¶á‡¶®‡ßá ‡¶¶‡¶ø‡¶®‡•§

{context}

‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®: {question}
‡¶â‡¶§‡ßç‡¶§‡¶∞: ‡¶è‡¶ï ‡¶≤‡¶æ‡¶á‡¶®‡ßá ‡¶∏‡¶†‡¶ø‡¶ï ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶¶‡¶ø‡¶®‡•§"""
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)


In [319]:
memory = ConversationBufferWindowMemory(k=2, return_messages=True)

In [329]:
openai_api_key = os.getenv("OPENAI_API_KEY")
llm_openai = ChatOpenAI(
    openai_api_key=openai_api_key,
    model="gpt-4o",       
    streaming=True       
)

In [328]:
def rag_chat_with_memory(query: str, llm=llm):
    # Save user question into memory
    memory.chat_memory.add_user_message(query)

    # Step 1: FAISS retrieval of top-10 candidate docs
    retriever = faiss_store.as_retriever(search_kwargs={"k": 10})
    candidate_docs = retriever.get_relevant_documents(query)
    
    # Extract texts and indices from FAISS results
    candidate_texts = [doc.page_content for doc in candidate_docs]
    
    # Step 2: Apply BM25 on the FAISS-retrieved documents
    # Tokenize query for BM25
    tokenized_query = bm25_tokenizer(query)
    
    # Create local BM25 index for the candidate texts
    tokenized_candidates = [bm25_tokenizer(text) for text in candidate_texts]
    local_bm25 = BM25Okapi(tokenized_candidates)
    
    # Get BM25 scores for candidate documents
    bm25_scores = local_bm25.get_scores(tokenized_query)
    
    # Combine texts with their BM25 scores
    bm25_results = [{
        'text': text,
        'score': score
    } for text, score in zip(candidate_texts, bm25_scores)]
    
    # Sort by BM25 scores (descending)
    bm25_results = sorted(bm25_results, key=lambda x: x['score'], reverse=True)
    
    # Step 3: Rerank BM25-ordered texts with Cohere to get top-3
    rerank_res = co.rerank(
        query=query,
        documents=[res['text'] for res in bm25_results],
        top_n=3,
        return_documents=True
    )
    
    # Step 4: Extract top reranked texts
    top_texts = [res.document.text for res in rerank_res.results]
    
    # Step 5: Build context from reranked texts
    context = "\n".join(top_texts)

    # Step 6: Generate answer using prompt template
    prompt_str = prompt_template.format(context=context, question=query)
    answer = llm.invoke([HumanMessage(content=prompt_str)]).content.strip()
    
    # Save answer into memory
    memory.chat_memory.add_ai_message(answer)
    
    # Print and return
    print("User:", query)
    print("Answer:", answer)
    return answer

In [330]:
from langchain.chat_models import ChatCohere

llm_command_r = ChatCohere(cohere_api_key=cohere_token, model="command-r", streaming=True)


In [331]:
rag_chat_with_memory(query="‡¶¨‡¶ø‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∏‡¶Æ‡¶Ø‡¶º ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶ï‡¶§ ‡¶õ‡¶ø‡¶≤?", llm=llm_openai) 

User: ‡¶¨‡¶ø‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∏‡¶Æ‡¶Ø‡¶º ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶ï‡¶§ ‡¶õ‡¶ø‡¶≤?
Answer: ‡¶§‡¶•‡ßç‡¶Ø‡¶∏‡ßÇ‡¶§‡ßç‡¶∞‡ßá ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶∏‡¶∞‡¶æ‡¶∏‡¶∞‡¶ø ‡¶â‡¶≤‡ßç‡¶≤‡¶ø‡¶ñ‡¶ø‡¶§ ‡¶®‡ßá‡¶á‡•§


'‡¶§‡¶•‡ßç‡¶Ø‡¶∏‡ßÇ‡¶§‡ßç‡¶∞‡ßá ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶∏‡¶∞‡¶æ‡¶∏‡¶∞‡¶ø ‡¶â‡¶≤‡ßç‡¶≤‡¶ø‡¶ñ‡¶ø‡¶§ ‡¶®‡ßá‡¶á‡•§'

In [342]:
rag_chat_with_memory(query="‡¶¨‡¶ø‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∏‡¶Æ‡¶Ø‡¶º ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶ï‡¶§ ‡¶õ‡¶ø‡¶≤?", llm=llm_command_r) 

KeyboardInterrupt: 

In [333]:
memory.chat_memory.messages[-4:]  # Show last 2 messages in memory

[AIMessage(content='‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶®‡ßá‡¶∞ ‡¶â‡¶§‡ßç‡¶§‡¶∞‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶™‡ßç‡¶∞‡¶¶‡¶§‡ßç‡¶§ ‡¶§‡¶•‡ßç‡¶Ø‡¶∏‡ßÇ‡¶§‡ßç‡¶∞‡ßá ‡¶∏‡¶∞‡¶æ‡¶∏‡¶∞‡¶ø ‡¶ï‡ßã‡¶®‡ßã ‡¶â‡¶≤‡ßç‡¶≤‡ßá‡¶ñ ‡¶®‡ßá‡¶á‡•§'),
 HumanMessage(content='‡¶¨‡¶ø‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∏‡¶Æ‡¶Ø‡¶º ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶ï‡¶§ ‡¶õ‡¶ø‡¶≤?'),
 AIMessage(content='‡¶§‡¶•‡ßç‡¶Ø‡¶∏‡ßÇ‡¶§‡ßç‡¶∞‡ßá ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶∏‡¶∞‡¶æ‡¶∏‡¶∞‡¶ø ‡¶â‡¶≤‡ßç‡¶≤‡¶ø‡¶ñ‡¶ø‡¶§ ‡¶®‡ßá‡¶á‡•§'),
 HumanMessage(content='‡¶¨‡¶ø‡¶Ø‡¶º‡ßá‡¶∞ ‡¶∏‡¶Æ‡¶Ø‡¶º ‡¶ï‡¶≤‡ßç‡¶Ø‡¶æ‡¶£‡ßÄ‡¶∞ ‡¶™‡ßç‡¶∞‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶Ø‡¶º‡¶∏ ‡¶ï‡¶§ ‡¶õ‡¶ø‡¶≤?')]

In [341]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline

# Load Hugging Face model and tokenizer
model_id = "Qwen/Qwen1.5-1.8B-Chat"  
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Create a text generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

# Wrap in LangChain-compatible LLM
llm_qwen = HuggingFacePipeline(pipeline=pipe)


{"timestamp":"2025-07-25T18:06:52.101285Z","level":"WARN","fields":{"message":"Reqwest(reqwest::Error { kind: Request, source: hyper_util::client::legacy::Error(SendRequest, hyper::Error(IncompleteMessage)) }). Retrying..."},"filename":"/Users/runner/work/xet-core/xet-core/cas_client/src/http_client.rs","line_number":242}
{"timestamp":"2025-07-25T18:06:52.101412Z","level":"WARN","fields":{"message":"Retry attempt #0. Sleeping 1.27884763s before the next attempt"},"filename":"/Users/runner/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs","line_number":171}


Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [None]:
rag_chat_with_memory("‡¶Ö‡¶®‡ßÅ‡¶™‡¶Æ‡ßá‡¶∞ ‡¶≠‡¶æ‡¶∑‡¶æ‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶™‡ßÅ‡¶∞‡ßÅ‡¶∑ ‡¶ï‡¶æ‡¶ï‡ßá ‡¶¨‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá?", llm=llm_qwen)