In [2]:
import os
from typing import List
from docx import Document as DocxDocument
from langchain.schema import Document
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.vectorstores import FAISS
from vector_store.embedder import get_embedder


# Loaders
def load_txt(path: str) -> List[Document]:
    """Load a plain-text file through LangChain's ``TextLoader``.

    Args:
        path: Location of the text file; it is read as UTF-8.

    Returns:
        The documents produced by ``TextLoader.load()`` for that file.
    """
    loader = TextLoader(path, encoding="utf-8")
    return loader.load()


def load_pdf(path: str) -> List[Document]:
    """Load a PDF file through LangChain's ``PyPDFLoader``.

    Args:
        path: Location of the PDF file.

    Returns:
        The documents produced by ``PyPDFLoader.load()`` for that file.
    """
    loader = PyPDFLoader(path)
    return loader.load()


def load_docx(path: str) -> List[Document]:
    """Load a Word (.docx) file as a single LangChain ``Document``.

    The text of every non-empty paragraph is stripped and the paragraphs are
    joined with a blank line between them. The blank-line separator matters:
    ``paragraph_split`` and ``semantic_split`` find paragraph boundaries by
    splitting on two consecutive newlines, so joining with a single newline
    would make a DOCX file look like one giant paragraph to them.

    Only ``Document.paragraphs`` is read here; any other content of the file
    is not included.

    Args:
        path: Location of the .docx file.

    Returns:
        A one-element list whose document holds the joined text and
        ``{"source": path}`` as metadata.
    """
    paragraphs = []
    for para in DocxDocument(path).paragraphs:
        text = para.text.strip()
        if text:
            paragraphs.append(text)
    full_text = "\n\n".join(paragraphs)
    return [Document(page_content=full_text, metadata={"source": path})]


def load_documents(paths: List[str]) -> List[Document]:
    """Load every supported file in ``paths`` into one flat document list.

    The loader is chosen from the file extension, compared case-insensitively:
    ``.txt`` -> ``load_txt``, ``.pdf`` -> ``load_pdf``, ``.docx`` -> ``load_docx``.
    Any other path is skipped, and a warning is emitted for it so that files
    missing from the resulting corpus can be noticed instead of disappearing
    silently.

    Args:
        paths: File paths to load, in the order they should appear in the result.

    Returns:
        The concatenated documents of all supported files; an empty list when
        ``paths`` is empty or contains no supported file.
    """
    import warnings

    all_docs: List[Document] = []
    for path in paths:
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            loaded = load_txt(path)
        elif ext == ".pdf":
            loaded = load_pdf(path)
        elif ext == ".docx":
            loaded = load_docx(path)
        else:
            # Best-effort behaviour is kept (no exception), but the skip is reported.
            warnings.warn(f"Skipping unsupported file type {ext!r}: {path}", stacklevel=2)
            continue
        all_docs.extend(loaded)
    return all_docs


def save_vectorstore(docs: List[Document], path: str):
    """Embed ``docs`` into a FAISS index and persist it locally.

    Args:
        docs: Documents to index.
        path: Destination handed to ``save_local``.

    The embedder comes from ``get_embedder()``. A confirmation line is
    printed once the index has been saved.
    """
    embedding_model = get_embedder()
    index = FAISS.from_documents(docs, embedding_model)
    index.save_local(path)
    print(f"Vectorstore saved to: {path}")


In [3]:
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
from vector_store.embedder import get_embedder


def paragraph_split(docs: List[Document], chunk_size=512) -> List[Document]:
    """Greedily pack blank-line-separated paragraphs into chunks.

    Each document's text is split on two consecutive newlines. Paragraphs are
    appended to a running buffer (each followed by a blank line) for as long
    as ``len(buffer) + len(paragraph) <= chunk_size``; when the next paragraph
    would not fit, the buffer is emitted as a chunk and a new buffer is
    started with that paragraph. A paragraph longer than ``chunk_size`` is
    never cut: it becomes a chunk of its own.

    Whitespace-only paragraphs are ignored and an empty buffer is never
    flushed, so no chunk with empty ``page_content`` is produced (previously
    an over-long first paragraph, or a blank document, yielded one).

    Args:
        docs: Documents to split.
        chunk_size: Soft upper bound, in characters, used by the packing rule
            above.

    Returns:
        The chunks of all documents in order. Each chunk carries its own
        shallow copy of the source document's metadata, so editing one
        chunk's metadata does not affect its siblings or the source.
    """
    chunks = []
    for doc in docs:
        buf = ""
        for para in doc.page_content.split("\n\n"):
            if not para.strip():
                continue
            # Flush only a non-empty buffer; an over-long first paragraph
            # must not emit an empty chunk ahead of itself.
            if buf and len(buf) + len(para) > chunk_size:
                chunks.append(Document(page_content=buf.strip(), metadata=dict(doc.metadata)))
                buf = ""
            buf += para + "\n\n"
        if buf:
            chunks.append(Document(page_content=buf.strip(), metadata=dict(doc.metadata)))
    return chunks


def semantic_split(docs: List[Document], threshold=0.85) -> List[Document]:
    """Split documents where adjacent paragraphs stop being similar.

    Each document's text is split on two consecutive newlines and
    whitespace-only paragraphs are dropped. Neighbouring paragraphs are
    embedded (with a "passage: " prefix) and compared by cosine similarity;
    whenever the similarity falls below ``threshold`` a chunk boundary is
    placed between them. Documents with fewer than two paragraphs are passed
    through unchanged.

    Every paragraph is embedded exactly once: the vector computed for a
    paragraph is reused as the left-hand side of the next comparison instead
    of being recomputed.

    Args:
        docs: Documents to split.
        threshold: Minimum cosine similarity for two adjacent paragraphs to
            stay in the same chunk.

    Returns:
        The chunks of all documents in order. Each newly built chunk carries
        its own shallow copy of the source document's metadata.
    """
    # NOTE(review): the "passage: " prefix presumably matches what the
    # embedding model behind get_embedder() expects -- confirm.
    embedder = get_embedder()
    chunks = []
    for doc in docs:
        paras = [p for p in doc.page_content.split("\n\n") if p.strip()]
        if len(paras) < 2:
            chunks.append(doc)
            continue

        buf = [paras[0]]
        prev_vec = embedder.embed_query("passage: " + paras[0])
        for para in paras[1:]:
            cur_vec = embedder.embed_query("passage: " + para)
            sim = cosine_similarity([prev_vec], [cur_vec])[0][0]
            if sim < threshold:
                # Similarity dropped: close the current chunk before `para`.
                chunks.append(Document(page_content="\n\n".join(buf).strip(), metadata=dict(doc.metadata)))
                buf = []
            buf.append(para)
            prev_vec = cur_vec

        # `buf` always holds at least the last paragraph at this point.
        chunks.append(Document(page_content="\n\n".join(buf).strip(), metadata=dict(doc.metadata)))
    return chunks


In [5]:
from vector_store.store import load_documents, save_vectorstore
from splitter.text_splitter import recursive_split
from splitter.splitter_utils import semantic_split, paragraph_split


def build_dual_stores(bn_files, en_files, strategy="semantic"):
    """Load and chunk the Bangla files with the chosen split strategy.

    Args:
        bn_files: Paths of the Bangla source files.
        en_files: Paths of the English source files.
        strategy: One of "semantic", "paragraph" or "recursive".

    Raises:
        ValueError: If ``strategy`` is not one of the names above.

    NOTE(review): as written, this only prints the Bangla chunks. ``en_files``
    is never read and no vector store is built or saved -- presumably the
    function is unfinished; confirm before relying on it.
    """
    strategies = {
        "semantic": semantic_split,
        "paragraph": paragraph_split,
        "recursive": recursive_split,
    }
    if strategy not in strategies:
        raise ValueError(f"Invalid split strategy: {strategy}")
    split_fn = strategies[strategy]

    print("Bangla")
    bn_docs = load_documents(bn_files)
    bn_chunks = split_fn(bn_docs)
    print(bn_chunks)


In [8]:
# Input directories, relative to the notebook's working directory.
en_dir = "data/raw/english"
bn_dir = "data/raw/bangla"


def _list_files(directory):
    """Return the paths of the regular files directly inside ``directory``.

    Names are sorted because ``os.listdir`` returns them in arbitrary order;
    sorting keeps the document order the same from run to run and from
    machine to machine. Sub-directories are skipped.
    """
    candidates = (os.path.join(directory, name) for name in sorted(os.listdir(directory)))
    return [path for path in candidates if os.path.isfile(path)]


# Get all files from directories
en_files = _list_files(en_dir)
bn_files = _list_files(bn_dir)

build_dual_stores(bn_files, en_files, strategy='semantic')

FileNotFoundError: [WinError 3] The system cannot find the path specified: '/data/raw/english'

In [9]:
import nltk

# Fetch each NLTK data package in turn. The last line keeps the result of the
# final download as the cell's displayed value.
nltk_packages = ('punkt', 'stopwords', 'wordnet')
download_results = [nltk.download(package) for package in nltk_packages]
download_results[-1]

[nltk_data] Downloading package punkt to C:\Users\shahabuddin akhon
[nltk_data]     hr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\shahabuddin
[nltk_data]     akhon hr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\shahabuddin akhon
[nltk_data]     hr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
# Show every language for which the NLTK stopwords corpus has a word list.
stopword_languages = stopwords.fileids()
print(stopword_languages)

['albanian', 'arabic', 'azerbaijani', 'basque', 'belarusian', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'tamil', 'turkish']
