In [1]:
import os

import torch
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Markdown source file; the relative path is resolved against the current working directory.
file_path = "duk_data.md"

# Load the whole document into a single string (text mode is open()'s default).
with open(file_path, encoding="utf_8") as file:
    markdown_content = file.read()

In [3]:
# Sanity check on the loaded text: its type on one line, its character count on the next.
print(type(markdown_content), len(markdown_content), sep="\n")

<class 'str'>
26641


In [4]:
# Split on level-1 ("#") and level-2 ("##") headings. The heading text ends up in
# each resulting document's metadata under the paired key (Header_1 / Header_2).
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header_1"), ("##", "Header_2")],
)

chunks = markdown_splitter.split_text(markdown_content)

In [5]:
# Inspect the header-level documents produced by the Markdown splitter.
# NOTE(review): this echoes the whole list; a slice may be enough for inspection.
chunks

[Document(metadata={'Header_1': 'Digital University of Kerala (DUK) Admission 2025', 'Header_2': 'General Information About Digital University Kerala (DUK)'}, page_content='- Digital University Kerala (DUK), formerly known as Indian Institute of Information Technology and Management-Kerala (IIITM-K), was established in 2000 by the Government of Kerala as an autonomous institution.\n- The university offers postgraduate and doctoral programmes with focus areas including Artificial Intelligence, Natural Language Processing, Internet of Things, Electronic Systems and Automation, Imaging Technologies, Data Analytics and Big Data, Cybersecurity, Blockchain, Ecological Informatics, Geospatial Analytics, and Applied Materials.  \n**Vision**\nTo become a globally reputed institution in digital education and research and to nurture future-ready talent capable of developing innovative and sustainable solutions for industry, government, and society.  \n**Mission**\nThe mission of the university is

In [6]:
# Number of header-level documents.
len(chunks)

29

In [7]:
# Tokenizer of the same checkpoint that the embedding step uses, so lengths are
# measured in that model's own tokens.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


def token_len(text: str) -> int:
    """Return how many tokens ``tokenizer`` produces for ``text``.

    Special tokens are not added, so only the tokens of the text itself count.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)


# chunk_size and chunk_overlap are measured with token_len, i.e. in tokens rather
# than characters.
# NOTE(review): 250 presumably leaves headroom for special tokens under the
# embedding model's input limit (the all-MiniLM-L6-v2 model card states
# 256 word pieces) — confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=60,
    length_function=token_len,
    # Separators are tried in order, coarsest first. The trailing "" is a
    # last-resort character-level split: without it, a run of text that contains
    # none of the other separators is kept whole even when it exceeds chunk_size.
    separators=["\n\n", "\n", ".", " ", ""],
)

# Re-split the header-level documents; each piece keeps its parent's header metadata.
final_chunks = text_splitter.split_documents(chunks)

In [8]:
# Inspect the chunks produced by the recursive, token-length-based splitter.
final_chunks

[Document(metadata={'Header_1': 'Digital University of Kerala (DUK) Admission 2025', 'Header_2': 'General Information About Digital University Kerala (DUK)'}, page_content='- Digital University Kerala (DUK), formerly known as Indian Institute of Information Technology and Management-Kerala (IIITM-K), was established in 2000 by the Government of Kerala as an autonomous institution.\n- The university offers postgraduate and doctoral programmes with focus areas including Artificial Intelligence, Natural Language Processing, Internet of Things, Electronic Systems and Automation, Imaging Technologies, Data Analytics and Big Data, Cybersecurity, Blockchain, Ecological Informatics, Geospatial Analytics, and Applied Materials.  \n**Vision**\nTo become a globally reputed institution in digital education and research and to nurture future-ready talent capable of developing innovative and sustainable solutions for industry, government, and society.  \n**Mission**\nThe mission of the university is

In [9]:
# Number of chunks after token-based re-splitting.
len(final_chunks)

43

In [10]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA instead of failing at model load.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": embedding_device},
    # Ask the encoder for normalized embedding vectors.
    encode_kwargs={"normalize_embeddings": True},
)

In [11]:
# Embed every chunk and build a FAISS vector store from the results.
vector_db = FAISS.from_documents(documents=final_chunks, embedding=embeddings)

In [12]:
# Write the vector store to the "duk_faiss_index" folder.
vector_db.save_local(folder_path="duk_faiss_index")