In [1]:
# Install required packages
%pip install pandas langchain langchain-community langchain-google-genai chromadb pypdf unstructured markdown
# Install additional unstructured dependencies for better markdown processing
%pip install "unstructured[md]"

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
import shutil
from langchain_community.document_loaders import PyPDFLoader, UnstructuredHTMLLoader, UnstructuredMarkdownLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import warnings
warnings.filterwarnings('ignore')

# Disable ChromaDB telemetry to avoid warnings
os.environ["ANONYMIZED_TELEMETRY"] = "False"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm
  from .autonotebook import tqdm as notebook_tqdm


For PDF

In [None]:
# Set your Google API key
import os
os.environ["GOOGLE_API_KEY"] = "*****************************"  # Replace with your actual API key

# Define the path to your PDF files
dataset_dir = "../Data/pdf_data/"
pdf_files = [f for f in os.listdir(dataset_dir) if f.endswith('.pdf')]

# Clean up existing chroma_db directory to avoid readonly database issues
if os.path.exists("./chroma_db"):
    print("Cleaning up existing database directory...")
    shutil.rmtree("./chroma_db")

# Create fresh chroma_db directory
os.makedirs("./chroma_db", exist_ok=True)
print(f"Found {len(pdf_files)} PDF files to process")

Found 12 PDF files to process


In [None]:
# Initialize the Google embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


# Process each PDF file
for i, pdf_file in enumerate(pdf_files, 1):
    try:
        file_path = os.path.join(dataset_dir, pdf_file)
        print(f"Processing {i}/{len(pdf_files)}: {pdf_file}...")

        # Load the PDF file
        loader = PyPDFLoader(file_path=file_path)
        documents = loader.load()
        print(f"  Loaded {len(documents)} pages")
        
        # Split the documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        chunks = text_splitter.split_documents(documents)
        print(f"  Created {len(chunks)} chunks")
        
        # Create a unique collection name (ChromaDB collection names must be valid)
        collection_name = f"pdf_{i:03d}_{os.path.splitext(pdf_file)[0][:50]}"
        collection_name = "".join(c for c in collection_name if c.isalnum() or c in ['_', '-'])
        
        # Create database directory for this PDF
        db_path = f"./chroma_db/chroma_{os.path.splitext(pdf_file)[0]}"
        os.makedirs(db_path, exist_ok=True)
        
        # Create a vector store with explicit collection name
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=db_path,
            collection_name=collection_name
        )
        
        # Persist the vector store
        vectordb.persist()
        print(f"Created vector store with {len(chunks)} chunks")
        
        # Clean up to free memory
        del vectordb
        
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        continue

print("All PDF files processed and embedded successfully!")

Processing 1/12: Nabila Noshin.pdf...
  Loaded 1 pages
  Created 2 chunks
Created vector store with 2 chunks
Processing 2/12: Mohammad Islam.pdf...
  Loaded 6 pages
  Created 11 chunks
Created vector store with 11 chunks
Processing 3/12: Tanmoy Shome.pdf...
  Loaded 2 pages
  Created 3 chunks
Created vector store with 3 chunks
Processing 4/12: Shanto Jouerder.pdf...
  Loaded 4 pages
  Created 8 chunks
Created vector store with 8 chunks
Processing 5/12: Abir Hasan.pdf...
  Loaded 2 pages
  Created 3 chunks
Created vector store with 3 chunks
Processing 6/12: Al Mamun Khan.pdf...
  Loaded 6 pages
  Created 13 chunks
Created vector store with 13 chunks
Processing 7/12: Abdullah al Mamun.pdf...
  Loaded 1 pages
  Created 1 chunks
Created vector store with 1 chunks
Processing 8/12: Zeshan Haider.pdf...
  Loaded 1 pages
  Created 1 chunks
Created vector store with 1 chunks
Processing 9/12: Lameya Sabrin.pdf...
  Loaded 2 pages
  Created 4 chunks
Created vector store with 4 chunks
Processing 1

For HTML

In [None]:
# Define the path to your HTML files
dataset_dir = "../Data/html_data/"
html_files = [f for f in os.listdir(dataset_dir) if f.endswith('.html')]

# Process each HTML file
for i, html_file in enumerate(html_files, 1):
    try:
        file_path = os.path.join(dataset_dir, html_file)
        print(f"Processing {i}/{len(html_files)}: {html_file}...")

        # Load the HTML file
        loader = UnstructuredHTMLLoader(file_path=file_path)
        documents = loader.load()
        print(f"  Loaded {len(documents)} pages")
        
        # Split the documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        chunks = text_splitter.split_documents(documents)
        print(f"  Created {len(chunks)} chunks")
        
        # Create a unique collection name (ChromaDB collection names must be valid)
        collection_name = f"html_{i:03d}_{os.path.splitext(html_file)[0][:50]}"
        collection_name = "".join(c for c in collection_name if c.isalnum() or c in ['_', '-'])

        # Create database directory for this HTML
        db_path = f"./chroma_db/chroma_{os.path.splitext(html_file)[0]}"
        os.makedirs(db_path, exist_ok=True)
        
        # Create a vector store with explicit collection name
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=db_path,
            collection_name=collection_name
        )
        
        # Persist the vector store
        vectordb.persist()
        print(f"Created vector store with {len(chunks)} chunks")
        
        # Clean up to free memory
        del vectordb
        
    except Exception as e:
        print(f"Error processing {html_file}: {e}")
        continue

print("All HTML files processed and embedded successfully!")

Processing 1/10: VISIE _ CONTACT.html...
  Loaded 1 pages
  Created 2 chunks
Created vector store with 2 chunks
Processing 2/10: VISIE _ DOCUMIND.html...
  Loaded 1 pages
  Created 6 chunks
Created vector store with 6 chunks
Processing 3/10: VISIE _ AI-INSIGHTS.html...
  Loaded 1 pages
  Created 6 chunks
Created vector store with 6 chunks
Processing 4/10: VISIE _ KOTHOK.html...
  Loaded 1 pages
  Created 6 chunks
Created vector store with 6 chunks
Processing 5/10: VISIE Ltd _ AI Solutions for Innovation.html...
  Loaded 1 pages
  Created 11 chunks
Created vector store with 11 chunks
Processing 6/10: VISIE Ltd _ AI Solutions for Innovation_.html...
  Loaded 1 pages
  Created 2 chunks
Error processing Ferdous Bin Ali.pdf: Validation error: name: Expected a name containing 3-512 characters from [a-zA-Z0-9._-], starting and ending with a character in [a-zA-Z0-9]. Got: html_006_VISIELtd_AISolutionsforInnovation_
Processing 7/10: VISIE _ ABOUT.html...
  Loaded 1 pages
  Created 5 chunks
Crea

For Markdown

In [3]:
# Set your Google API key
import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyDCi6ZPPaM_ssh3_5JeSfuL19bFGn1s9VM"  # Replace with your actual API key

# Define the path to your Markdown files
dataset_dir = "../Data/markdown_data/"
markdown_files = [f for f in os.listdir(dataset_dir) if f.endswith('.markdown')]

# Clean up existing chroma_db directory to avoid readonly database issues
if os.path.exists("./chroma_db/visie"):
    print("Cleaning up existing database directory...")
    shutil.rmtree("./chroma_db/visie")

# Create fresh chroma_db directory
os.makedirs("./chroma_db/visie", exist_ok=True)
print(f"Found {len(markdown_files)} Markdown files to process")

Cleaning up existing database directory...
Found 1 Markdown files to process


In [4]:
# Initialize the Google embeddings for markdown files
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


# Process each Markdown file
for i, markdown_file in enumerate(markdown_files, 1):
    try:
        file_path = os.path.join(dataset_dir, markdown_file)
        print(f"Processing {i}/{len(markdown_files)}: {markdown_file}...")

        # Load the Markdown file
        loader = UnstructuredMarkdownLoader(file_path=file_path)
        documents = loader.load()
        print(f"  Loaded {len(documents)} pages")
        
        # Split the documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        chunks = text_splitter.split_documents(documents)
        print(f"  Created {len(chunks)} chunks")
        
        # Create a unique collection name (ChromaDB collection names must be valid)
        collection_name = f"markdown_{i:03d}_{os.path.splitext(markdown_file)[0][:50]}"
        collection_name = "".join(c for c in collection_name if c.isalnum() or c in ['_', '-'])

        # Create database directory for this Markdown
        db_path = f"./chroma_db/chroma_{os.path.splitext(markdown_file)[0]}"
        os.makedirs(db_path, exist_ok=True)
        
        # Create a vector store with explicit collection name
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=db_path,
            collection_name=collection_name
        )
        
        # Persist the vector store
        vectordb.persist()
        print(f"Created vector store with {len(chunks)} chunks")
        
        # Clean up to free memory
        del vectordb
        
    except Exception as e:
        print(f"Error processing {markdown_file}: {e}")
        continue

print("All Markdown files processed and embedded successfully!")

Processing 1/1: visie_tech.markdown...
  Loaded 1 pages
  Created 25 chunks
  Loaded 1 pages
  Created 25 chunks


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Created vector store with 25 chunks
All Markdown files processed and embedded successfully!
