## Imports

In [7]:
# !pip install transformers

In [8]:
# !pip install chromadb

In [9]:
# !pip install pdfplumber

In [4]:
# Mount Google Drive into the Colab filesystem.
# NOTE(review): none of the cells visible here read from the mount point
# (the PDF is opened from /content/Data.pdf) — confirm the mount is needed.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [5]:
import pdfplumber
from transformers import BertTokenizer, BertModel
import chromadb
import torch
import numpy as np
import os
import csv
import json

## BERT Model

In [6]:
# Build the tokenizer and the encoder from one checkpoint identifier so the
# two can never drift apart.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

## Data Loading & Creating Embeddings

In [10]:
# Read a PDF with pdfplumber and return its text, one page after another
def read_pdf_plumber(file_path):
    """Return the concatenated text of the PDF at ``file_path``.

    Pages are visited in order. Every page whose ``extract_text()`` result
    is truthy contributes that text followed by a newline; pages that yield
    ``None`` or an empty string are skipped.
    """
    with pdfplumber.open(file_path) as pdf:
        extracted = (page.extract_text() for page in pdf.pages)
        # The join runs inside the ``with`` block, so every page is read
        # before the PDF is closed.
        return "".join(page_text + "\n" for page_text in extracted if page_text)

# Extract text from the source PDF (path is specific to the Colab runtime)
PDF_PATH = "/content/Data.pdf"
pdf_text = read_pdf_plumber(PDF_PATH)

In [11]:
# Split the document into fixed-size word chunks for embedding
def chunk_text(text, max_length=500):
    """Split ``text`` into consecutive chunks of at most ``max_length`` words.

    Words are the whitespace-separated pieces returned by ``str.split()``.
    Each chunk is re-joined with single spaces, so the original whitespace
    (including newlines) is not preserved.

    ``max_length`` counts words, not model tokens. A sub-word tokenizer can
    produce more tokens than words, so a chunk of ``max_length`` words may
    still exceed a model's token limit.

    Args:
        text: The text to split.
        max_length: Maximum number of words per chunk. Must be at least 1.

    Returns:
        A list of chunk strings in document order. The list is empty when
        ``text`` contains no words.

    Raises:
        ValueError: If ``max_length`` is less than 1.
    """
    if max_length < 1:
        # A non-positive limit used to produce a spurious empty first chunk.
        raise ValueError(f"max_length must be at least 1, got {max_length}")

    words = text.split()
    return [
        " ".join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]

# Chunk the document
text_chunks = chunk_text(pdf_text)
if not text_chunks:
    # With no chunks, np.mean over an empty list yields NaN and the storage
    # and export cells then fail with unrelated-looking errors.
    raise ValueError("No text was extracted from the PDF, so there is nothing to embed.")

# Maximum sequence length handed to the tokenizer; longer inputs are cut off.
MAX_TOKENS = 512

# Per-chunk results: raw vectors for averaging, and records for export
chunk_embeddings = []
chunk_data = []
truncated_chunk_ids = []

model.eval()  # Set the model to evaluation mode

for i, chunk in enumerate(text_chunks, start=1):
    chunk_id = f"chunk_{i}"

    encoded_input = tokenizer(chunk,
                              return_tensors='pt',  # PyTorch tensors
                              truncation=True,
                              padding=True,
                              max_length=MAX_TOKENS)

    # chunk_text limits chunks by word count, while the limit here is in
    # tokens. An encoding that reaches MAX_TOKENS has most likely lost its
    # tail, so record it instead of dropping text silently.
    if encoded_input["input_ids"].shape[1] >= MAX_TOKENS:
        truncated_chunk_ids.append(chunk_id)

    # Disable gradient calculation for inference
    with torch.no_grad():
        outputs = model(**encoded_input)

    # Mean over the token axis gives one vector per chunk; squeeze(0) drops
    # the leading batch axis of size 1.
    chunk_embedding = torch.mean(outputs.last_hidden_state, dim=1).squeeze(0).numpy()
    chunk_embeddings.append(chunk_embedding)

    # Store chunk data for export
    chunk_data.append({
        "chunk_id": chunk_id,
        "text": chunk,
        "embedding": chunk_embedding.tolist()
    })

if truncated_chunk_ids:
    print(
        f"Warning: {len(truncated_chunk_ids)} of {len(text_chunks)} chunks reached the "
        f"{MAX_TOKENS}-token limit and were probably truncated; "
        "consider a smaller max_length in chunk_text."
    )

# Average all chunk embeddings (unweighted) to get a document-level embedding
document_embedding = np.mean(chunk_embeddings, axis=0)

## Storing Embeddings in Chroma Database

In [12]:
# Initialize Chroma client
# NOTE(review): default client settings are used; presumably this store is
# in-memory only and does not outlive the kernel — confirm if persistence is needed.
client = chromadb.Client()

# get_or_create_collection (rather than create_collection) lets this cell be
# re-run in the same session without failing on the already existing collection.
collection = client.get_or_create_collection("pdf_embeddings")

# upsert (rather than add) overwrites the record for an existing id, so a
# re-run refreshes "document_1" instead of colliding with it.
# Only the document-level embedding is stored here; the per-chunk
# embeddings are not written to the collection.
collection.upsert(
    ids=["document_1"],  # Unique ID
    embeddings=[document_embedding.tolist()],  # Convert numpy array to list
    metadatas=[{"source": "Data.pdf"}],  # Metadata
    documents=[pdf_text],  # Store the full text
)

print("Document embeddings stored in Chroma database successfully!")

Document embeddings stored in Chroma database successfully!


## Saving Embeddings locally in JSON format

In [13]:
# Create output directory for saved files
output_dir = "embeddings_output"
os.makedirs(output_dir, exist_ok=True)

# Build the payload before opening the file, so a failure while assembling
# it does not leave an empty or partial JSON file behind.
# Note: "full_text" holds only the first 1000 characters (plus "...") when
# the document is longer than that.
if len(pdf_text) > 1000:
    document_text = pdf_text[:1000] + "..."
else:
    document_text = pdf_text

json_data = {
    "document": {
        "source": "Data.pdf",
        "full_text": document_text,
        "embedding": document_embedding.tolist()
    },
    "chunks": chunk_data
}

# Save to JSON file (explicit encoding, matching the CSV export)
json_file_path = os.path.join(output_dir, "pdf_embeddings.json")
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, indent=2)

## Saving Embeddings locally in CSV format

In [14]:
# Save all embeddings to a single CSV file
PREVIEW_CHARS = 100  # maximum characters of text shown in the text_preview column


def _text_preview(text, limit=PREVIEW_CHARS):
    """Return ``text`` cut to ``limit`` characters, with "..." appended only if it was cut."""
    return text[:limit] + "..." if len(text) > limit else text


embeddings_csv_path = os.path.join(output_dir, "embeddings.csv")
with open(embeddings_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)

    # Header: three descriptive columns, then one column per embedding dimension
    embedding_dim = len(document_embedding)
    writer.writerow(["id", "type", "text_preview"] + [f"dim_{i}" for i in range(embedding_dim)])

    # Document row. The preview uses the same rule as the chunk rows, so a
    # short document no longer gets a misleading trailing "...".
    writer.writerow(
        ["document_1", "document", _text_preview(pdf_text)] + document_embedding.tolist()
    )

    # One row per chunk
    for chunk_data_item in chunk_data:
        writer.writerow(
            [chunk_data_item["chunk_id"], "chunk", _text_preview(chunk_data_item["text"])]
            + list(chunk_data_item["embedding"])
        )