## Imports

In [7]:
# !pip install pdfplumber

In [8]:
# !pip install langchain-community

In [9]:
# !pip install chromadb

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pdfplumber
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
import os
from tokenizers.normalizers import BertNormalizer
from langchain_text_splitters import RecursiveCharacterTextSplitter

## Loading Data and Generating Embeddings

In [10]:
# Load PDF text
def read_pdf_plumber(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

pdf_text = read_pdf_plumber("/content/Data.pdf")

# Convert to Document object
document = Document(page_content=pdf_text)

# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents([document])

print(f"Text split into {len(chunks)} chunks")

Text split into 199 chunks


In [11]:
# Load and process vocab mappings
file_path = os.path.join(os.getcwd(), '/content/vocab_mappings.txt')

with open(file_path, 'r', encoding='utf-8') as f:
    mappings = f.read().strip().split('\n')

mappings = {m[0]: m[2:] for m in mappings}

# Initialize normalizer
norm = BertNormalizer(lowercase=False, strip_accents=True, clean_text=True, handle_chinese_chars=True)

def normalize(text):
    text = [norm.normalize_str(s) for s in text.split('\n')]
    out = []
    for s in text:
        norm_s = ''.join(mappings.get(c, ' ') for c in s)
        out.append(norm_s)
    return '\n'.join(out)

In [None]:
!huggingface-cli login --token "your-api-key"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `matscibert` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `matscibert`


In [13]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
model = AutoModel.from_pretrained('m3rg-iitd/matscibert')

# Normalize and tokenize
norm_sents = [normalize(chunk.page_content) for chunk in chunks]  # Normalize chunks of text
tokenized_sents = tokenizer(norm_sents, padding=True, truncation=True, return_tensors='pt')

# Pass the tokenized sentences through the model
with torch.no_grad():
    last_hidden_state = model(**tokenized_sents).last_hidden_state

# Extract embeddings from the last hidden state (take the mean of token embeddings)
sentence_embeddings = last_hidden_state.mean(dim=1)  # Averaging embeddings over tokens


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/323 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/467k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at m3rg-iitd/matscibert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Saving Embeddings into Chroma Database

In [14]:
import chromadb

# Initialize Chroma client
client = chromadb.Client()

# Create a collection in the Chroma database
collection = client.create_collection("pdf_embeddings")

# Store the embeddings in Chroma
for i, embedding in enumerate(sentence_embeddings):
    collection.add(
        documents=[norm_sents[i]],  # The normalized sentence
        embeddings=[embedding.numpy().tolist()],  # Convert tensor to list
        metadatas=[{"source": f"chunk_{i}"}],  # Metadata (optional)
        ids=[f"sentence_{i}"]  # Unique ID for each sentence
    )

print("Embeddings stored in Chroma database.")


Embeddings stored in Chroma database.


In [16]:
# Retrieve the stored documents and embeddings using their IDs in order
results = collection.get(ids=[f"sentence_{i}" for i in range(len(sentence_embeddings))], include=["documents", "embeddings"])

## Saving files locally in JSON Format

In [18]:
import chromadb
import json
import numpy as np  # Import NumPy
from google.colab import files

# Initialize ChromaDB client and connect to the collection
chroma_client = chromadb.Client()  # In-memory client
collection = chroma_client.get_collection(name="pdf_embeddings")  # Use the correct collection name

# Retrieve all embeddings along with their document IDs
results = collection.get(include=["embeddings", "metadatas", "documents"])

# Structure the data
embeddings_data = []
for doc_id, embedding, metadata, document in zip(results['ids'], results['embeddings'], results['metadatas'], results['documents']):
    embeddings_data.append({
        "document_id": doc_id,
        "embedding": embedding.tolist() if isinstance(embedding, np.ndarray) else embedding,  # Convert ndarray to list
        "metadata": metadata,  # Optional, only if you stored metadata
        "document": document
    })

# Save as JSON
json_filename = "chroma_embeddings.json"
with open(json_filename, "w") as f:
    json.dump(embeddings_data, f, indent=4)

# Download the JSON file
files.download(json_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Saving files locally in CSV format

In [17]:
import numpy as np
import pandas as pd

# Convert embeddings to a NumPy array
embeddings_array = np.array([embedding.numpy() for embedding in sentence_embeddings])

# Save as CSV
csv_filename = "embeddings.csv"
df = pd.DataFrame(embeddings_array)
df.to_csv(csv_filename, index=False)

In [19]:
# Save as CSV
import csv
csv_filename = "chroma_embeddings.csv"
with open(csv_filename, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Write header
    writer.writerow(["document_id", "embedding", "metadata", "document"])
    # Write data
    for entry in embeddings_data:
        writer.writerow([
            entry["document_id"],
            json.dumps(entry["embedding"]),  # Store embedding as JSON string
            entry["metadata"],  # Already converted to JSON string
            entry["document"]
        ])

# Download the CSV file
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>