# Import the necessary libraries, modules, Datasets

## Import Libraries and Modules

In [17]:
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split

## Import Dataset


In [18]:
# Read the three PubMedQA splits (labeled, artificial, unlabeled) from the Hugging Face Hub.
df_labeled = pd.read_parquet("hf://datasets/qiaojin/PubMedQA/pqa_labeled/train-00000-of-00001.parquet")
df_artificial = pd.read_parquet("hf://datasets/qiaojin/PubMedQA/pqa_artificial/train-00000-of-00001.parquet")
df_unlabeled = pd.read_parquet("hf://datasets/qiaojin/PubMedQA/pqa_unlabeled/train-00000-of-00001.parquet")

# Add an empty final_decision column to the unlabeled split so it lines up
# with the other two frames when they are concatenated.
df_unlabeled["final_decision"]=None

# ignore_index=True gives the combined frame a fresh 0..N-1 index.
df=pd.concat([df_labeled, df_artificial, df_unlabeled], ignore_index=True)

print(f"Labled Shape: {df_labeled.shape}")
print(f"Artificial Shape: {df_artificial.shape}")
print(f"Unlabled: {df_unlabeled.shape}")
print(f"Combined Shape: {df.shape}")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
df.info()

Labled Shape: (1000, 5)
Artificial Shape: (211269, 5)
Unlabled: (61249, 5)
Combined Shape: (273518, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273518 entries, 0 to 273517
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   pubid           273518 non-null  int32 
 1   question        273518 non-null  object
 2   context         273518 non-null  object
 3   long_answer     273518 non-null  object
 4   final_decision  212269 non-null  object
dtypes: int32(1), object(4)
memory usage: 9.4+ MB
None


## Split Dataset into test and train

### First split the dataframe into train and test

In [19]:
# Hold out 20% of the combined rows as the test set; random_state pins the
# shuffle so re-running the cell reproduces the same partition.
# NOTE(review): df still contains the unlabeled rows (final_decision is null),
# so both partitions mix labeled and unlabeled examples -- confirm this is intended.
train_set, test_set=train_test_split(df, test_size=0.2, random_state=42)

In [20]:
# Sanity check: report the (rows, columns) shape of each partition.
print(f"Training Set Rows: {train_set.shape}")
print(f"Test Set Rows: {test_set.shape}")

Training Set Rows: (218814, 5)
Test Set Rows: (54704, 5)


### Flatten the dataset
* Currently the `context` column in the dataset looks like this (a dictionary with the keys `contexts`, `labels`, `meshes`):
    {
    'contexts': [
        "Use of aspirin is common.",          # Sentence 1
        "We studied 500 patients.",           # Sentence 2
        "Results showed reduced pain."        # Sentence 3
    ],
    'labels': ["BACKGROUND", "METHODS", "RESULTS"],
    'meshes': ["Aspirin", "Pain"]
    }

 * We check whether each row's `context` contains a key called `contexts`; if it does, we join every sentence from `contexts` into a single space-separated text (otherwise the text is left empty).

In [21]:
def flatten_context(row):
    """Collapse a row's structured context into one space-separated string.

    Args:
        row: a mapping-like row (for example a DataFrame row) with a
            'context' entry.

    Returns:
        The items of ``row['context']['contexts']`` joined by single spaces
        when 'context' is a dict containing a 'contexts' key; otherwise the
        empty string.
    """
    context = row['context']

    # Anything that is not a dict carries no sentence list to flatten.
    if not isinstance(context, dict):
        return ""

    # A dict without the 'contexts' key is treated the same way.
    if 'contexts' not in context:
        return ""

    sentences = context['contexts']
    return " ".join(sentences)
    

* We apply the flatten_context function to every row of the train set and the test set.
* We apply it to the test set as well, purely for uniformity.
* We call .copy() first to detach train_set and test_set from the original dataframe (df).

In [22]:
# Detach both partitions from the frame they were split from before adding a column.
train_set = train_set.copy()
test_set = test_set.copy()

# Add the flattened text column to each partition, row by row.
for split_frame in (train_set, test_set):
    split_frame['full_text'] = split_frame.apply(flatten_context, axis=1)

In [23]:
# DataFrame.info() prints its own report and returns None, so call it directly
# instead of wrapping it in print(), which only adds a stray "None" line.
train_set.info()
# pubid is used as the document identifier when chunking, so check how many distinct values it has.
print(f"No of unique pubids: {train_set['pubid'].nunique()}")

<class 'pandas.core.frame.DataFrame'>
Index: 218814 entries, 141060 to 121958
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   pubid           218814 non-null  int32 
 1   question        218814 non-null  object
 2   context         218814 non-null  object
 3   long_answer     218814 non-null  object
 4   final_decision  169916 non-null  object
 5   full_text       218814 non-null  object
dtypes: int32(1), object(5)
memory usage: 10.9+ MB
None
No of unique pubids: 218814


In [24]:
train_set.head()

Unnamed: 0,pubid,question,context,long_answer,final_decision,full_text
141060,23356518,Do magnolia polyphenols attenuate oxidative an...,{'contexts': ['The bark of magnolia has been u...,This study highlights the important role of NA...,yes,The bark of magnolia has been used in Oriental...
101826,25476117,Are olfactory identification deficits at ident...,{'contexts': ['We have previously reported tha...,These results suggest that impaired OI is not ...,yes,We have previously reported that olfactory ide...
39223,18289138,Are no differences seen in the regional cerebr...,{'contexts': ['Anorexia nervosa (AN) is subdiv...,Abnormalities of the neuronal circuits contain...,yes,Anorexia nervosa (AN) is subdivided into the r...
42170,21725797,Does proton pump inhibitor prophylaxis increas...,{'contexts': ['Stress-related mucosal damage i...,The use of a PPI as a prophylactic treatment a...,yes,Stress-related mucosal damage is an erosive pr...
104449,23731765,Does smooth muscle cell transplantation improv...,{'contexts': ['Damage to smooth muscle has bee...,During the 2-week follow-up period after trans...,yes,Damage to smooth muscle has been the primary c...


# Chunking

In [None]:
import pandas as pd
from transformers import AutoTokenizer

# 1. Set up the tokenizer.
# This loads the tokenizer published with "BAAI/bge-m3", the same model name the
# notebook later passes to SentenceTransformer, so chunk sizes are counted in
# that model's tokens.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")

# 2. Define the Chunking Function
def chunk_text_to_df(input_df, text_col='full_text', id_col='pubid', chunk_size=256, overlap=32, tokenizer=None):
    """Split every document into overlapping token windows, one output row per chunk.

    Each text is tokenized without special tokens, cut into windows of at most
    ``chunk_size`` tokens that start every ``chunk_size - overlap`` tokens, and
    each window is decoded back to text.

    Args:
        input_df: DataFrame holding one document per row.
        text_col: name of the column containing the document text.
        id_col: name of the column containing the document identifier.
        chunk_size: maximum number of tokens per chunk; must be positive.
        overlap: number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.
        tokenizer: object exposing ``encode(text, add_special_tokens=False)``
            and ``decode(tokens)``. When omitted, the module-level
            ``tokenizer`` is used, as before.

    Returns:
        DataFrame with the columns ``chunk_id`` ("<doc_id>_chunk_<NN>"),
        ``document_id``, ``chunk_index`` and ``chunk_text``. The columns are
        present even when no chunk was produced. Documents whose text yields
        no tokens contribute no rows.

    Raises:
        ValueError: if ``chunk_size``/``overlap`` are inconsistent, or no
            tokenizer is available.
    """
    # Validate up front: a zero stride makes range() fail with an unrelated
    # message, and a negative stride silently produces no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    if tokenizer is None:
        # The parameter shadows the notebook-level name, so look it up explicitly.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise ValueError("No tokenizer supplied and no module-level `tokenizer` is defined.")

    output_columns = ["chunk_id", "document_id", "chunk_index", "chunk_text"]

    # Calculate the stride (how many tokens to move forward between chunks)
    stride = chunk_size - overlap

    print(f"Processing {len(input_df)} documents...")

    all_chunks = []

    # Iterate over the two needed columns directly; iterrows() builds a Series
    # per row, which is needlessly slow for a large frame.
    for doc_id, text in zip(input_df[id_col], input_df[text_col]):
        # Tokenize the entire text as a plain list of token ids
        tokens = tokenizer.encode(text, add_special_tokens=False)

        # Create sliding window chunks
        for chunk_index, start in enumerate(range(0, len(tokens), stride)):
            chunk_tokens = tokens[start : start + chunk_size]

            all_chunks.append({
                # Unique chunk ID, e.g. 42_chunk_00
                "chunk_id": f"{doc_id}_chunk_{chunk_index:02d}",
                "document_id": doc_id,
                "chunk_index": chunk_index,
                # Decode the window back to text
                "chunk_text": tokenizer.decode(chunk_tokens),
            })

            # This window already reached the end of the document; a further
            # window would contain nothing but overlap tokens.
            if start + chunk_size >= len(tokens):
                break

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(all_chunks, columns=output_columns)

# 3. Chunk the training split.
# train_set must already carry the flattened 'full_text' column and the
# 'pubid' column that serves as the document identifier.
chunk_df = chunk_text_to_df(train_set, text_col='full_text', id_col='pubid',
                            chunk_size=256, overlap=32)

# 4. Preview the first rows and report how many chunks were produced.
print(chunk_df.head())
print(f"Total chunks generated: {len(chunk_df)}")

## Save the chunked data to disk/working directory as HF Dataset

In [None]:
from datasets import load_from_disk
from datasets import Dataset
# Target directory for the serialized dataset (relative to the working directory).
save_path = "./chunked_data_v1"

# Wrap the chunk table in a Hugging Face Dataset and write it to disk.
hf_chunk_df = Dataset.from_pandas(chunk_df)
hf_chunk_df.save_to_disk(save_path)

print(f"✅ Dataset saved successfully to {save_path}")

Zip the chunk table folder

In [None]:
!zip -r PubMED_RAG_chunk_table.zip chunked_data_v1

## Load the saved chunk table as a HF Dataset

In [1]:
from datasets import load_from_disk
import os

save_path = "/kaggle/input/pubmed-rag/chunked_data_v1"

# Fail fast with a clear error when the dataset is missing. Previously the cell
# only printed a warning and then crashed with a NameError on hf_chunked_df,
# because the conversion below ran regardless of whether anything was loaded.
if not os.path.exists(save_path):
    raise FileNotFoundError(
        f"⚠️ No saved dataset found at {save_path}. You need to run the chunking code first."
    )

# Load the previously saved Hugging Face dataset directly from disk.
hf_chunked_df = load_from_disk(save_path)
print("✅ Loaded dataset from disk!")

# Verify it looks right
print(hf_chunked_df)

# Convert to pandas for the rest of the notebook.
chunked_df=hf_chunked_df.to_pandas()

print("The dataframe looks like this: ")
print(chunked_df)

✅ Loaded dataset from disk!
Dataset({
    features: ['chunk_id', 'document_id', 'chunk_index', 'chunk_text'],
    num_rows: 421148
})
The dataframe looks like this: 
                 chunk_id  document_id  chunk_index  \
0       23356518_chunk_00     23356518            0   
1       23356518_chunk_01     23356518            1   
2       23356518_chunk_02     23356518            2   
3       25476117_chunk_00     25476117            0   
4       25476117_chunk_01     25476117            1   
...                   ...          ...          ...   
421143  27011950_chunk_00     27011950            0   
421144  20463894_chunk_00     20463894            0   
421145  20463894_chunk_01     20463894            1   
421146  19663554_chunk_00     19663554            0   
421147  19663554_chunk_01     19663554            1   

                                               chunk_text  
0       The bark of magnolia has been used in Oriental...  
1       (DHE) was used to assay superoxide production

## Reset Index to freeze chunk order

In [2]:
# Re-bind chunked_df itself. The reset result used to be stored under a
# different name (chunk_df) while the print below and every later cell kept
# reading chunked_df, so the reset was never actually applied to the frame in use.
chunked_df=chunked_df.reset_index(drop=True)
# Keep the old name bound to the same frame in case other cells still refer to it.
chunk_df=chunked_df
print(f"frozen datafram: \n{chunked_df}")

frozen datafram: 
                 chunk_id  document_id  chunk_index  \
0       23356518_chunk_00     23356518            0   
1       23356518_chunk_01     23356518            1   
2       23356518_chunk_02     23356518            2   
3       25476117_chunk_00     25476117            0   
4       25476117_chunk_01     25476117            1   
...                   ...          ...          ...   
421143  27011950_chunk_00     27011950            0   
421144  20463894_chunk_00     20463894            0   
421145  20463894_chunk_01     20463894            1   
421146  19663554_chunk_00     19663554            0   
421147  19663554_chunk_01     19663554            1   

                                               chunk_text  
0       The bark of magnolia has been used in Oriental...  
1       (DHE) was used to assay superoxide production ...  
2       ted IFNγ±LPS-induced iNOS expression, NO, and ...  
3       We have previously reported that olfactory ide...  
4       up between in

# Build the Dense Retrieval Class

## Define the class structure

In [4]:
class DenseRetrievalKnowledgeBase:
    """Holds a chunk table together with placeholders for its dense index.

    Attributes:
        chunked_df: the chunk table passed to the constructor, stored as given.
        metadata: list with one dict per row of ``chunked_df``, in row order.
        embeddings: initialised to ``None``.
        index: initialised to ``None``.
    """

    def __init__(self, chunked_df):
        # Placeholders; nothing in this class fills them in.
        self.index = None
        self.embeddings = None

        # Keep the table itself plus a row-ordered list of per-chunk records.
        self.chunked_df = chunked_df
        records = chunked_df.to_dict(orient="records")
        self.metadata = records
        

* self.metadata is a list of dictionaries that looks like this:
 [
  {
    "chunk_id": "doc_01_chunk_00",
    "document_id": "doc_01",
    "chunk_index": 0,
    "chunk_text": "Aspirin reduces inflammation..."
  },
  {
    "chunk_id": "doc_01_chunk_01",
    "document_id": "doc_01",
    "chunk_index": 1,
    "chunk_text": "COX inhibition mechanisms..."
  },
  {
    "chunk_id": "doc_02_chunk_00",
    "document_id": "doc_02",
    "chunk_index": 0,
    "chunk_text": "Colorectal cancer risk..."
  }
 ]

* Each element of the list is a dictionary that contains the corresponding chunk_id, doc_id, chunk_index and chunk_text.

* The i-th entry of this list corresponds to the i-th embedding in the FAISS vector index.

* This metadata list helps to map the embeddings back to the actual text and chunk data.


## Create the class object

In [5]:
# Build the knowledge base from the chunk table.
dense_kb = DenseRetrievalKnowledgeBase(chunked_df)

# Spot-check: number of records, the fields of the first record, and the
# first 200 characters of its text.
first_record = dense_kb.metadata[0]
print(len(dense_kb.metadata))
print(first_record.keys())
print(first_record["chunk_text"][:200])

421148
dict_keys(['chunk_id', 'document_id', 'chunk_index', 'chunk_text'])
The bark of magnolia has been used in Oriental medicine to treat a variety of remedies, including some neurological disorders. Magnolol (Mag) and honokiol (Hon) are isomers of polyphenolic compounds f


## Load Embedding Model

In [6]:
from sentence_transformers import SentenceTransformer

# Load the embedding model by name; this is the same model name whose
# tokenizer was used in the chunking step.
model_name="BAAI/bge-m3"
dense_model=SentenceTransformer(model_name)

2026-01-06 06:36:27.691175: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767681387.886319      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767681387.941515      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767681388.413248      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767681388.413284      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767681388.413287      55 computation_placer.cc:177] computation placer alr

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

## Prepare to load the chunks into the embedding model

Save all the chunk texts in a list

In [8]:
# Text of every chunk, in metadata order, so position i here matches record i.
chunk_texts = [record["chunk_text"] for record in dense_kb.metadata]

Embed the texts

In [11]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer

# Encode every chunk text with dense_model through a multi-process pool, then
# L2-normalise the resulting matrix row by row.

# 1. Setup & Check Hardware
# ---------------------------------------------------------
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

# Only a warning: the cell continues with the multi-process path either way.
if torch.cuda.device_count() < 2:
    print("⚠️ Warning: Less than 2 GPUs detected. Multi-process might not yield speedups.")

# 2. Start Pool
# ---------------------------------------------------------
# dense_model is the SentenceTransformer instance loaded in an earlier cell.
pool = dense_model.start_multi_process_pool()
print("Multi-GPU pool started successfully.")

try:
    # 3. Encode
    # ---------------------------------------------------------
    # chunk_texts is the list of chunk strings built in the previous cell.
    # NOTE(review): the recorded output shows a warning pointing at this call;
    # newer sentence-transformers releases may deprecate encode_multi_process --
    # confirm against the installed version.
    dense_embeddings = dense_model.encode_multi_process(
        sentences=chunk_texts,
        pool=pool,
        batch_size=128, # 128 is usually safe for T4s (16GB VRAM). If OOM, drop to 64.
        show_progress_bar=True
    )
finally:
    # 4. Stop Pool (in finally so the pool is stopped even if encoding fails)
    # ---------------------------------------------------------
    dense_model.stop_multi_process_pool(pool)
    print("Multi-GPU pool stopped.")

# 5. Manual Normalization
# ---------------------------------------------------------
print("Normalizing embeddings...")

# L2 Norm: SQRT( sum(x^2) ), computed per row (axis=1).
# keepdims=True keeps the norms as an (N, 1) column so the division broadcasts
# across the (N, D) embedding matrix; the recorded run reports D = 1024.
norms = np.linalg.norm(dense_embeddings, axis=1, keepdims=True)

# Divide by the norm to get unit vectors.
# Clipping the norms at 1e-12 avoids division by zero for all-zero rows.
dense_embeddings = dense_embeddings / np.clip(norms, 1e-12, None)

print(f"Final normalized shape: {dense_embeddings.shape}")

CUDA available: True
Number of GPUs: 2


2026-01-06 06:49:40.386465: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767682180.407541     209 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767682180.414018     209 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767682180.430695     209 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767682180.430720     209 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767682180.430723     209 computation_placer.cc:177] computation placer alr

Multi-GPU pool started successfully.


  dense_embeddings = dense_model.encode_multi_process(


Chunks:   0%|          | 0/85 [00:00<?, ?it/s]

Multi-GPU pool stopped.
Normalizing embeddings...
Final normalized shape: (421148, 1024)


# NOTE(review): this cell has no execution count and re-assigns
# dense_embeddings, so on a top-to-bottom run it would encode every chunk a
# second time and overwrite the result of the multi-process cell. It looks like
# a single-process alternative (normalisation is requested through
# normalize_embeddings=True here rather than done manually) -- confirm which
# path is intended and run only one of them.
dense_embeddings = dense_model.encode(
    chunk_texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)


## Save the embeddings along with its corresponding chunk

In [13]:
import pickle

# 1. Bundle the chunk texts and their vectors into one dict of two parallel
#    sequences: "texts" holds the encoded strings, "embeddings" the array
#    produced from them.
data_to_save = dict(
    texts=chunk_texts,
    embeddings=dense_embeddings,
)

# 2. Serialise the bundle to a pickle file in the working directory.
with open('rag_data.pkl', 'wb') as f:
    pickle.dump(data_to_save, f)

print("Saved 'rag_data.pkl' successfully.")

Saved 'rag_data.pkl' successfully.


In [14]:
!zip rag_data.zip rag_data.pkl

  adding: rag_data.pkl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 17%)


## Load the saved embeddings
From now on, the embeddings can be loaded from the PubMed RAG input dataset instead of being recomputed.
* The dataset looks like this:  
  
{
      

    "texts":  [ "text_chunk_0", "text_chunk_1", ... "text_chunk_N" ],
    
    "embeddings":  [[ 0.12, -0.45, ... ],  # Vector for chunk_0
                    [ 0.88,  0.02, ... ],  # Vector for chunk_1
                    ...
                    [ 0.05,  0.11, ... ]]  # Vector for chunk_N
}