In [10]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

# Connect to local Qdrant container
client = QdrantClient(host="localhost", port=6333)


Prepare the dataset.

In [1]:
# Add parent directory to sys.path
import sys
import os
import re
import importlib
import pdfminer
from pdfminer.high_level import extract_text
from tqdm import tqdm

# Get relevant directory paths
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
notebook_dir = os.getcwd()
dataset_dir = os.path.join(parent_dir, "dataset")

sys.path.append(parent_dir)

In [2]:
# Code to extract the dataset
def read_pdf(filepath): # takes in the absolute filepath
    text = extract_text(filepath)
    return text

In [3]:
# Code to clean the dataset
def clean_text(text):
    # Remove bracketed citations like [1], [12], etc.
    text = re.sub(r'\[\d+\]', '', text)
    
    # Remove parenthetical citations like (Smith et al., 2020)
    text = re.sub(r'\([^\)]+et al\.,?\s*\d{4}\)', '', text)
    
    # Remove section numbers at start of lines like "1.", "2.1", "3.2.1", etc.
    text = re.sub(r'^\d+(\.\d+)*\s+', '', text, flags=re.MULTILINE)
    
    # Remove all-caps headings or lines that are just numbers
    text = re.sub(r'^[A-Z\s]{3,}$', '', text, flags=re.MULTILINE)
    
    # --- Remove table-like lines ---
    # Heuristic: lines with lots of whitespace-separated "columns" or digits
    lines = text.split("\n")
    cleaned_lines = []
    for line in lines:
        # Count words separated by 2+ spaces or tabs
        if len(re.findall(r'  +|\t', line)) >= 2:
            continue  # likely a table row → skip
        # Skip lines with many pipe characters (markdown-style tables)
        if line.count("|") >= 2:
            continue
        cleaned_lines.append(line)
    
    text = "\n".join(cleaned_lines)
    
    # Optional: collapse multiple spaces/newlines
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    
    return text.strip()

In [4]:
def save_cleaned_text(text, output_dir, filename):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Create the full output path
    output_path = os.path.join(output_dir, f"{filename}.txt")

    # Save text to the file (UTF-8 handles special characters)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)

    # print(f"✅ Saved cleaned text to: {output_path}")

In [5]:
dataset_dir = os.path.join(dataset_dir, "TP53_effects_breast_cancer")

In [6]:
def process_pdfs(input_dir, output_dir):
    pdf_files = [f for f in os.listdir(input_dir) if f.endswith(".pdf")]

    for file in tqdm(pdf_files, desc="Processing PDFs", unit="file"):
        pdf_path = os.path.join(input_dir, file)
        raw_text = read_pdf(pdf_path)
        # cleaned_text = clean_text(raw_text)

        filename = os.path.splitext(file)[0]
        save_cleaned_text(raw_text, output_dir, filename)

processed_dataset_dir = os.path.join(parent_dir, "processed_dataset")
process_pdfs(dataset_dir, processed_dataset_dir)

Processing PDFs:   0%|          | 0/13 [00:00<?, ?file/s]

Processing PDFs: 100%|██████████| 13/13 [01:09<00:00,  5.33s/file]


In [78]:
# Create collection
collection_name = "demo_collection"
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance="Cosine")
)

# Load model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

  client.recreate_collection(


In [3]:
# Sample data
texts = [
    "Machine learning enables computers to learn from data.",
    "Vector databases store high-dimensional embeddings efficiently.",
    "Qdrant makes it easy to perform semantic search."
]

# Generate embeddings
embeddings = model.encode(texts)

In [4]:
# Insert points
points = [
    PointStruct(id=i, vector=embeddings[i].tolist(), payload={"text": texts[i]})
    for i in range(len(texts))
]
client.upsert(collection_name=collection_name, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [5]:
# Verify
result = client.scroll(collection_name=collection_name, limit=3)
for point in result[0]:
    print(point.payload)

{'text': 'Machine learning enables computers to learn from data.'}
{'text': 'Vector databases store high-dimensional embeddings efficiently.'}
{'text': 'Qdrant makes it easy to perform semantic search.'}
