In [57]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

# Connect to local Qdrant container
client = QdrantClient(host="localhost", port=6333)


Prepare the dataset.

In [74]:
# Add parent directory to sys.path
import sys
import os
import importlib
import pdf_to_clean_text

# Get current notebook directory
notebook_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(parent_dir)

importlib.reload(pdf_to_clean_text)

from pdf_to_clean_text import clean_text, read_pdf2


In [75]:
# Load the dataset.
dataset_dir = os.path.join(os.getcwd(), "..", "dataset")
print(dataset_dir)
print(os.listdir(dataset_dir))


/home/non-sudo/CSE-291-RAG-Project-UCSD/notebooks/../dataset
['TP53_effects_breast_cancer', 'pq2402015632.pdf']


In [76]:
breast_cancer_dir = os.path.join(dataset_dir, "TP53_effects_breast_cancer")

In [77]:
for file in os.listdir(breast_cancer_dir):
    raw_text = read_pdf2(os.path.join(breast_cancer_dir, file))
    # print(raw_text)
    cleaned_text = clean_text(raw_text)
    print(cleaned_text)
    break

Contents lists available at ScienceDirect 

Cancer Treatment Reviews 

journal homepage: www.elsevier.com/locate/ctrv 

Tumour Review 

Germline TP53 pathogenic variants and breast cancer: A narrative review 

Eva Blondeaux a,*, Luca Arecco b,c, Kevin Punie d, Rossella Graffeo e, Angela Toss f, 
Carmine De Angelis g, Lucia Trevisan h, Giulia Buzzatti h, Sabine C. Linn i, Peter Dubsky j, 
Mara Cruellas k, Ann H. Partridge l, Judith Balma˜na k, Shani Paluch-Shimon m, 
Matteo Lambertini b,c 
a Clinical Epidemiology Unit, IRCCS Ospedale Policlinico San Martino, Genoa, Italy 
b Department of Internal Medicine and Medical Specialties (DiMI), School of Medicine, University of Genova, Genoa, Italy 
c Department of Medical Oncology, U.O. Clinica di Oncologia Medica, IRCCS Ospedale Policlinico San Martino, Genoa, Italy 
d Department of General Medical Oncology, University Hospitals Leuven, KU Leuven, Leuven, Belgium 
e Oncology Institute of Southern Switzerland, EOC, Bellinzona, Switzerland 
f D

In [78]:
# Create collection
collection_name = "demo_collection"
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance="Cosine")
)

# Load model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

  client.recreate_collection(


In [3]:
# Sample data
texts = [
    "Machine learning enables computers to learn from data.",
    "Vector databases store high-dimensional embeddings efficiently.",
    "Qdrant makes it easy to perform semantic search."
]

# Generate embeddings
embeddings = model.encode(texts)

In [4]:
# Insert points
points = [
    PointStruct(id=i, vector=embeddings[i].tolist(), payload={"text": texts[i]})
    for i in range(len(texts))
]
client.upsert(collection_name=collection_name, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [5]:
# Verify
result = client.scroll(collection_name=collection_name, limit=3)
for point in result[0]:
    print(point.payload)

{'text': 'Machine learning enables computers to learn from data.'}
{'text': 'Vector databases store high-dimensional embeddings efficiently.'}
{'text': 'Qdrant makes it easy to perform semantic search.'}
