In [None]:
# TODO: author email ending removal 
# 

In [1]:
import os, json
import copy
from tqdm import tqdm
from pymilvus import connections,Collection, utility, FieldSchema, CollectionSchema, DataType
from dotenv import load_dotenv
import torch
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter

device = "cuda" if torch.cuda.is_available() else "cpu"
# for big model BAAI/bge-m3
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1", device=device)
BATCH_SIZE = 100
load_dotenv()


True

In [2]:
print(device)
print(torch.version.cuda)

cuda
11.8


In [3]:

connections.connect(
    alias="default",
    host="127.0.0.1",
    port="19530",
    timeout=5,
    secure=False
)

print("Connected to Milvus:", utility.get_server_version())


Connected to Milvus: v2.3.21


In [4]:

# incase I need to delete it
collection_name = "osdr"

# Check if collection exists
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)
    print(f"Collection '{collection_name}' deleted.")
else:
    print(f"Collection '{collection_name}' does not exist.")

Collection 'osdr' does not exist.


In [5]:
def load_docs(folder_path:str) -> list:
    """load json files from specified folder into list, not load if text field is empty"""
    docs = list()

    for file in os.listdir(folder_path):
        if file.endswith(".json"):
            with open(os.path.join(folder_path, file) , "r", encoding="utf-8") as doc:
                data = json.load(doc)
                if data["description"] != "":
                    docs.append(data)
                
    return docs

In [6]:

folder_path = "./data/osdr_raw/"
documents = load_docs(folder_path)


In [7]:
def clean_osd_name(osd:json):
    """clean osd name from trailing escape characters and spaces"""
    osd["study_name"] = osd["study_name"].replace(r"\n","").strip()
def clean_text(osd:json):
    """strip main text"""
    osd["text"] = osd["text"].strip()

def remove_email(osd:json):
    if len(osd["authors"]) > 1:
        authors_cleaned = []
    
        for author in osd["authors"]:
            if author.endswith("email"):
                author = author.replace("email","")
            author = author.strip()
            authors_cleaned.append(author)
        osd["authors"] = authors_cleaned



In [None]:
for doc in documents:
    clean_osd_name(doc)
    remove_email(doc)


In [None]:
osd_split_list = []
for doc in documents:
    doi = doc["doi"]
    name = doc["study_name"]
    id = doc['genelab_id']
    organisms = ",".join(doc['organisms'])
    authors = ",".join(doc['authors'])
    link = doc['link']
    
    desc = doc['description']
    base_info = {"doi":doi, 'name':name, 'study_id':id, "organisms":organisms, "authors":authors,"link":link}
    main_desc_doc = copy.deepcopy(base_info)
    main_desc_doc["type"] = "description"
    main_desc_doc["text"] = desc
    main_desc_doc['protocole_name'] = ""
    # add main description doc
    osd_split_list.append(main_desc_doc)

    protocoles = doc["protocole_samples"]
    if len(protocoles) > 1:
        for protocole in protocoles:
            protocole_doc = copy.deepcopy(base_info)
            protocole_doc['type'] = "protocole"
            protocole_doc["text"] = protocole['description']
            protocole_doc['protocole_name'] = protocole["name"]
            osd_split_list.append(protocole_doc)

In [18]:
import pickle


def save_to_pickle(filename,object_to_save):
    if ".pkl" not in filename:
        filename = filename+".pkl"
    with open(filename, "wb") as f:
        pickle.dump(object_to_save,f)

def load_pickle(filename):
    if ".pkl" not in filename:
        filename = filename + ".pkl"
    with open(filename, "rb") as f:
        return pickle.load(f)
# save_to_pickle("semantic_chunks.pkl", all_chunks)

In [20]:
save_to_pickle("osd_split_list.pkl",osd_split_list )


{'doi': '10.25966/t6db-f633',
 'name': 'Expression data from drosophila melanogaster',
 'study_id': 'GLDS-1',
 'organisms': 'Drosophila melanogaster',
 'authors': 'Deborah Kimbrell,Michael George',
 'link': 'https://osdr.nasa.gov/bio/repo/data/studies/OSD-1',
 'type': 'description',
 'text': 'Space travel presents unlimited opportunities for exploration and discovery, but requires a more complete understanding of the immunological consequences of long-term exposure to the conditions of spaceflight. To understand these consequences better and to contribute to design of effective countermeasures, we used the Drosophila model to compare innate immune responses to bacteria and fungi in flies that were either raised on earth or in outer space aboard the NASA Space Shuttle Discovery (STS-121). Microarrays were used to characterize changes in gene expression that occur in response to infection by bacteria and fungus in drosophila that were either hatched and raised in outer space (microgravit

In [26]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=120)
all_chunks = []

for doc in tqdm(osd_split_list,"Processing Docs"):
    chunks = text_splitter.create_documents([doc["text"]])
    for chunk in chunks:
        all_chunks.append({
            "doi": doc["doi"],
            "name": doc["name"],
            "study_id": doc["study_id"],
            "organisms": doc["organisms"],
            "authors":doc["authors"],
            "link": doc["link"],
            "type": doc["type"],
            "text" : chunk.page_content,
            "protocole_name":doc["protocole_name"]
        })
    

Processing Docs: 100%|██████████| 1557/1557 [00:00<00:00, 5782.61it/s]


In [27]:
len(all_chunks)



4210

In [28]:
all_chunks[0]

{'doi': '10.25966/t6db-f633',
 'name': 'Expression data from drosophila melanogaster',
 'study_id': 'GLDS-1',
 'organisms': 'Drosophila melanogaster',
 'authors': 'Deborah Kimbrell,Michael George',
 'link': 'https://osdr.nasa.gov/bio/repo/data/studies/OSD-1',
 'type': 'description',
 'text': 'Space travel presents unlimited opportunities for exploration and discovery, but requires a more complete understanding of the immunological consequences of long-term exposure to the conditions of spaceflight. To understand these consequences better and to contribute to design of effective countermeasures, we used the Drosophila model to compare innate immune responses to bacteria and fungi in flies that were either raised on earth or in outer space aboard the NASA Space Shuttle Discovery (STS-121). Microarrays were used to characterize changes in gene expression that occur in response to',
 'protocole_name': ''}

In [29]:
all_chunks_length_cleaned = []
dropped_chunks = 0
for chunk in all_chunks:
    if "List of" not in chunk['text']:
        if len(chunk['text'].strip()) > 100:
            all_chunks_length_cleaned.append(chunk)
            
        else:
            dropped_chunks +=1
    else:
        dropped_chunks+=1
print("dropped_chunks:" + str(dropped_chunks))


dropped_chunks:32


In [30]:
import numpy as np 

BATCH_SIZE = 256
all_chunks = all_chunks_length_cleaned

def embed_chunks_in_batches(chunks, batch_size=BATCH_SIZE):
    vectors = []
    texts = [c["text"] for c in chunks]
    batch_vectors = model.encode(
        texts, 
        normalize_embeddings=True, 
        batch_size=batch_size,
        show_progress_bar=True
    ) 
    vectors.extend(batch_vectors)
    return np.array(vectors)

In [31]:
vectors = embed_chunks_in_batches(all_chunks)

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

In [32]:
save_to_pickle("vectors_osdr_v1.pkl", vectors)


In [None]:
vectors = load_pickle("vectors_osdr_v1.pkl")

In [33]:
bad_chunks = [c for c in all_chunks if len(c["text"]) > 600]
for bc in bad_chunks[:5]:
    print(len(bc["text"]), repr(bc["text"][:120]))


In [34]:
fields = [
    FieldSchema(
        name="id",
        dtype=DataType.INT64,
        is_primary=True,
        auto_id=True
    ),
    FieldSchema(
        name="embedding",
        dtype=DataType.FLOAT_VECTOR,
        dim=384,
        metric_type="COSINE"
    ),
    FieldSchema(
        name="doi",
        dtype=DataType.VARCHAR,
        max_length=200
    ),
    FieldSchema(
        name="name",
        dtype=DataType.VARCHAR,
        max_length=500
    ),
    FieldSchema(
        name="study_id",
        dtype=DataType.VARCHAR,
        max_length=50
    ),
    FieldSchema(
        name="organisms",
        dtype=DataType.VARCHAR,
        max_length=1000  # can store comma-separated list
    ),
    FieldSchema(
        name="authors",
        dtype=DataType.VARCHAR,
        max_length=2000  # store as list or comma-separated
    ),
    FieldSchema(
        name="link",
        dtype=DataType.VARCHAR,
        max_length=500
    ),
    FieldSchema(
        name="type",
        dtype=DataType.VARCHAR,
        max_length=20
    ),
    FieldSchema( name="text", dtype=DataType.VARCHAR, max_length=2000 ),
    FieldSchema(
        name="protocole_name",
        dtype=DataType.VARCHAR,
        max_length=200
    ) 
]
schema = CollectionSchema(
    fields,
    description="RAG collection with OSD experiment metadata"
)

collection = Collection("osdr", schema)

In [37]:
from datetime import datetime

for i in range(0, len(all_chunks), BATCH_SIZE):
    batch_vectors = vectors[i:i+BATCH_SIZE]
    batch_names = [c["name"] for c in all_chunks[i:i+BATCH_SIZE]]
    batch_texts = [c["text"] for c in all_chunks[i:i+BATCH_SIZE]]
    batch_authors = [c["authors"] for c in all_chunks[i:i+BATCH_SIZE]]
    batch_study_ids =  [c["study_id"] for c in all_chunks[i:i+BATCH_SIZE]]
    batch_organisms =  [c["organisms"] for c in all_chunks[i:i+BATCH_SIZE]]
    batch_links = [c["link"] for c in all_chunks[i:i+BATCH_SIZE]]
    batch_types = [c["type"] for c in all_chunks[i:i+BATCH_SIZE]]
    batch_protocole_names = [c["protocole_name"] for c in all_chunks[i:i+BATCH_SIZE]]
    batch_doi = []
    for c in all_chunks[i:i+BATCH_SIZE]:
        doi = c["doi"]
        if doi:
            batch_doi.append(doi)
        else:
            doi = "None"
            batch_doi.append(doi)
        
        

    collection.insert([batch_vectors, batch_doi,batch_names, batch_study_ids, batch_organisms,batch_authors,batch_links,batch_types,batch_texts,batch_protocole_names])