In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from utils import find_elem

In [2]:
# Common prefix of every OSD study page URL; the study number is appended to it.
BASE_URL = "https://osdr.nasa.gov/bio/repo/data/studies/OSD-"
# Number of the study to scrape. Kept as its own constant so that pointing the
# notebook at another study means editing exactly one value.
STUDY_ID = "833"
# Full URL of the study page opened by the browser in the next cell.
link = f"{BASE_URL}{STUDY_ID}"


In [3]:
# Start a Chrome session and size the window before navigating, so the page is
# laid out once at its final dimensions.
wd = webdriver.Chrome()
wd.maximize_window()
wd.get(link)

# The following cell parses the study title out of wd.page_source. A fixed
# one-second sleep gives no guarantee that the title has been rendered by then,
# so block until that exact element is present in the DOM.
# Raises selenium's TimeoutException if it does not appear in time.
PAGE_LOAD_TIMEOUT_S = 15  # upper bound; the wait returns as soon as the element exists
STUDY_TITLE_CSS = (
    "div.flex-container.justify-start > div.three-quarters-width"
    " > h1 > span:nth-child(4)"
)
WebDriverWait(wd, PAGE_LOAD_TIMEOUT_S).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, STUDY_TITLE_CSS))
)


In [None]:
# Snapshot the DOM currently held by the browser and parse it with BeautifulSoup.
html = wd.page_source
soup = BeautifulSoup(html, "html.parser")

# Absolute CSS path to the fourth child <span> of the page heading, which holds
# the study name.
STUDY_NAME_SELECTOR = (
    "body > app-root > repo-view > mat-sidenav-container > mat-sidenav-content"
    " > div.flex-container.justify-start > div.three-quarters-width"
    " > h1 > span:nth-child(4)"
)

# Container <div> in which the study heading lives.
name_block = soup.find("div", attrs={"class": "flex-container justify-start"})

# Run the selector inside that container and keep the text of the first hit.
name_matches = name_block.select(STUDY_NAME_SELECTOR)
study_name = name_matches[0].text

# Locate the bold "GeneLab ID" label. The ID is not inside the <b> element but
# in the text node that directly follows it.
b_element = find_elem("//b[contains(text(),'GeneLab ID')]", wd)

# Selenium locators return elements, not text nodes, so read the sibling text
# node through JavaScript (already whitespace-trimmed on the browser side).
gene_lab_id_raw = wd.execute_script(
    "return arguments[0].nextSibling.nodeValue.trim();", b_element
)

# Remove only the first colon (the separator after the label) and tidy the
# remaining whitespace. Without the count argument every colon in the value
# would be stripped, not just the separator.
gene_lab_id = gene_lab_id_raw.replace(":", "", 1).strip()

# Absolute XPath of the first link in the second <div> of the page content;
# its text is the study DOI.
DOI_XPATH = "/html/body/app-root/repo-view/mat-sidenav-container/mat-sidenav-content/div[2]/a[1]"
# Paragraph inside the description panel that holds the study's main description.
DESCRIPTION_XPATH = '//*[@id="cdk-accordion-child-0"]/div/div/div/description-panel/div[1]/div/p'

# DOI of the study.
doi_link = find_elem(DOI_XPATH, wd)
doi = doi_link.text

# Free-text description of the study, echoed for a quick visual check.
description_paragraph = find_elem(DESCRIPTION_XPATH, wd)
description = description_paragraph.text
print(description)

# Grid in the description panel listing the organisms this study relates to.
ORGANISMS_GRID_XPATH = '//*[@id="cdk-accordion-child-0"]/div/div/div/description-panel/div[3]/mat-grid-list'

organisms_block = find_elem(ORGANISMS_GRID_XPATH, wd)
# Each grid tile's content element carries one entry; collect their texts in page order.
organism_tiles = organisms_block.find_elements(By.CLASS_NAME, "mat-grid-tile-content")
organisms = [tile.text for tile in organism_tiles]

# An ontology factor is a variable or condition in an experiment that might
# influence the results -- essentially an experimental parameter.
# These ontology links help our RAG system understand that different experiments
# are related even when their text uses slightly different words.
ONTOLOGY_TBODY_XPATH = '//*[@id="cdk-accordion-child-0"]/div/div/div/description-panel/div[2]/table/tbody'

ontology_block = find_elem(ONTOLOGY_TBODY_XPATH, wd)
ontology_block_rows = ontology_block.find_elements(By.TAG_NAME, "tr")

# For every table row fetch its <td> cells lazily; the first cell's text becomes
# the key and the second cell's text the value (a repeated key keeps the last row).
cells_per_row = (row.find_elements(By.TAG_NAME, "td") for row in ontology_block_rows)
ontology_concepts = {cells[0].text: cells[1].text for cells in cells_per_row}
print(ontology_concepts)

# Contacts section of the description panel: the authors related to this study.
authors_block = find_elem('//*[@id="cdk-accordion-child-0"]/div/div/div/description-panel/div[6]/div', wd)

# Match elements carrying BOTH classes. By.CLASS_NAME is specified for a single
# class name; passing "a.b" to it only works as a side effect of how Selenium
# turns the value into a CSS selector, so state the compound selector explicitly.
authors_elems_list = authors_block.find_elements(
    By.CSS_SELECTOR, ".contact-container.ng-star-inserted"
)

# One entry per contact: its visible text with all commas removed and the
# surrounding whitespace trimmed.
authors = [elem.text.replace(",", "").strip() for elem in authors_elems_list]

# Also collect the protocols, which describe how the experiments were conducted.
# Their descriptions are collapsed by default, so expand them first.
EXPAND_PROTOCOLS_XPATH = '//*[@id="cdk-accordion-child-4"]/div/div/div/protocols-panel/div/div/button[1]/span[1]'
expand_protocols_btn = find_elem(EXPAND_PROTOCOLS_XPATH, wd)
# NOTE(review): fixed pause before the click; the reason is not evident from this
# cell -- presumably the button is not yet interactable right after lookup.
time.sleep(1.5)
expand_protocols_btn.click()

# Container holding all protocol cards; its direct <div> children are the cards.
samples_block = find_elem('//*[@id="Samples"]/div/div', wd)
samples_cards = samples_block.find_elements(By.XPATH, "./div")

# Build a list of {"name": ..., "description": ...} dicts, one per card.
samples_info_list = []
for card in samples_cards:
    header_text = card.find_element(By.CLASS_NAME, "card-header").text

    # Descend through three nested <div> levels (first matching <div> each time)
    # to reach the card body.
    card_body = card
    for _ in range(3):
        card_body = card_body.find_element(By.TAG_NAME, "div")

    # The second <div> inside the body holds the description, prefixed by a
    # "Description" label that is removed here.
    body_divs = card_body.find_elements(By.TAG_NAME, "div")
    description_text = body_divs[1].text.replace("Description", "").strip()

    samples_info_list.append({"name": header_text, "description": description_text})
    
