In [None]:
novelty-check check --hybrid "Has anyone looked into whether PTEN neddylation aggravates CDK4/6 inhibitor resistance in breast cancer"

In [99]:
from numpy import true_divide
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
# Set up a headless Chrome browser.
# `driver` is a module-level WebDriver that scrape_question_details() uses to
# load pages and read their rendered source.
# NOTE(review): nothing in the visible cells calls driver.quit(); confirm the
# browser process is cleaned up somewhere.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)



def get_question_id(question, timeout=30):
    """Look up the ID the hasanyone worker returns for a question.

    Args:
        question: Question text, sent as the ``query`` field of the JSON body.
        timeout: Seconds to wait for the server. Passed to ``requests.post`` so
            an unresponsive server cannot block the notebook indefinitely.

    Returns:
        The raw response body as a string, which is used as the question ID.

    Raises:
        Exception: If the response body is exactly ``"Unauthorized"``.
    """
    url = "https://worker.hasanyone.com/api/v1/pqa-query"
    headers = {
        "accept": "*/*",
        "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
        "content-type": "application/json",
    }
    payload = {"query": question}
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)

    # Named `question_id` rather than `id` so the builtin is not shadowed.
    question_id = response.text
    if question_id == "Unauthorized":
        raise Exception("Unauthorized, not previously run query.")
    return question_id

# --- Function to fetch the stored research questions using a GET request ---
def scrape_research_questions(timeout=30):
    """Fetch the research questions stored by the hasanyone worker.

    Args:
        timeout: Seconds to wait for the server. Passed to ``requests.get`` so
            an unresponsive server cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body of the ``/queries`` endpoint, unmodified. The
        notebook iterates over it and treats each item as a question string.
    """
    url = "https://worker.hasanyone.com/queries"
    headers = {
        "accept": "*/*",
        "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
        "content-type": "application/json",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.json()

# --- Function to scrape question details from a given question ID ---
def scrape_question_details(question_id):
    """Scrape the rendered hasanyone.com page for one question.

    Loads ``https://hasanyone.com/?id=<question_id>`` with the module-level
    Selenium ``driver`` and parses the resulting page source.

    Args:
        question_id: ID interpolated into the page URL.

    Returns:
        A dict with keys ``"question"``, ``"answer"``, ``"references"`` (list of
        link texts from ``#accordion-panel-2``) and ``"stats"`` (dict built from
        the two-cell table rows of ``#accordion-panel-3``). The last two are
        empty when their panel is absent.

    Raises:
        AttributeError: If the question ``<p>`` or answer ``<strong>`` element
            is not found on the page.
    """
    url = f"https://hasanyone.com/?id={question_id}"
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Question text and answer are the first matching chakra-text elements.
    question = soup.find("p", class_="chakra-text").text.strip()
    answer = soup.find("strong", class_="chakra-text").text.strip()

    # References: text of every link inside the references panel.
    references = []
    references_section = soup.find("div", id="accordion-panel-2")
    if references_section:
        references = [link.text.strip() for link in references_section.find_all("a")]

    # Usage statistics: one key/value pair per table row.
    stats = {}
    stats_section = soup.find("div", id="accordion-panel-3")
    if stats_section:
        for stat_row in stats_section.find_all("tr"):
            cells = stat_row.find_all("td")
            # Rows without exactly two <td> cells (e.g. a header row) cannot be
            # unpacked into a key/value pair, so they are skipped.
            if len(cells) != 2:
                continue
            key_cell, value_cell = cells
            stats[key_cell.text.strip()] = value_cell.text.strip()

    return {
        "question": question,
        "answer": answer,
        "references": references,
        "stats": stats,
    }
    
def get_answer_and_stats(question_id, question, timeout=30):
    """Fetch the stored answer for a question and label it yes/no.

    Args:
        question_id: ID interpolated into the worker URL.
        question: Question text; echoed into the result and the diagnostics.
        timeout: Seconds to wait for the server. Passed to ``requests.get`` so
            an unresponsive server cannot block the notebook indefinitely.

    Returns:
        ``None`` when the response body cannot be decoded into a JSON object
        with a ``"response"`` entry (diagnostics are printed first). Otherwise
        a dict with keys ``"stats"``, ``"question"``, ``"yes_no"`` (``"yes"`` or
        ``"no"``) and ``"answer"`` (the answer object as returned by the API).

    Raises:
        Exception: If the answer text is missing or does not begin with
            "yes" or "no" (case-insensitive).
    """
    url = f"https://worker.hasanyone.com/api/v1/pqa-query/{question_id}"
    headers = {
        "accept": "*/*",
        "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
        "content-type": "application/json",
        # "x-query": question  # Include the question in the x-query header
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    try:
        response_data = response.json()['response']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON (e.g. an empty 204 response);
        # KeyError/TypeError: JSON decoded but has no 'response' entry.
        # Catching only these keeps Ctrl-C (KeyboardInterrupt) working.
        print(response.text)
        print(response.status_code)
        print(response.headers)
        print(question_id)
        print(question)
        return None

    # Extract the answer
    answer = response_data.get("answer", "Answer not found")

    # Extract the statistics
    stats = response_data.get("stats", "Stats not found")

    # The API nests the text under answer['answer']; when the key is missing
    # `answer` is the plain-string default, which must not be indexed.
    answer_text = answer.get("answer") if isinstance(answer, dict) else answer
    if not isinstance(answer_text, str):
        raise Exception("Answer is not yes or no")

    # work out if answer is yes or no by checking how the text begins
    lowered = answer_text.lower()
    if lowered.startswith("yes"):
        yes_no = "yes"
    elif lowered.startswith("no"):
        yes_no = "no"
    else:
        raise Exception("Answer is not yes or no")

    return {
        "stats": stats,
        "question": question,
        "yes_no": yes_no,
        "answer": answer,
    }

# --- Main execution ---
questions = scrape_research_questions()

# First pass: resolve every question to its ID before any answer is fetched.
question_to_id = dict(
    (question_text, get_question_id(question_text)) for question_text in questions
)

# Second pass: fetch answer + stats per question; failed lookups are stored as None.
question_to_result = {}
for question in tqdm(questions, desc="Getting answers"):
    result = get_answer_and_stats(question_to_id[question], question)
    question_to_result[question] = result


Getting answers:  36%|███▌      | 37/103 [00:34<00:58,  1.13it/s]


204
{'Date': 'Sat, 28 Dec 2024 20:45:16 GMT', 'Connection': 'keep-alive', 'access-control-allow-credentials': 'true', 'Report-To': '{"endpoints":[{"url":"https:\\/\\/a.nel.cloudflare.com\\/report\\/v4?s=yzjlms%2FbZWVuV3sNJRMSPrzw5nAuB%2B6HR6WcludItpFIVLJT03xv2WBUFvqVVlzILujPQScepyvsTviO6egbF53kiEtiZn8Ut5lFgH4alG3FePc09WMAweMz2iUqHUQoRrBG0U1NGw%3D%3D"}],"group":"cf-nel","max_age":604800}', 'NEL': '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}', 'Vary': 'Accept-Encoding', 'Server': 'cloudflare', 'CF-RAY': '8f94649c6e3af656-LHR', 'alt-svc': 'h3=":443"; ma=86400', 'server-timing': 'cfL4;desc="?proto=TCP&rtt=13855&min_rtt=12825&rtt_var=5545&sent=4&recv=6&lost=0&retrans=0&sent_bytes=2845&recv_bytes=873&delivery_rate=224561&cwnd=218&unsent_bytes=0&cid=e7bb1da34c1dcad8&ts=772&x=0"'}
0191c3-11
Has anyone ever identified an aminoacyl tRNA synthetase that can charge tRNAs with 2-aminoisobutyric acid?


Getting answers:  60%|██████    | 62/103 [00:57<00:35,  1.14it/s]


204
{'Date': 'Sat, 28 Dec 2024 20:45:39 GMT', 'Connection': 'keep-alive', 'access-control-allow-credentials': 'true', 'Report-To': '{"endpoints":[{"url":"https:\\/\\/a.nel.cloudflare.com\\/report\\/v4?s=ilcisNiXXdbJQRop%2FoMVE7V3rujhMC4m%2F76Y1vD%2FaQT4nsEQriaIiLtsffTkpPIzvqh0tPtl4CXTrOr3vX%2FIMEBITBfn9plNWPXDBxZMTA%2Fz8rwVgHDrsFo8qEEelRkNNHkU47IyOg%3D%3D"}],"group":"cf-nel","max_age":604800}', 'NEL': '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}', 'Vary': 'Accept-Encoding', 'Server': 'cloudflare', 'CF-RAY': '8f94652aa94188b6-LHR', 'alt-svc': 'h3=":443"; ma=86400', 'server-timing': 'cfL4;desc="?proto=TCP&rtt=12194&min_rtt=11414&rtt_var=4837&sent=4&recv=6&lost=0&retrans=0&sent_bytes=2846&recv_bytes=873&delivery_rate=252321&cwnd=244&unsent_bytes=0&cid=7adf4768a46d0989&ts=788&x=0"'}
0191c3-17
Has anyone made an LLM prompt set-up where one LLM pretends to be the user and the other the assistant and they accomplish a complex task via chatting?


Getting answers:  75%|███████▍  | 77/103 [01:10<00:24,  1.08it/s]


204
{'Date': 'Sat, 28 Dec 2024 20:45:52 GMT', 'Connection': 'keep-alive', 'access-control-allow-credentials': 'true', 'Report-To': '{"endpoints":[{"url":"https:\\/\\/a.nel.cloudflare.com\\/report\\/v4?s=w5Y5z70wxWvEd7K3PlYG63YDYuizpbov8tocS%2FKlE8SUZHfL8m55iKxCNJ%2FAovEzu9h96cxLkz4Z%2BQHojIO%2Fsc6z2kQmspAksVkpXlZImRJ5ng0O2uyXBnDGFuN6oj5wV0Ki%2FeKFbw%3D%3D"}],"group":"cf-nel","max_age":604800}', 'NEL': '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}', 'Vary': 'Accept-Encoding', 'Server': 'cloudflare', 'CF-RAY': '8f9465802b8363b0-LHR', 'alt-svc': 'h3=":443"; ma=86400', 'server-timing': 'cfL4;desc="?proto=TCP&rtt=11977&min_rtt=11958&rtt_var=4524&sent=4&recv=6&lost=0&retrans=0&sent_bytes=2847&recv_bytes=872&delivery_rate=237682&cwnd=235&unsent_bytes=0&cid=3b214ffc38fcf074&ts=802&x=0"'}
652ef175
Has anyone shown that Corticotropin-releasing hormone decreases weight loss?


Getting answers: 100%|██████████| 103/103 [01:35<00:00,  1.08it/s]


In [130]:
# Spot check: look up the ID for a single question. The query string is passed
# exactly as written here, including its trailing space.
get_question_id("Has anyone made a database of all the CRISPR-associated proteins and engineering Cas proteins that are used in CRISPR genome engineering? ")

'49da75c4'

In [129]:
# Re-fetch the stored questions and keep those containing the exact substring
# "CRISPR" (the `in` test is case-sensitive).
[q for q in scrape_research_questions() if "CRISPR" in q]

['Has anyone built an AI tool for designing new CRISPR/Cas variants to minimize off target DNA cutting? ',
 'Has anyone explored the combination of photonics-based neural recording with CRISPR genome editing technologies to advance understanding of neural diseases',
 'Has anyone made a database of all the CRISPR-associated proteins and engineering Cas proteins that are used in CRISPR genome engineering? ']

In [109]:
# remove None results
import json

import pandas as pd

# Fixed seed so the validation/test assignment is identical on every re-run;
# without it each execution would write a different split to the CSV.
SPLIT_SEED = 42

print(len(question_to_result))
question_to_result = {k: v for k, v in question_to_result.items() if v is not None}
print(len(question_to_result))
# save out to json
with open("hasanyone_results.json", "w") as f:
    json.dump(question_to_result, f)

# save out a version with only the question and yes / no answer as csv using pandas
keys = list(question_to_result.keys())
yes_no = [question_to_result[key]['yes_no'] for key in keys]
df = pd.DataFrame({"question": keys, "yes_no": yes_no})
# shuffle reproducibly, then make 50% validation set, 50% test set
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
df['split'] = 'test'
# address the column by name rather than assuming it is the last one
df.iloc[:len(df)//2, df.columns.get_loc('split')] = 'validation'
df.to_csv("hasanyone_results_question_yes_no_split.csv", index=False)


100
100


In [123]:
import json
from pprint import pprint

# NOTE(review): absolute, machine-specific path; this cell only runs on a
# machine where that exact file exists.
with open("/Users/oliverturnbull/Repositories/novelty_checker/results/evaluation_results_10_True_0.3_gemini_gemini-2.0-flash-exp_20241228-213229.json", "r") as f:
    results = json.load(f)

# Pretty-print the explanation stored on the first detailed result.
first_detailed_result = results['detailed_results'][0]
pprint(first_detailed_result['full_explanation'])

('Several papers demonstrate that the proposed research question has been '
 'addressed. Specifically, the paper titled "Aerosol-Mediated Non-Viral Lung '
 'Gene Therapy: The Potential of Aminoglycoside-Based Cationic Liposomes" '
 '(URL: https://pubmed.ncbi.nlm.nih.gov/35056921/) confirms aerosol-mediated '
 'non-viral lung gene therapy has been demonstrated, citing progress using the '
 'lipidic formulation GL67A in cystic fibrosis treatment as an example. '
 'Furthermore, the paper "Aerosol Inhalation of Gene Delivery Therapy for '
 'Pulmonary..." (URL: https://pubmed.ncbi.nlm.nih.gov/39199292/) and a '
 'similarly titled paper (URL: https://www.mdpi.com/2218-273X/14/8/904) review '
 'aerosol gene delivery therapy for pulmonary diseases, noting various gene '
 'types and carriers and citing successes in diseases like SARS-CoV-2, cystic '
 'fibrosis, and lung cancer. Additionally, the paper "Aerosol Delivery of '
 'Synthetic mRNA to Vaginal Mucosa..." (URL: '
 'https://pmc.ncbi.nlm.n