<a href="https://colab.research.google.com/github/RJaeschke1982/Codex-Playground/blob/main/EBM_Compass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Colab form cell: every `#@param` annotation below marks a user-editable
# setting, and the `#@markdown` lines carry the (Polish) help text of the form.
# Ordinary `#` comments are kept on their own lines so the annotations stay intact.
#@markdown # EBM Compass v9.2 (Raport HTML + Synteza)
#@markdown **Instrukcja:** Wpisz temat, dostosuj opcje i uruchom komórkę. Głównym wynikiem będzie plik `EBM_Compass_Raport.html`.
#@markdown ---
#@markdown ### 1. Główne Zapytanie
# Research topic; the pipeline translates it to English before building the PubMed query.
TEMAT_BADAWCZY = "social jet lag"  #@param {type:"string"}
#@markdown ### 2. Ustawienia AI i Pobierania
#@markdown - **Próg trafności:** Artykuły z oceną równą lub wyższą trafią do raportu.
#@markdown - **Limit rekordów:** Ogranicza liczbę artykułów pobieranych z PubMed.
# Minimum AI relevance score an article needs to be kept for the report.
PRÓG_TRAFNOŚCI = 7  #@param {type:"slider", min:1, max:10, step:1}
# Passed to the PubMed search as its record limit.
MAX_REKORDOW_DO_POBRANIA = 500 #@param {type:"number"}
# OpenAI chat model used for every LLM call made by this cell.
MODEL_LLM = "gpt-4o-mini"  #@param ["gpt-4o", "gpt-4o-mini"]
# Where the OpenAI API key is read from: Colab secrets, the OPENAI_API_KEY
# environment variable, or an interactive prompt.
ŹRÓDŁO_KLUCZA = "secrets"  #@param ["secrets", "env", "manual"]
#@markdown ### 3. Filtry i Raportowanie
#@markdown - **Typ publikacji:** Filtruje wyniki wg typu badania.
#@markdown - **Liczba art. w raporcie:** Ile najlepszych artykułów pokazać w finalnym raporcie HTML.
# Publication-type filter; build_query() compares this exact string to pick the PubMed filter clause.
TYP_PUBLIKACJI = "Tylko przeglądowe (w tym narracyjne)" #@param ["Wszystkie", "Tylko przeglądowe (Review)", "Tylko przeglądowe (w tym narracyjne)", "Tylko badania kliniczne (RCT)"]
# Number of top-ranked articles kept in the final HTML report and CSV export.
LICZBA_ART_W_RAPORCIE = 25 #@param {type:"number"}

# ===================================================================
# KROK 0: INSTALACJA I IMPORTY
# ===================================================================
# Announce the setup phase, then install the third-party packages imported below.
print("--- [SETUP] Instalacja i konfiguracja... ---")
# IPython shell escape (not plain Python): runs pip quietly as a shell command.
!pip install -q "openai>=1.40.0" biopython pandas tqdm tenacity

import sys, io, json, os, re, textwrap, time, pickle
import pandas as pd
from IPython.display import HTML, display, clear_output
from tenacity import retry, stop_after_attempt, wait_exponential
from getpass import getpass
from tqdm.notebook import tqdm
from Bio import Entrez, Medline
from openai import OpenAI

# ===================================================================
# KROK 1: KLUCZ API I ŚRODOWISKO
# ===================================================================
# True when the google.colab module is already loaded in this interpreter.
IN_COLAB = 'google.colab' in sys.modules
# Resolve the OpenAI key from the source selected in the form and build the
# client; any failure is printed and stops the cell via sys.exit().
try:
    if ŹRÓDŁO_KLUCZA == "secrets":
        if not IN_COLAB: raise RuntimeError("Wybrano 'secrets', ale nie wykryto Colab.")
        from google.colab import userdata
        OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
    elif ŹRÓDŁO_KLUCZA == "env":
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    elif ŹRÓDŁO_KLUCZA == "manual":
        OPENAI_API_KEY = getpass("Wklej klucz OpenAI API: ")
    # NOTE(review): with an unrecognised ŹRÓDŁO_KLUCZA no branch assigns
    # OPENAI_API_KEY, so in a fresh kernel this check raises NameError, which the
    # handler below reports as a critical error.
    if not OPENAI_API_KEY: raise ValueError("Nie udało się odczytać klucza OpenAI API.")
    client = OpenAI(api_key=OPENAI_API_KEY)
    print("✅ [SETUP] Klucz API OpenAI gotowy.")
except Exception as e: print(f"❌ [SETUP] KRYTYCZNY BŁĄD: {e}"); sys.exit()

# NOTE(review): placeholder contact address for the Entrez client — presumably
# meant to be replaced with a real e-mail; confirm against NCBI usage guidelines.
Entrez.email = "user@example.com"
print("--- [SETUP] Środowisko gotowe. ---")

# ===================================================================
# KROK 2: FUNKCJE RDZENIOWE
# ===================================================================
# Search frameworks offered to the model when it picks a query strategy.
# "description" is shown in the framework-selection prompt; "components" maps
# each abbreviation to its meaning, in the order it is listed in the
# keyword-generation prompt.
FRAMEWORK_DEFINITIONS = {
    "PICO": {
        "description": "Effectiveness of interventions",
        "components": {
            "P": "Patient",
            "I": "Intervention",
            "C": "Comparison",
            "O": "Outcome",
        },
    },
    "PEO": {
        "description": "Risk factors and prognosis",
        "components": {
            "P": "Population",
            "E": "Exposure",
            "O": "Outcome",
        },
    },
}

@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
def safe_openai_call(client: OpenAI, model: str, messages, response_format=None, temperature: float = 0.0, max_tokens: int | None = None):
    """Send one chat-completion request through ``client``.

    The tenacity decorator retries the call (stop after 3 attempts, exponential
    wait configured with min=4 and max=10); whatever it raises after the last
    attempt propagates to the caller.

    Args:
        client: OpenAI client used to issue the request.
        model: Name of the chat model.
        messages: Chat messages forwarded unchanged.
        response_format: Optional response-format spec; sent only when truthy.
        temperature: Sampling temperature (defaults to 0.0).
        max_tokens: Optional completion limit; converted with ``int`` and sent
            only when not ``None``.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    request_kwargs = dict(model=model, messages=messages, temperature=temperature)
    # Optional settings are forwarded only when the caller supplied them.
    if max_tokens is not None:
        request_kwargs["max_tokens"] = int(max_tokens)
    if response_format:
        request_kwargs["response_format"] = response_format
    return client.chat.completions.create(**request_kwargs)

def translate_topic_to_english(topic_pl: str, client: OpenAI, model: str) -> str:
    """Translate a research topic into formal scientific English for PubMed.

    Args:
        topic_pl: Topic as entered by the user (expected to be Polish).
        client: OpenAI client used for the request.
        model: Name of the chat model.

    Returns:
        The translated topic. ``topic_pl`` is returned unchanged when the
        request fails, the reply cannot be parsed, or the reply carries no
        usable ``"topic_en"`` text, so callers never receive an empty topic.
    """
    prompt = f'Translate the following research topic into concise, formal scientific English for PubMed. Return JSON with a single key "topic_en".\nSource (PL): {topic_pl}'
    messages = [
        {"role": "system", "content": "You are a professional biomedical translator."},
        {"role": "user", "content": prompt},
    ]
    try:
        r = safe_openai_call(client, model, messages, response_format={"type": "json_object"}, max_tokens=200)
        payload = json.loads(r.choices[0].message.content)
        translated = payload.get("topic_en", "") if isinstance(payload, dict) else ""
        translated = translated.strip() if isinstance(translated, str) else ""
    except Exception:
        # Best-effort step: any API or parsing failure falls back to the original topic.
        return topic_pl
    # An empty translation would silently blank out every later prompt.
    return translated or topic_pl

def get_query_strategy(topic_en: str, client: OpenAI, model: str):
    """Choose a search framework for the topic and generate its PubMed terms.

    Stage A asks the model to pick one framework from ``FRAMEWORK_DEFINITIONS``
    plus the extra ``"CONCEPT"`` option. Stage B asks for search terms: one
    list per component for a defined framework, or a single ``"terms"`` list
    for ``"CONCEPT"``.

    Args:
        topic_en: Research topic in English.
        client: OpenAI client used for both requests.
        model: Name of the chat model.

    Returns:
        A ``(framework, keywords)`` tuple. ``(None, None)`` when the framework
        selection fails; ``(framework, None)`` when keyword generation fails or
        its reply is not a JSON object; otherwise the framework key and the
        parsed keyword dict.
    """
    print("\n--- [ETAP A: WYBÓR SCHEMATU] ---")
    framework_options = {k: {"description": v["description"]} for k, v in FRAMEWORK_DEFINITIONS.items()}
    framework_options["CONCEPT"] = {"description": "A broad search for a general concept or overview"}
    options = "\n".join([f'- {k}: {v["description"]}' for k, v in framework_options.items()])
    prompt1 = f'Analyze the topic: "{topic_en}". Select the best framework from the list.\n{options}\nRespond in JSON: {{"framework": "KEY"}}.'
    try:
        r1 = safe_openai_call(client, model, [{"role": "system", "content": "You are a research methodology expert."}, {"role": "user", "content": prompt1}], response_format={"type": "json_object"}, max_tokens=100)
        raw_choice = json.loads(r1.choices[0].message.content).get("framework")
        # Accept the key regardless of case or surrounding whitespace.
        canonical = {key.upper(): key for key in framework_options}
        choice = canonical.get(raw_choice.strip().upper()) if isinstance(raw_choice, str) else None
        if not choice:
            raise ValueError(f"Unknown framework: {raw_choice}")
        print(f"✅ Wybrano schemat: {choice}")
    except Exception as e:
        print(f"❌ BŁĄD: {e}")
        return None, None

    print("\n--- [ETAP B: GENEROWANIE SŁÓW KLUCZOWYCH] ---")
    try:
        if choice == "CONCEPT":
            system_msg = "You are a PubMed search expert for concepts."
            user_msg = f'Topic is a general concept: "{topic_en}". Extract the core scientific term(s) for a broad PubMed title/abstract search. Include common variations. Return a JSON object with a single key "terms" containing a list of strings.'
            token_limit = 300
        else:
            # Every framework defined in FRAMEWORK_DEFINITIONS takes this path, so
            # adding a new framework there needs no change here.
            components = ", ".join([f'{k} ({v})' for k, v in FRAMEWORK_DEFINITIONS[choice]["components"].items()])
            system_msg = "You are a PubMed search strategy expert."
            user_msg = f'Topic: "{topic_en}". Framework: {choice}. Generate English PubMed search terms for components: {components}. Return JSON with component abbreviations as keys and lists of MeSH/tiab terms as values.'
            token_limit = 1200
        reply = safe_openai_call(client, model, [{"role": "system", "content": system_msg}, {"role": "user", "content": user_msg}], response_format={"type": "json_object"}, max_tokens=token_limit)
        keywords = json.loads(reply.choices[0].message.content)
        # The query builder expects a mapping; reject any other JSON value here.
        if not isinstance(keywords, dict):
            raise ValueError("Keyword response is not a JSON object")
        return choice, keywords
    except Exception as e:
        print(f"❌ BŁĄD: {e}")
        return choice, None

def build_query(keywords: dict, typ_publikacji: str) -> str:
    """Assemble the final PubMed query string from generated keywords.

    Args:
        keywords: Either ``{"terms": [...]}`` (concept search: every term is
            quoted and tagged ``[tiab]``, joined with OR) or a mapping of
            framework components to term lists (terms OR-ed within a
            component, components AND-ed together). A value may also be a
            single string or a nested dict/list of strings; it is flattened.
        typ_publikacji: Publication-type option from the form; unknown values
            add no publication-type filter.

    Returns:
        The query with the English-language filter (and the publication-type
        filter, if any) appended, or ``""`` when no usable term is available.
    """
    if not keywords:
        return ""

    def _flatten_terms(value) -> list:
        # A bare string must stay one term; joining it directly would iterate
        # over its characters. Nested containers are walked recursively.
        if isinstance(value, str):
            stripped = value.strip()
            return [stripped] if stripped else []
        if isinstance(value, dict):
            value = list(value.values())
        if isinstance(value, (list, tuple)):
            collected = []
            for item in value:
                collected.extend(_flatten_terms(item))
            return collected
        return []

    base_query = ""
    if "terms" in keywords:
        base_query = " OR ".join(f'"{term}"[tiab]' for term in _flatten_terms(keywords.get("terms", [])))
    else:
        groups = []
        for component_terms in keywords.values():
            terms = _flatten_terms(component_terms)
            if terms:
                groups.append(f"({' OR '.join(terms)})")
        base_query = " AND ".join(groups)
    if not base_query:
        return ""

    publication_filters = {
        "Tylko przeglądowe (Review)": "(Review[Publication Type])",
        "Tylko przeglądowe (w tym narracyjne)": '(Review[Publication Type] OR "narrative review"[Title/Abstract])',
        "Tylko badania kliniczne (RCT)": "(Clinical Trial[Publication Type] OR Randomized Controlled Trial[Publication Type])",
    }
    filters = ["(english[Language])"]
    if typ_publikacji in publication_filters:
        filters.append(publication_filters[typ_publikacji])
    return f"({base_query}) AND {' AND '.join(filters)}"

def search_and_parse_pubmed(query: str, max_records: int):
    """Run a PubMed search and return the matching records as a DataFrame.

    IDs are found with ``Entrez.esearch`` and the records are downloaded in
    MEDLINE text format with ``Entrez.efetch`` in batches of 500.

    Args:
        query: PubMed query string.
        max_records: Upper limit passed to the search as ``retmax``.

    Returns:
        A DataFrame with columns ``PMID``, ``Title``, ``Abstract`` and ``Year``
        (records without a PMID are skipped), or ``None`` when the search finds
        nothing or any step raises (the error is printed).
    """
    def _publication_year(record) -> int:
        # The DP field is free text; take its first 4-digit run so that a single
        # unusual date cannot raise and discard the whole result set.
        match = re.search(r"\d{4}", str(record.get('DP', '')))
        return int(match.group()) if match else 1900

    try:
        print(f"\n--- [ETAP 1: IDENTYFIKACJA (limit: {max_records})] ---")
        h_search = Entrez.esearch(db="pubmed", term=query, retmax=str(max_records))
        try:
            record = Entrez.read(h_search)
        finally:
            h_search.close()
        id_list = record.get("IdList", [])
        if not id_list:
            print("✅ Znaleziono 0 publikacji.")
            return None
        print(f"✅ Znaleziono {len(id_list)} publikacji. Pobieranie...")
        all_records = []
        with tqdm(total=len(id_list), desc="Pobieranie (PubMed)") as pbar:
            for start in range(0, len(id_list), 500):
                batch = id_list[start:start+500]
                h_fetch = Entrez.efetch(db="pubmed", id=batch, rettype="medline", retmode="text")
                try:
                    all_records.extend(Medline.parse(io.StringIO(h_fetch.read())))
                finally:
                    # Close the handle even when reading or parsing fails.
                    h_fetch.close()
                pbar.update(len(batch))
                # Short pause between batch requests (presumably to respect NCBI rate limits).
                time.sleep(0.34)
        return pd.DataFrame([
            {'PMID': r.get('PMID'), 'Title': r.get('TI'), 'Abstract': r.get('AB'), 'Year': _publication_year(r)}
            for r in all_records if r.get('PMID')
        ])
    except Exception as e:
        print(f"❌ BŁĄD komunikacji z PubMed: {e}")
        return None

def process_and_score_article(row, topic_en: str, client: OpenAI, model: str):
    """Ask the model to rate one article and summarise it in Polish.

    Args:
        row: Mapping-like record providing ``'PMID'``, ``'Title'`` and ``'Abstract'``.
        topic_en: Research topic in English.
        client: OpenAI client used for the request.
        model: Name of the chat model.

    Returns:
        A dict with the article's ``PMID`` plus the model's fields.
        ``relevance_score`` is always an ``int`` between 0 and 10 (0 when
        missing or unparsable), and ``study_type``, ``key_takeaway_pl`` and
        ``importance_pl`` are always present. If the request or parsing fails,
        a placeholder record with score 0 is returned instead of raising.
    """
    prompt = f'''Analyze the abstract for the topic: "{topic_en}".
    Respond in JSON with these keys:
    1. "relevance_score": An integer from 1 (not relevant) to 10 (perfectly relevant).
    2. "study_type": The type of study (e.g., "RCT", "Systematic Review", "Observational").
    3. "key_takeaway_pl": In ONE concise POLISH sentence, what is the main conclusion of this study?
    4. "importance_pl": In 2-4 POLISH words, why is this study significant (e.g., "Duża próba", "Pierwsze badanie", "Przegląd systematyczny").'''
    fallback = {'PMID': row['PMID'], 'relevance_score': 0, 'study_type': 'N/A', 'key_takeaway_pl': 'Błąd analizy AI', 'importance_pl': 'Błąd'}
    try:
        content = f"TITLE: {row['Title']}\nABSTRACT: {row['Abstract']}\n\nINSTRUCTIONS:\n{prompt}"
        r = safe_openai_call(client, model, [{"role": "system", "content": "You are a precise, ADHD-friendly research analyst providing concise insights in Polish."}, {"role": "user", "content": content}], response_format={"type": "json_object"}, max_tokens=500)
        data = json.loads(r.choices[0].message.content)
        if not isinstance(data, dict):
            return fallback
    except Exception:
        # Deliberate best-effort: one failed article must not stop the batch.
        return fallback

    result = {'PMID': row['PMID'], **data}
    # The model's payload must never replace the join key used by the later merge.
    result['PMID'] = row['PMID']
    # The score is compared numerically downstream, so coerce strings/floats and
    # clamp to the documented range; anything unparsable counts as 0.
    try:
        score = int(float(data.get('relevance_score', 0)))
    except (TypeError, ValueError, OverflowError):
        score = 0
    result['relevance_score'] = max(0, min(10, score))
    for key, default in (('study_type', 'N/A'), ('key_takeaway_pl', 'Brak'), ('importance_pl', 'Brak')):
        if result.get(key) is None:
            result[key] = default
    return result

def run_comprehensive_screening(df: pd.DataFrame, topic_en: str, client: OpenAI, model: str) -> pd.DataFrame:
    """Score every article that has a title and a non-blank abstract.

    Articles are scored concurrently by ``process_and_score_article`` on a pool
    of 10 worker threads; results are collected in completion order and joined
    back onto the article rows by ``PMID``.

    Args:
        df: Articles with at least ``PMID``, ``Title`` and ``Abstract`` columns.
        topic_en: Research topic in English.
        client: OpenAI client shared by all workers.
        model: Name of the chat model.

    Returns:
        The filtered articles left-merged with their scoring fields, or an
        empty DataFrame when no article has both a title and an abstract.
    """
    # Imported here because the file's import block never brings
    # `concurrent.futures` into scope; without it the pool below raises NameError.
    import concurrent.futures

    df_clean = df.dropna(subset=['Abstract', 'Title']).copy()
    df_clean = df_clean[df_clean['Abstract'].str.strip() != '']
    if df_clean.empty:
        print("Brak abstraktów do analizy.")
        return pd.DataFrame()
    print("\n--- [ETAP 2: OCENA AI] ---")
    enriched_data = []
    with tqdm(total=len(df_clean), desc="🧠 Ocena AI") as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=10) as ex:
        futures = [ex.submit(process_and_score_article, row, topic_en, client, model) for _, row in df_clean.iterrows()]
        for fut in concurrent.futures.as_completed(futures):
            enriched_data.append(fut.result())
            pbar.update(1)
    return pd.merge(df_clean, pd.DataFrame(enriched_data), on='PMID', how='left')

def add_reliability_score(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 1-5 ``reliability_score`` column derived from ``study_type``.

    The study type is lower-cased and matched against keyword tiers; the first
    tier containing a matching keyword decides the score, and anything
    unmatched scores 1. The column is written into ``df`` itself.

    Args:
        df: Articles; returned untouched when it has no ``study_type`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    if 'study_type' not in df.columns:
        return df

    # Checked top to bottom (highest score first); order matters because e.g.
    # "systematic review" also contains the lower-tier keyword "review".
    keyword_tiers = (
        (5, ("meta-analysis", "systematic review")),
        (4, ("guideline", "rct", "randomized")),
        (3, ("observational", "cohort", "case-control")),
        (2, ("review",)),
    )

    def _score_for(study_type):
        label = str(study_type).lower()
        for points, keywords in keyword_tiers:
            if any(keyword in label for keyword in keywords):
                return points
        return 1

    df['reliability_score'] = df['study_type'].apply(_score_for)
    return df

def generate_quick_synthesis(df_top: pd.DataFrame, topic_pl: str, client: OpenAI, model: str) -> str:
    """Produce a short Polish executive summary from the top articles.

    Only the first five rows of ``df_top`` are used; each contributes its key
    takeaway, importance note and study type to the prompt.

    Args:
        df_top: Ranked articles with ``key_takeaway_pl``, ``importance_pl`` and
            ``study_type`` columns.
        topic_pl: Research topic as entered by the user.
        client: OpenAI client used for the request.
        model: Name of the chat model.

    Returns:
        The model's summary (``""`` if it returned no content), a fixed notice
        when ``df_top`` is empty, or an error message when the request fails.
    """
    if df_top.empty:
        return "Brak wystarczających danych do automatycznej syntezy."
    print("\n--- [GENEROWANIE SZYBKIEJ SYNTEZY] ---")

    bullet_lines = []
    for _, article in df_top.head(5).iterrows():
        bullet_lines.append(f"- {article['key_takeaway_pl']} (Znaczenie: {article['importance_pl']}, Typ: {article['study_type']})")
    context = "\n".join(bullet_lines)

    prompt = f"Na podstawie poniższych kluczowych wniosków z najważniejszych artykułów na temat '{topic_pl}', napisz zwięzłe, 3-4 zdaniowe podsumowanie (executive summary) w języku polskim.\n\nKluczowe wnioski:\n{context}"
    messages = [
        {"role": "system", "content": "Jesteś redaktorem naukowym, który tworzy zwięzłe syntezy."},
        {"role": "user", "content": prompt},
    ]
    try:
        reply = safe_openai_call(client, model, messages, max_tokens=400)
        return reply.choices[0].message.content or ""
    except Exception as e:
        return f"Błąd podczas generowania syntezy: {e}"

# ===================================================================
# POPRAWIONA FUNKCJA GENEROWANIA RAPORTU
# ===================================================================
def generate_html_report(df_report: pd.DataFrame, topic_pl: str, query: str, synthesis: str):
    """Write the final report to ``EBM_Compass_Raport.html`` in the working directory.

    Every value that comes from outside the template (topic, query, model
    synthesis, PubMed titles and the per-article fields) is HTML-escaped before
    insertion, so characters such as ``<``, ``&`` or quotes cannot break the
    page or inject markup.

    Args:
        df_report: Articles to list; each row must provide ``PMID``, ``Title``,
            ``study_type``, ``reliability_score`` and ``Year``;
            ``relevance_score``, ``key_takeaway_pl`` and ``importance_pl`` are
            optional.
        topic_pl: Research topic shown in the header.
        query: PubMed query shown in the footer.
        synthesis: Summary text shown above the article list.
    """
    # Local import: only this function needs it and the file does not import it.
    from html import escape

    def _esc(value) -> str:
        # Non-string cells (numbers, NaN) are rendered via str() first.
        return escape(str(value))

    # Braces in the <style> section are doubled ({{ }}) so that str.format()
    # treats them as literal braces rather than placeholders.
    html_template = """
    <!DOCTYPE html><html lang="pl"><head><meta charset="UTF-8"><title>Raport EBM Compass</title><style>
    body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; line-height: 1.6; color: #333; max-width: 900px; margin: 20px auto; background-color: #f9f9f9; }}
    .header {{ text-align: center; border-bottom: 2px solid #eee; padding-bottom: 10px; }}
    .header h1 {{ margin: 0; }} .header p {{ color: #666; }}
    .synthesis {{ background-color: #eef7ff; border: 1px solid #d0eaff; border-radius: 8px; padding: 15px; margin: 20px 0; }}
    .article {{ background-color: #fff; border: 1px solid #ddd; border-radius: 8px; padding: 15px; margin-bottom: 15px; }}
    .article h3 {{ margin-top: 0; }} .article h3 a {{ color: #0056b3; text-decoration: none; }}
    .meta {{ font-size: 0.9em; color: #555; margin-bottom: 10px; }} .meta span {{ margin-right: 15px; }}
    .takeaway, .importance {{ margin-bottom: 5px;}}
    .footer {{ text-align: center; font-size: 0.8em; color: #888; margin-top: 20px; }}
    </style></head><body>
    <div class="header"><h1>Raport EBM Compass</h1><p>Temat: <strong>{topic}</strong></p></div>
    <div class="synthesis"><h3>🚀 Szybka Synteza (na podstawie 5 topowych artykułów)</h3><p>{synthesis}</p></div>
    {articles_html}
    <div class="footer"><p>Wygenerowano {date}. Kwerenda: <code>{query}</code></p></div>
    </body></html>
    """
    article_blocks = []
    for _, row in df_report.iterrows():
        article_blocks.append(f"""
        <div class="article">
            <h3><a href="https://pubmed.ncbi.nlm.nih.gov/{_esc(row['PMID'])}/" target="_blank">{_esc(row['Title'])}</a></h3>
            <div class="meta">
                <span>🔬 <b>Typ:</b> {_esc(row['study_type'])}</span>
                <span>🏆 <b>Wiarygodność:</b> {_esc(row['reliability_score'])}/5</span>
                <span>🎯 <b>Trafność AI:</b> {_esc(row.get('relevance_score', 'N/A'))}/10</span>
                <span>📅 <b>Rok:</b> {_esc(row['Year'])}</span>
            </div>
            <div class="takeaway">🔑 <b>Kluczowy wniosek:</b> {_esc(row.get('key_takeaway_pl', 'Brak'))}</div>
            <div class="importance">💡 <b>Znaczenie badania:</b> {_esc(row.get('importance_pl', 'Brak'))}</div>
        </div>
        """)
    articles_html = "".join(article_blocks)
    report_html = html_template.format(
        topic=_esc(topic_pl),
        synthesis=_esc(synthesis),
        articles_html=articles_html,  # already escaped field by field above
        date=time.strftime("%Y-%m-%d"),
        query=_esc(query),
    )
    with open("EBM_Compass_Raport.html", "w", encoding="utf-8") as f:
        f.write(report_html)
    print("✅ Wygenerowano raport HTML: EBM_Compass_Raport.html")

# ===================================================================
# WYKONANIE GŁÓWNEGO POTOKU
# ===================================================================
# Main pipeline: translate the topic, build and confirm the PubMed query,
# download and score the articles, then write the HTML report and a CSV export.
if not TEMAT_BADAWCZY.strip():
    print("❌ BŁĄD: Pole 'TEMAT_BADAWCZY' jest puste.")
else:
    TOPIC_EN = translate_topic_to_english(TEMAT_BADAWCZY, client, MODEL_LLM)
    framework, keywords = get_query_strategy(TOPIC_EN, client, MODEL_LLM)
    # Nothing further runs (and nothing more is printed here) when strategy
    # generation returned no framework or no keywords.
    if framework and keywords:
        finalna_kwerenda = build_query(keywords, TYP_PUBLIKACJI)
        clear_output(wait=True)
        # Manual checkpoint: loop until the user accepts the query; any other
        # non-empty input replaces the query verbatim, empty input just redraws.
        while True:
            prompt_text = textwrap.dedent(f"""
            ================ KROK KONTROLNY: WERYFIKACJA STRATEGII ================
            Temat (PL): {TEMAT_BADAWCZY} | Filtr: {TYP_PUBLIKACJI} | Limit: {MAX_REKORDOW_DO_POBRANIA}
            Bieżąca kwerenda (PubMed): {finalna_kwerenda}
            ======================================================================
            """)
            print(prompt_text)
            decyzja = input("--> Wpisz 'tak', aby zaakceptować, lub wklej nową wersję kwerendy: ")
            if decyzja.lower().strip() in ['tak', 't', 'yes', 'y']: print("\n✅ Strategia zaakceptowana."); break
            elif decyzja.strip(): finalna_kwerenda = decyzja.strip(); clear_output(wait=True); print("🔄 Kwerenda zaktualizowana.")
            else: clear_output(wait=True)

        df_wszystkie = search_and_parse_pubmed(finalna_kwerenda, MAX_REKORDOW_DO_POBRANIA)
        if df_wszystkie is not None and not df_wszystkie.empty:
            df_ocenione = run_comprehensive_screening(df_wszystkie, TOPIC_EN, client, MODEL_LLM)
            df_ocenione_z_wiar = add_reliability_score(df_ocenione)
            if not df_ocenione_z_wiar.empty:
                print(f"\n--- [ETAP 3: SELEKCJA I SORTOWANIE] ---")
                # Keep articles at or above the relevance threshold, then rank by
                # reliability first and relevance second (both descending).
                # NOTE(review): this comparison assumes 'relevance_score' is numeric — confirm
                # the scoring step never yields strings.
                df_do_raportu = df_ocenione_z_wiar[df_ocenione_z_wiar['relevance_score'] >= PRÓG_TRAFNOŚCI].copy()
                df_do_raportu.sort_values(by=['reliability_score', 'relevance_score'], ascending=[False, False], inplace=True)
                df_final_report = df_do_raportu.head(LICZBA_ART_W_RAPORCIE)

                if not df_final_report.empty:
                    szybka_synteza = generate_quick_synthesis(df_final_report, TEMAT_BADAWCZY, client, MODEL_LLM)
                    generate_html_report(df_final_report, TEMAT_BADAWCZY, finalna_kwerenda, szybka_synteza)
                    # The same selection is also exported as CSV (utf-8-sig encoding).
                    df_final_report.to_csv("wyniki_do_raportu.csv", index=False, encoding='utf-8-sig')
                    display(HTML('<a href="EBM_Compass_Raport.html" target="_blank" style="font-size: 1.2em; font-weight: bold; color: #0056b3; display: block; text-align: center; margin-top: 20px;">➡️ Otwórz wygenerowany Raport HTML ⬅️</a>'))
                else:
                    print("\nBrak artykułów spełniających próg do wygenerowania raportu. Rozważ obniżenie progu.")
            else:
                print("\nNie udało się ocenić żadnego artykułu.")


Temat (PL): social jet lag | Filtr: Tylko przeglądowe (w tym narracyjne) | Limit: 500
Bieżąca kwerenda (PubMed): ((adolescents OR young adults OR adults OR students OR workers OR shift workers OR teenagers) AND (social jet lag OR circadian rhythm disruption OR sleep pattern OR sleep timing OR sleep deprivation OR chronotype OR weekend sleep OR sleep-wake cycle) AND (sleep quality OR mental health OR cognitive performance OR mood disorders OR fatigue OR daytime sleepiness OR academic performance OR physical health)) AND (english[Language]) AND (Review[Publication Type] OR "narrative review"[Title/Abstract])

--> Wpisz 'tak', aby zaakceptować, lub wklej nową wersję kwerendy: tak

✅ Strategia zaakceptowana.

--- [ETAP 1: IDENTYFIKACJA (limit: 500)] ---
✅ Znaleziono 500 publikacji. Pobieranie...


Pobieranie (PubMed):   0%|          | 0/500 [00:00<?, ?it/s]


--- [ETAP 2: OCENA AI] ---


🧠 Ocena AI:   0%|          | 0/499 [00:00<?, ?it/s]


--- [ETAP 3: SELEKCJA I SORTOWANIE] ---

--- [GENEROWANIE SZYBKIEJ SYNTEZY] ---
✅ Wygenerowano raport HTML: EBM_Compass_Raport.html
