In [1]:
%%capture
pip install ddgs


In [2]:
# --- CELL 1: SETUP & AUTHENTICATION (CORRECT ORDER) ---
import os
import google.generativeai as genai
from kaggle_secrets import UserSecretsClient

# 1. Retrieve Secrets & Set Env Vars FIRST
# (We do this BEFORE importing the kaggle library to prevent the crash)
user_secrets = UserSecretsClient()

try:
    # Get secrets
    gemini_key = user_secrets.get_secret("GOOGLE_API_KEY")
    k_user = user_secrets.get_secret("KAGGLE_USERNAME")
    k_key = user_secrets.get_secret("KAGGLE_KEY")

    # Set Environment Variables
    os.environ["KAGGLE_USERNAME"] = k_user
    os.environ["KAGGLE_KEY"] = k_key
    print("‚úÖ Environment variables set.")
    
    # 2. NOW it is safe to import the Kaggle API
    from kaggle.api.kaggle_api_extended import KaggleApi
    
    k_api = KaggleApi()
    k_api.authenticate()
    print("‚úÖ Kaggle API Authenticated.")

    # 3. Authenticate Gemini
    genai.configure(api_key=gemini_key)
    print("‚úÖ Gemini API Authenticated.")

except Exception as e:
    print(f"‚ùå Error: {e}")

‚úÖ Environment variables set.
‚úÖ Kaggle API Authenticated.
‚úÖ Gemini API Authenticated.


In [3]:
# --- 1. IMPORTS & SETUP ---
import os
import json
import glob
import time
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from ddgs import DDGS
from kaggle.api.kaggle_api_extended import KaggleApi
from kaggle_secrets import UserSecretsClient

# --- 2. AUTHENTICATION & CONFIG ---
def setup_system():
    user_secrets = UserSecretsClient()
    try:
        # Gemini Auth
        api_key = user_secrets.get_secret("GOOGLE_API_KEY")
        genai.configure(api_key=api_key)
        
        # Kaggle Auth
        k_user = user_secrets.get_secret("KAGGLE_USERNAME")
        k_key = user_secrets.get_secret("KAGGLE_KEY")
        
        # Set Env Vars (Safest method)
        os.environ["KAGGLE_USERNAME"] = k_user
        os.environ["KAGGLE_KEY"] = k_key
        
        # Initialize API
        api = KaggleApi()
        api.authenticate()
        return api
    except Exception as e:
        print(f"‚ùå Auth Error: {e}")
        return None

k_api = setup_system()

# UPDATED: Use the stable 1.5 Flash model
model_name = 'gemini-2.5-flash' 

# Safety Config (Block None to allow medical/technical discussions)
safety_config = {
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
}

# --- HELPER: SAFE GENERATION ---
# --- HELPER: ROBUST STREAMING GENERATION ---
def safe_generate(model, prompt):
    """
    Uses streaming to prevent 504 Timeouts on long code generation tasks.
    """
    try:
        # Enable streaming
        response_iterator = model.generate_content(prompt, safety_settings=safety_config, stream=True)
        
        full_text = ""
        # Iterate through the chunks as they arrive
        for chunk in response_iterator:
            if chunk.text:
                full_text += chunk.text
        
        if not full_text:
            return "Error: Model returned empty response."
            
        return full_text

    except ValueError:
        return "Error: Model finished but generated no text (Silent Block)."
    except Exception as e:
        return f"Error: Generation failed with {str(e)}"

# --- 3. AGENT DEFINITIONS ---

class ProblemScoutAgent:
    def run(self, slug):
        print(f"üïµÔ∏è [Scout] Analyzing competition: {slug}...")
        try:
            # RSNA Override
            if "rsna" in slug and "aneurysm" in slug:
                return "Goal: Detect intracranial aneurysms on 3D CT. Metric: Weighted Log Loss. Data: 85GB 3D DICOM."
            
            # API Fetch
            comps = k_api.competitions_list(search=slug.split('-')[0])
            target = next((c for c in comps if c.ref == slug), None)
            if target:
                return f"Goal: {target.description[:500]}... Metric: {target.evaluationMetric}."
            return "Could not fetch official details via API."
        except Exception as e:
            return f"Scout Error: {e}"

class ForumScoutAgent:
    def run(self, slug):
        print(f"üì° [Forum] Scanning discussions for: {slug}...")
        readable_name = slug.replace("-", " ")
        query = f"{readable_name} kaggle discussion solution tricks"
        try:
            with DDGS() as ddgs:
                results = list(ddgs.text(query, max_results=5))
            
            if not results: return "No discussions found."
            summary = "\n".join([f"- {r['title']}: {r['body'][:200]}..." for r in results])
            return summary
        except Exception as e:
            return f"Forum Search Error: {e}"

class NotebookScoutAgent:
    def run(self, slug):
        print(f"üë®‚Äçüíª [Notebook] Hunting for top code...")
        try:
            kernels = []
            try: kernels = k_api.kernels_list(competition=slug, sort_by='voteCount', page_size=1)
            except: pass
            
            if not kernels:
                print("   (Strict filter failed, trying broad search...)")
                kernels = k_api.kernels_list(search=slug.split('-')[0], sort_by='voteCount', page_size=1)
            
            if not kernels: return "No public code found."
            
            top_k = kernels[0]
            print(f"   (Found: {top_k.title})")
            
            k_api.kernels_pull(top_k.ref, path="./downloaded_code")
            
            code_content = ""
            files = glob.glob("./downloaded_code/*")
            for f_path in files:
                if f_path.endswith(".py"):
                    with open(f_path,'r') as f: code_content += f.read()
                elif f_path.endswith(".ipynb"):
                    with open(f_path,'r') as f:
                        nb = json.load(f)
                        for c in nb['cells']: 
                            if c['cell_type']=='code': code_content += "".join(c['source']) + "\n"
            
            if len(code_content) < 50: return "Code was empty."
            return f"--- CODE FROM {top_k.title} ---\n{code_content[:25000]}"
        except Exception as e:
            return f"Code Download Error: {e}"

class StrategistAgent:
    def run(self, slug, goal, forum_intel, code_intel):
        print(f"üß† [Strategist] Formulating plan...")
        
        prompt = f"""
        Act as a Kaggle Grandmaster.
        Competition: {slug}
        
        1. OFFICIAL GOAL: {goal}
        2. COMMUNITY INTEL: {forum_intel}
        3. EXISTING CODE BASELINE: {code_intel[:2000]}... (truncated)
        
        Task: Write a 'Winning Strategy' report.
        - Critique the baseline.
        - Identify the specific model architecture we should build.
        - Suggest 1 specific data augmentation or feature engineering technique.
        - Define the validation strategy.
        """
        
        model = genai.GenerativeModel(model_name)
        return safe_generate(model, prompt)

class CodeGeneratorAgent:
    def run(self, strategy_report):
        print(f"üèóÔ∏è [Builder] Writing final solution.py...")
        
        prompt = f"""
        You are an expert Python Developer.
        
        Based on this strategy:
        {strategy_report}
        
        Write a COMPLETE, RUNNABLE 'main.py' script.
        - Include Dataset class, Model class, and Training Loop.
        - Use PyTorch.
        - Handle the specific data types mentioned (e.g. Images, CSVs).
        """
        
        model = genai.GenerativeModel(model_name)
        return safe_generate(model, prompt)

# --- 4. THE ORCHESTRATOR ---

class KaggleCommandSystem:
    def __init__(self):
        self.scout = ProblemScoutAgent()
        self.forum = ForumScoutAgent()
        self.notebook = NotebookScoutAgent()
        self.strategist = StrategistAgent()
        self.builder = CodeGeneratorAgent()
        
    def execute(self, competition_slug):
        print(f"üöÄ STARTING KAGGLE COMMAND FOR: {competition_slug}\n" + "="*50)
        
        goal_data = self.scout.run(competition_slug)
        forum_data = self.forum.run(competition_slug)
        code_data = self.notebook.run(competition_slug)
        
        strategy = self.strategist.run(competition_slug, goal_data, forum_data, code_data)
        
        # If strategy failed, don't try to build code
        if "Error" in strategy and len(strategy) < 100:
            return {"strategy_report": strategy, "final_code": "Skipped due to strategy error."}
            
        final_code = self.builder.run(strategy)
        
        return {
            "strategy_report": strategy,
            "final_code": final_code
        }

# --- 5. EXECUTION ---
system = KaggleCommandSystem()
slug = "rsna-intracranial-aneurysm-detection" 
# You can change 'slug' to 'hull-tactical-speed-dating' or any other active comp to test

try:
    result = system.execute(slug)
    
    from IPython.display import Markdown, display
    print("\n" + "="*50)
    display(Markdown(f"## üìÑ STRATEGY REPORT\n{result['strategy_report']}"))
    print("\n" + "="*50)
    display(Markdown(f"## üêç GENERATED CODE\n{result['final_code']}"))
    
    with open("submission.py", "w") as f:
        f.write(result['final_code'])
    print("‚úÖ Code saved to 'submission.py'")

except Exception as e:
    print(f"‚ùå Fatal System Error: {e}")

üöÄ STARTING KAGGLE COMMAND FOR: rsna-intracranial-aneurysm-detection
üïµÔ∏è [Scout] Analyzing competition: rsna-intracranial-aneurysm-detection...
üì° [Forum] Scanning discussions for: rsna-intracranial-aneurysm-detection...
üë®‚Äçüíª [Notebook] Hunting for top code...
   (Found: RSNA Aneurysm Detection Demo Submission)
üß† [Strategist] Formulating plan...
üèóÔ∏è [Builder] Writing final solution.py...



## üìÑ STRATEGY REPORT
Alright team, let's dissect this RSNA Intracranial Aneurysm Detection challenge. This is a classic 3D medical image problem, and getting it right means paying meticulous attention to data handling, robust model design, and a bulletproof validation strategy. The 85GB data size is substantial, indicating a need for efficient processing.

Here's my winning strategy report:

---

### Winning Strategy Report: RSNA Intracranial Aneurysm Detection

As Kaggle Grandmasters, our goal is to deliver a robust, high-performing solution that generalizes well to unseen patient data. The Weighted Log Loss metric demands not just accuracy but well-calibrated probability predictions across multiple aneurysm locations.

#### 1. Critique of the Baseline

The provided `kaggle_evaluation.rsna_inference_server` baseline is, frankly, not a baseline in the machine learning sense. It's a **placeholder demonstrating the submission system's API**.

*   **No Predictive Logic:** It contains no actual code for detecting aneurysms or processing imaging data beyond iterating through file paths.
*   **No Image Processing:** It doesn't load pixel data, perform Hounsfield Unit (HU) normalization, re-sample to a common spacing, or stack DICOM slices into a 3D volume. These are fundamental steps for any 3D CT analysis.
*   **No Model:** There's no neural network, no classical ML algorithm, nor any inference logic whatsoever.
*   **Purpose:** Its sole purpose is to illustrate the `predict(series_path)` function signature and the expected `pl.DataFrame` or `pd.DataFrame` output format with the `LABEL_COLS`.

**Conclusion:** We must build the entire machine learning pipeline from scratch, starting with robust data loading and pre-processing, followed by a sophisticated 3D deep learning model. The baseline only serves as a guide for the *inference environment interaction*.

#### 2. Specific Model Architecture

Given the task of detecting and localizing aneurysms across multiple specific anatomical locations in 3D CT scans, we need a powerful 3D Convolutional Neural Network (CNN) architecture. The multi-label nature (13 specific locations + "Aneurysm Present") necessitates a model capable of learning distinct features for each region.

**Recommended Architecture: 3D ResNeXt-50 with Multi-Heads**

*   **Base Architecture:** A **3D ResNeXt-50** will serve as our primary feature extractor. ResNeXt models are known for their strong performance, efficiency, and ability to learn rich, diverse features through "cardinality" (group convolutions). Adapting it to 3D allows it to naturally process volumetric data.
*   **Why 3D ResNeXt-50?**
    *   **Volumetric Understanding:** 3D convolutions are essential for capturing spatial relationships and contextual information within the CT volume, crucial for aneurysm detection which relies on shape, size, and surrounding vessel structures.
    *   **Performance:** ResNeXt variants have consistently shown top performance in medical imaging tasks, balancing depth, width, and computational cost effectively.
    *   **Feature Richness:** The grouped convolutions improve feature representation without a proportional increase in parameters compared to standard convolutions.
*   **Input & Preprocessing:**
    *   All DICOM slices for a `SeriesInstanceUID` will be stacked into a single 3D volume.
    *   The volume will be resampled to a consistent isotropic spacing (e.g., `1.0 x 1.0 x 1.0 mm¬≥`) to standardize input resolution.
    *   Hounsfield Units (HU) will be normalized and clipped (e.g., `[-100, 250]` for brain tissue, then min-max scaled to `[0, 1]`) to focus on relevant tissue and make intensities consistent.
    *   Input volumes will be center-cropped or padded to a fixed size (e.g., `128x128x128` or `256x256x256`) to fit GPU memory.
*   **Output Heads:** The model will branch into multiple independent classification heads after the main ResNeXt backbone.
    *   One sigmoid-activated output neuron for each of the 13 specific aneurysm location labels.
    *   One additional sigmoid-activated output neuron for the global `Aneurysm Present` label.
    *   Each head will predict a probability, allowing for multi-label classification.
*   **Loss Function:** A combination of Binary Cross-Entropy (BCE) loss for each label, potentially weighted to account for class imbalance (especially for the rare aneurysm types).

**Framework:** We will leverage `MONAI` (Medical Open Network for AI) for its optimized 3D building blocks, pre-trained weights (if available, or for related tasks), and robust data loading/augmentation pipelines, accelerating development and ensuring best practices.

#### 3. Specific Data Augmentation / Feature Engineering Technique

**Technique: 3D Elastic Deformations**

*   **Description:** Elastic deformations are a powerful geometric augmentation technique where a dense displacement field is applied to the image (and corresponding labels/masks, if applicable). This simulates realistic non-rigid transformations that occur in biological tissues, such as variations in patient posture, anatomical variability, or subtle organ motion.
*   **Implementation:** We will generate a random displacement field (e.g., using Gaussian smoothing of random vectors) and then use interpolation to apply these deformations to the 3D CT volume.
*   **Why it's crucial here:**
    1.  **Mimics Anatomical Variability:** Brain structures and blood vessel paths naturally vary from person to person. Elastic deformations introduce these subtle, non-linear shape changes, making the model more robust to inter-patient anatomical differences.
    2.  **Improves Generalization:** By exposing the model to slightly warped versions of existing aneurysms and healthy structures, it learns to identify aneurysms based on their intrinsic features rather than specific spatial configurations tied to the training data.
    3.  **Data Augmentation for Limited Data:** Medical imaging datasets, while large in GB, can be limited in terms of unique aneurysm cases. Elastic deformations effectively increase the diversity of our training data without requiring more raw scans.

We will combine this with other standard augmentations like random rotations, translations, scaling, and intensity adjustments (e.g., random brightness/contrast, Gaussian noise).

#### 4. Validation Strategy

**Strategy: Patient-Stratified K-Fold Cross-Validation**

*   **Core Principle:** It is absolutely critical to split data at the *patient level*. Any series from the same patient must belong exclusively to either the training or validation set. Failing to do so would lead to data leakage, where the model might "memorize" patient-specific characteristics rather than generalize to new patients, resulting in overly optimistic validation scores that don't reflect real-world performance.
*   **Steps:**
    1.  **Identify Unique Patients:** Extract all unique `PatientID`s from the training metadata.
    2.  **Stratification Target:** The `Aneurysm Present` label is the most critical for stratification, as aneurysm cases are likely rare. For each `PatientID`, determine if *any* of their associated series contain an aneurysm (`Aneurysm Present = 1`). We can create a stratification target that balances the number of "aneurysm-positive" and "aneurysm-negative" patients across folds.
    3.  **K-Fold Split:** Divide the unique `PatientID`s into `K` (e.g., K=5) folds, ensuring that the distribution of aneurysm-positive patients is as even as possible across all folds.
    4.  **Training and Validation:** For each fold `i` (from 1 to K):
        *   **Training Set:** All series belonging to patients in the `K-1` other folds.
        *   **Validation Set:** All series belonging to patients in fold `i`.
    5.  **Out-Of-Fold (OOF) Predictions:** During training, generate predictions for the validation set of each fold. These OOF predictions can be concatenated to form a complete set of predictions for the entire training dataset, which is invaluable for:
        *   Estimating true model performance.
        *   Ensembling multiple models (e.g., blending OOF predictions from different folds/models).
        *   Post-processing and calibration of probabilities.
    6.  **Ensembling (Optional but Recommended):** Train multiple models (different seeds, slightly different architectures, or data splits) and average their predictions on the test set. OOF predictions help confirm which models blend well.

This strategy will provide a robust estimate of our model's performance on unseen patients and is the gold standard for medical image competitions.

---

This comprehensive strategy, from data handling and model architecture to augmentation and rigorous validation, sets us on the path to a top solution. Iterative refinement based on validation results will be key. Let's get to work!




## üêç GENERATED CODE
Error: Generation failed with 500 Received RST_STREAM with error code 2

‚úÖ Code saved to 'submission.py'
