In [1]:
%%capture
pip install ddgs


In [2]:
# --- CELL 1: SETUP & AUTHENTICATION (CORRECT ORDER) ---
import os
import google.generativeai as genai
from kaggle_secrets import UserSecretsClient

# 1. Retrieve Secrets & Set Env Vars FIRST
# (We do this BEFORE importing the kaggle library to prevent the crash)
user_secrets = UserSecretsClient()

try:
    # Get secrets
    gemini_key = user_secrets.get_secret("GOOGLE_API_KEY")
    k_user = user_secrets.get_secret("KAGGLE_USERNAME")
    k_key = user_secrets.get_secret("KAGGLE_KEY")

    # Set Environment Variables
    os.environ["KAGGLE_USERNAME"] = k_user
    os.environ["KAGGLE_KEY"] = k_key
    print("‚úÖ Environment variables set.")
    
    # 2. NOW it is safe to import the Kaggle API
    from kaggle.api.kaggle_api_extended import KaggleApi
    
    k_api = KaggleApi()
    k_api.authenticate()
    print("‚úÖ Kaggle API Authenticated.")

    # 3. Authenticate Gemini
    genai.configure(api_key=gemini_key)
    print("‚úÖ Gemini API Authenticated.")

except Exception as e:
    print(f"‚ùå Error: {e}")

‚úÖ Environment variables set.
‚úÖ Kaggle API Authenticated.
‚úÖ Gemini API Authenticated.


In [3]:
# --- 1. IMPORTS & SETUP ---
import os
import json
import glob
import time
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from ddgs import DDGS
from kaggle.api.kaggle_api_extended import KaggleApi
from kaggle_secrets import UserSecretsClient

# --- 2. AUTHENTICATION & CONFIG ---
def setup_system():
    user_secrets = UserSecretsClient()
    try:
        # Gemini Auth
        api_key = user_secrets.get_secret("GOOGLE_API_KEY")
        genai.configure(api_key=api_key)
        
        # Kaggle Auth
        k_user = user_secrets.get_secret("KAGGLE_USERNAME")
        k_key = user_secrets.get_secret("KAGGLE_KEY")
        
        # Set Env Vars (Safest method)
        os.environ["KAGGLE_USERNAME"] = k_user
        os.environ["KAGGLE_KEY"] = k_key
        
        # Initialize API
        api = KaggleApi()
        api.authenticate()
        return api
    except Exception as e:
        print(f"‚ùå Auth Error: {e}")
        return None

k_api = setup_system()

# UPDATED: Use the stable 1.5 Flash model
model_name = 'gemini-2.5-flash' 

# Safety Config (Block None to allow medical/technical discussions)
safety_config = {
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
}

# --- HELPER: SAFE GENERATION ---
# --- HELPER: ROBUST GENERATION WITH EXTENDED TIMEOUT ---
def safe_generate(model, prompt):
    """
    Uses standard generation but with a 10-minute timeout to allow 
    for long code blocks without triggering 504 or RST_STREAM errors.
    """
    try:
        # Request a massive timeout (600s) so it doesn't give up
        response = model.generate_content(
            prompt, 
            safety_settings=safety_config,
            request_options={'timeout': 3600} 
        )
        
        if response.text:
            return response.text
        else:
            return "Error: Model returned empty response."

    except Exception as e:
        return f"Error: Generation failed with {str(e)}"

# --- 3. AGENT DEFINITIONS ---

class ProblemScoutAgent:
    def run(self, slug):
        print(f"üïµÔ∏è [Scout] Analyzing competition: {slug}...")
        try:
            # RSNA Override
            if "rsna" in slug and "aneurysm" in slug:
                return "Goal: Detect intracranial aneurysms on 3D CT. Metric: Weighted Log Loss. Data: 85GB 3D DICOM."
            
            # API Fetch
            comps = k_api.competitions_list(search=slug.split('-')[0])
            target = next((c for c in comps if c.ref == slug), None)
            if target:
                return f"Goal: {target.description[:500]}... Metric: {target.evaluationMetric}."
            return "Could not fetch official details via API."
        except Exception as e:
            return f"Scout Error: {e}"

class ForumScoutAgent:
    def run(self, slug):
        print(f"üì° [Forum] Scanning discussions for: {slug}...")
        readable_name = slug.replace("-", " ")
        query = f"{readable_name} kaggle discussion solution tricks"
        try:
            with DDGS() as ddgs:
                results = list(ddgs.text(query, max_results=5))
            
            if not results: return "No discussions found."
            summary = "\n".join([f"- {r['title']}: {r['body'][:200]}..." for r in results])
            return summary
        except Exception as e:
            return f"Forum Search Error: {e}"

class NotebookScoutAgent:
    def run(self, slug):
        print(f"üë®‚Äçüíª [Notebook] Hunting for top code...")
        try:
            kernels = []
            try: kernels = k_api.kernels_list(competition=slug, sort_by='voteCount', page_size=1)
            except: pass
            
            if not kernels:
                print("   (Strict filter failed, trying broad search...)")
                kernels = k_api.kernels_list(search=slug.split('-')[0], sort_by='voteCount', page_size=1)
            
            if not kernels: return "No public code found."
            
            top_k = kernels[0]
            print(f"   (Found: {top_k.title})")
            
            k_api.kernels_pull(top_k.ref, path="./downloaded_code")
            
            code_content = ""
            files = glob.glob("./downloaded_code/*")
            for f_path in files:
                if f_path.endswith(".py"):
                    with open(f_path,'r') as f: code_content += f.read()
                elif f_path.endswith(".ipynb"):
                    with open(f_path,'r') as f:
                        nb = json.load(f)
                        for c in nb['cells']: 
                            if c['cell_type']=='code': code_content += "".join(c['source']) + "\n"
            
            if len(code_content) < 50: return "Code was empty."
            return f"--- CODE FROM {top_k.title} ---\n{code_content[:25000]}"
        except Exception as e:
            return f"Code Download Error: {e}"

class StrategistAgent:
    def run(self, slug, goal, forum_intel, code_intel):
        print(f"üß† [Strategist] Formulating plan...")
        
        prompt = f"""
        Act as a Kaggle Grandmaster.
        Competition: {slug}
        
        1. OFFICIAL GOAL: {goal}
        2. COMMUNITY INTEL: {forum_intel}
        3. EXISTING CODE BASELINE: {code_intel[:2000]}... (truncated)
        
        Task: Write a 'Winning Strategy' report.
        - Critique the baseline.
        - Identify the specific model architecture we should build.
        - Suggest 1 specific data augmentation or feature engineering technique.
        - Define the validation strategy.
        """
        
        model = genai.GenerativeModel(model_name)
        return safe_generate(model, prompt)

class CodeGeneratorAgent:
    def run(self, strategy_report):
        print(f"üèóÔ∏è [Builder] Writing final solution.py...")
        
        prompt = f"""
        You are an expert Python Developer.
        
        Based on this strategy:
        {strategy_report}
        
        Write a COMPLETE, RUNNABLE 'main.py' script.
        - Include Dataset class, Model class, and Training Loop.
        - Use PyTorch.
        - Handle the specific data types mentioned (e.g. Images, CSVs).
        """
        
        model = genai.GenerativeModel(model_name)
        return safe_generate(model, prompt)

# --- 4. THE ORCHESTRATOR ---

class KaggleCommandSystem:
    def __init__(self):
        self.scout = ProblemScoutAgent()
        self.forum = ForumScoutAgent()
        self.notebook = NotebookScoutAgent()
        self.strategist = StrategistAgent()
        self.builder = CodeGeneratorAgent()
        
    def execute(self, competition_slug):
        print(f"üöÄ STARTING KAGGLE COMMAND FOR: {competition_slug}\n" + "="*50)
        
        goal_data = self.scout.run(competition_slug)
        forum_data = self.forum.run(competition_slug)
        code_data = self.notebook.run(competition_slug)
        
        strategy = self.strategist.run(competition_slug, goal_data, forum_data, code_data)
        
        # If strategy failed, don't try to build code
        if "Error" in strategy and len(strategy) < 100:
            return {"strategy_report": strategy, "final_code": "Skipped due to strategy error."}
            
        final_code = self.builder.run(strategy)
        
        return {
            "strategy_report": strategy,
            "final_code": final_code
        }

# --- 5. EXECUTION ---
system = KaggleCommandSystem()
slug = "rsna-intracranial-aneurysm-detection" 
# You can change 'slug' to 'hull-tactical-speed-dating' or any other active comp to test

try:
    result = system.execute(slug)
    
    from IPython.display import Markdown, display
    print("\n" + "="*50)
    display(Markdown(f"## üìÑ STRATEGY REPORT\n{result['strategy_report']}"))
    print("\n" + "="*50)
    display(Markdown(f"## üêç GENERATED CODE\n{result['final_code']}"))
    
    with open("submission.py", "w") as f:
        f.write(result['final_code'])
    print("‚úÖ Code saved to 'submission.py'")

except Exception as e:
    print(f"‚ùå Fatal System Error: {e}")

üöÄ STARTING KAGGLE COMMAND FOR: rsna-intracranial-aneurysm-detection
üïµÔ∏è [Scout] Analyzing competition: rsna-intracranial-aneurysm-detection...
üì° [Forum] Scanning discussions for: rsna-intracranial-aneurysm-detection...
üë®‚Äçüíª [Notebook] Hunting for top code...
   (Found: RSNA Aneurysm Detection Demo Submission)
üß† [Strategist] Formulating plan...
üèóÔ∏è [Builder] Writing final solution.py...



## üìÑ STRATEGY REPORT
As a Kaggle Grandmaster, let's formulate a winning strategy for the RSNA Intracranial Aneurysm Detection competition. The objective is clear: detect and localize intracranial aneurysms on 3D CT scans, optimizing for Weighted Log Loss within strict inference time limits.

---

## Winning Strategy: RSNA Intracranial Aneurysm Detection

### 1. Critique of the Baseline

The provided baseline code (`predict` function) is a *minimal submission template*, not an actual aneurysm detection solution. Its critical shortcomings are:

1.  **No Predictive Logic:** It completely lacks any machine learning or image processing for aneurysm detection. It simply reads DICOM files and returns hardcoded, non-informative probabilities (0.5 for 'Aneurysm Present', 0.1 for others). This would score extremely poorly.
2.  **Lack of Preprocessing:** While it identifies DICOM tags, it performs no critical preprocessing steps like Hounsfield Unit (HU) windowing, intensity normalization, resampling to a common resolution, or handling varying slice thicknesses ‚Äì all essential for 3D medical image analysis.
3.  **Inefficient Data Handling:** It iterates through all files and uses `pydicom`, which is standard, but doesn't hint at optimized 3D volume reconstruction or batched processing, crucial for performance on large datasets.
4.  **No Model Integration:** There's no placeholder for loading a trained model, performing inference, or processing the model's raw output into the required 14-label probability format.

**Conclusion:** The baseline serves only as a structural guide for submission, providing no competitive advantage whatsoever. Our entire strategy must revolve around replacing its core with a robust, high-performance deep learning pipeline.

### 2. Model Architecture: 3D Multi-Scale U-Net with Attention

Given the 3D nature of the data, the need to detect small, irregularly shaped objects (aneurysms) in specific anatomical locations, and the computational constraints, a robust 3D segmentation-based approach is optimal.

**Proposed Architecture: 3D UNETR (UNETR: Transformers for 3D Medical Image Segmentation)**

*   **Rationale:**
    *   **3D Capability:** Designed for volumetric data, directly processing the 3D CT scans.
    *   **Encoder-Decoder Structure (U-Net style):** Provides excellent spatial context and fine-grained localization.
    *   **Vision Transformer (ViT) Encoder:** The core innovation of UNETR is using a ViT as the encoder. This allows for capturing global dependencies and long-range contextual information, which is critical for identifying subtle aneurysms amidst complex vascular structures. ViTs have demonstrated superior performance in various vision tasks, including medical imaging, by overcoming the local receptive field limitations of traditional CNNs.
    *   **Multi-scale Feature Fusion:** The decoder directly leverages skip connections from the ViT encoder to fuse high-resolution spatial information with the rich semantic features learned by the transformer, ensuring both accurate localization and fine detail preservation.
    *   **Multi-Head Output:** Instead of a single segmentation map, the decoder will output **14 distinct segmentation probability maps** (13 anatomical locations + 1 general 'Aneurysm Present' map). This allows the model to learn distinct features for each aneurysm type/location.
*   **Implementation Details:**
    *   **Input Preprocessing:** Standardize HU values (e.g., windowing to a relevant range like -100 to 400 for vascular structures, clipping, and then normalizing to [0,1]). Resample all volumes to a consistent isotropic resolution (e.g., 0.5mm x 0.5mm x 0.5mm) to handle varying slice thickness and pixel spacing.
    *   **Patching Strategy:** The ViT encoder will process input volumes as sequences of 3D patches. Overlapping patches or hierarchical patching might be considered for better context.
    *   **Loss Function:** A combination of **Dice Loss** (for segmentation, to handle class imbalance at the voxel level) and **Weighted Log Loss** (for the final aggregated probabilities, directly optimizing the competition metric). The 'Aneurysm Present' map can be trained to be an aggregation of all specific location maps.
    *   **Aggregation for Final Output:** After obtaining 14 voxel-wise probability maps, we will define **pre-computed anatomical regions of interest (ROIs)** for each of the 13 locations. For each ROI, we'll extract the **maximum probability** from the corresponding segmentation map. The 'Aneurysm Present' probability can be derived from the global 'Aneurysm Present' segmentation map (e.g., max probability across the entire brain) or as the maximum probability across all 13 location-specific probabilities. This allows for precise localization while providing the required global and regional classification probabilities.

### 3. Data Augmentation / Feature Engineering: **Advanced Anomaly Synthesis (Feature Engineering)**

Instead of standard augmentations like rotation/scaling (which will also be applied), let's focus on a powerful feature engineering technique that directly addresses the core challenge: detecting rare events.

**Specific Technique: Synthetic Aneurysm Generation and Insertion**

*   **Problem:** Aneurysms are rare, leading to extreme class imbalance. Models often struggle to learn robust features for rare positive samples.
*   **Methodology:**
    1.  **Extract Real Aneurysm Patches:** From positive training cases, extract 3D patches (e.g., 32x32x32 voxels) centered around detected aneurysms. These patches capture the true morphology and appearance of aneurysms.
    2.  **Deformation and Variation:** Apply subtle random transformations (scaling, rotation, non-rigid deformations, intensity variations) to these extracted aneurysm patches to create diverse synthetic variations. This mimics natural biological variability.
    3.  **Insert into Negative Cases:** Select regions in aneurysm-free (negative) CT scans that are anatomically plausible for an aneurysm (e.g., near vessel bifurcations, based on a vascular tree segmentation). Carefully "paste" or blend the synthetic aneurysm patches into these locations.
    4.  **Label Generation:** Generate corresponding ground truth segmentation masks and classification labels for these newly synthesized aneurysms.
*   **Benefits:**
    *   **Massively Increases Positive Samples:** Artificially inflates the number of aneurysm instances, mitigating extreme class imbalance.
    *   **Improves Generalization:** By placing synthetic aneurysms in diverse anatomical contexts and varying their appearance, the model learns to detect them more robustly, reducing overfitting to specific aneurysm presentations in the training data.
    *   **Focuses Model Attention:** Forces the model to learn features that distinguish real aneurysms from normal vessel structures, even in challenging, aneurysm-free backgrounds.
    *   **Addresses Small Object Detection:** Provides more examples of small, hard-to-detect objects, which is critical for aneurysm detection.

### 4. Validation Strategy: Stratified Patient-Level K-Fold Cross-Validation with Custom Weighted Log Loss

A robust validation strategy is paramount for ensuring our model generalizes well and accurately reflects competition performance.

1.  **Patient-Level K-Fold Split:**
    *   **Critical:** Data from the same patient *must* only appear in either the training or validation set, never both. Using `StudyInstanceUID` or `PatientID` to group studies ensures no data leakage.
    *   **K-Folds:** Use 5-Fold Cross-Validation (K=5). This provides a good balance between leveraging enough data for training each fold and having a stable estimate of out-of-fold performance.
2.  **Stratification:**
    *   **Primary Stratification:** Stratify folds based on the **'Aneurysm Present'** label. This ensures each fold has a representative distribution of positive and negative cases.
    *   **Secondary Stratification (Optional but Recommended):** Further stratify by the *number* of aneurysms per patient or the presence of aneurysms in specific, challenging locations (e.g., 'Basilar Tip'), especially if these locations are particularly rare or have higher weights in the metric. This helps ensure all challenging scenarios are present in each fold's validation set.
3.  **Metric Matching:**
    *   **Custom Weighted Log Loss:** Implement the exact competition metric (Weighted Log Loss, including the specific weights for 'Aneurysm Present' and individual locations) as our primary validation metric. This ensures we are optimizing directly for the competition's objective.
    *   **Monitoring:** Track both the overall Weighted Log Loss and individual Log Loss for 'Aneurysm Present' and specific locations during training to identify areas of weakness.
4.  **Ensembling and Confidence Calibration:**
    *   **Model Averaging:** Train an independent model for each of the K folds. For final submission, average the predictions from all K models (test-time augmentation could also be used). This significantly reduces variance and improves robustness.
    *   **Temperature Scaling / Platt Scaling:** After training, apply a post-processing calibration technique (e.g., temperature scaling on a held-out calibration set or validation set) to fine-tune the output probabilities. Weighted Log Loss is sensitive to well-calibrated probabilities.
5.  **Inference Time Simulation:**
    *   **Local Test Set:** Create a small, representative local test set to simulate the inference environment. Regularly test model inference speed and memory footprint on this set to ensure compliance with the 30-minute per series limit. This is crucial given the 3D data and potential for large models.

This comprehensive strategy, from advanced architecture and data engineering to rigorous validation, aims to build a robust and high-performing solution for aneurysm detection.




## üêç GENERATED CODE
Error: Generation failed with Timeout of 600.0s exceeded, last exception: 503 upstream request timeout

‚úÖ Code saved to 'submission.py'
