# Kaggle Training Pipeline for IndianBatsModel

This notebook is designed to be imported into Kaggle to train the Bat Species Classifier.
It handles:
1.  Cloning the private repository (using a GitHub Token).
2.  Installing dependencies.
3.  **Generating Annotations**: Scans raw audio files and creates labels based on folder names.
4.  Preparing the data (Spectrogram generation + Feature extraction).
5.  Configuring and running the training loop.
6.  Saving the best model.

**Prerequisites:**
*   You need a GitHub Personal Access Token (Classic) with `repo` scope.
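*   Upload your audio data as Kaggle Datasets, attach them to this notebook, and update `RAW_AUDIO_DIRS` in the configuration cell so it matches their `/kaggle/input/...` paths.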


In [None]:
import os
import sys
import subprocess
import shutil
from pathlib import Path

print("Notebook Version: 1.2 (Robust Pathing)")

# --- CONFIGURATION ---
# PASTE YOUR GITHUB TOKEN HERE
# (Format: "ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
# NOTE: this value is a secret. Do not share or commit the notebook with a
# real token filled in.
GITHUB_TOKEN = "YOUR_TOKEN_HERE"

# Define Paths
# WORK_DIR is the root under which the repository is cloned and every
# generated artifact (annotations, spectrograms, features, model) is written.
WORK_DIR = '/kaggle/working'
REPO_NAME = 'IndianBatsModel'
# The token is embedded in the clone URL, so REPO_URL itself is sensitive.
REPO_URL = f'https://{GITHUB_TOKEN}@github.com/Quarkisinproton/IndianBatsModel.git'
REPO_DIR = os.path.join(WORK_DIR, REPO_NAME)

# Input Data Paths (Adjust these to match your Kaggle Dataset structure)
RAW_AUDIO_DIRS = [
    '/kaggle/input/pip-ceylonicusbat-species',
    '/kaggle/input/pip-tenuisbat-species',
]

# Output Paths
# JSON_DIR: Where the GENERATED annotations will be saved.
# Derived from WORK_DIR (like the other outputs) so that changing WORK_DIR
# relocates every artifact consistently.
JSON_DIR = os.path.join(WORK_DIR, 'data/annotations_json_folder')

SPECT_OUT = os.path.join(WORK_DIR, 'data/processed/spectrograms')
FEATURES_OUT = os.path.join(WORK_DIR, 'data/processed/features')
FEATURES_CSV = os.path.join(FEATURES_OUT, 'end_frequencies.csv')
MODEL_SAVE_PATH = os.path.join(WORK_DIR, 'models', 'bat_fused_best.pth')

print("Configuration set.")

In [None]:
# 1. Clone Repository
# Start from a clean checkout so files from an earlier run cannot linger.
if os.path.exists(REPO_DIR):
    print(f"Removing existing repo at {REPO_DIR}...")
    shutil.rmtree(REPO_DIR)

print("Cloning repository...")
# Mask the token whatever its prefix is: checking for "ghp" alone would let
# tokens with a different prefix (e.g. fine-grained "github_pat_...") be
# printed in clear text. The emptiness guard avoids str.replace("") padding.
safe_url = REPO_URL.replace(GITHUB_TOKEN, "********") if GITHUB_TOKEN else REPO_URL
print(f"Cloning from: {safe_url}")

try:
    # FORCE BRANCH 'main' to ensure we get the latest code
    subprocess.run(['git', 'clone', '-b', 'main', REPO_URL, REPO_NAME], cwd=WORK_DIR, check=True)
    print("Clone successful.")
except subprocess.CalledProcessError as e:
    # Do NOT print the exception itself: its message contains the full command
    # line, which includes the token-bearing REPO_URL.
    print(f"Error cloning repo: git exited with status {e.returncode}")
    # Fallback if token is missing/invalid, maybe the user uploaded the code manually?
    if not os.path.exists(REPO_DIR):
        print("CRITICAL: Repository not found. Please check your token or upload code manually.")
        # Stop here with a clear message instead of failing later in
        # os.chdir(). 'from None' keeps the token-bearing CalledProcessError
        # out of the displayed traceback.
        raise FileNotFoundError(
            f"Repository directory not found after clone attempt: {REPO_DIR}"
        ) from None

# Setup Environment
# Robustly find the 'src' directory (in case of nested folders like IndianBatsModel/IndianBatsModel)
PROJECT_ROOT = REPO_DIR
found_src = False
for root, dirs, files in os.walk(REPO_DIR):
    if 'src' in dirs:
        # The first directory (top-down) that contains 'src' becomes the root.
        PROJECT_ROOT = root
        found_src = True
        print(f"Found 'src' directory at: {os.path.join(root, 'src')}")
        print(f"Setting PROJECT_ROOT to: {PROJECT_ROOT}")
        break

if not found_src:
    print("WARNING: 'src' directory not found in repository. Listing files for debug:")
    subprocess.run(['find', REPO_DIR, '-maxdepth', '3', '-not', '-path', '*/.*'], check=False)

# Later cells use relative paths ('configs', 'src/train.py') and
# 'python -m src....', so the working directory must be the project root.
os.chdir(PROJECT_ROOT)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Debug: Print Commit Info
print("Current Commit:")
subprocess.run(['git', 'log', '-1'], check=False)

# Prepare env for subprocesses
env = os.environ.copy()
# Prepend the project root; skip the separator when PYTHONPATH was unset or
# empty so no stray empty entry is added.
existing_pythonpath = env.get('PYTHONPATH', '')
env['PYTHONPATH'] = (
    PROJECT_ROOT + os.pathsep + existing_pythonpath if existing_pythonpath else PROJECT_ROOT
)

In [None]:
# 2. Install Dependencies
# Packages are installed with pip through the interpreter that runs this
# notebook (sys.executable), so they land in the active environment.
print("Installing dependencies...")
pip_packages = ['librosa', 'pyyaml', 'pandas', 'matplotlib']
pip_cmd = [sys.executable, '-m', 'pip', 'install'] + pip_packages
# check=True: abort the cell if pip reports a failure.
subprocess.run(pip_cmd, check=True)
print("Dependencies installed.")

In [None]:
# 2.5 Generate Annotations (Since we don't have JSONs for all files)
# This step creates "dummy" annotations assuming each file contains one call of the species.
# We use 'folder' strategy: The folder name (e.g., 'pip-ceylonicusbat-species') becomes the label.
# If you want the filename to be the label, change label_strategy to 'filename'.

print("Generating Annotations...")
Path(JSON_DIR).mkdir(parents=True, exist_ok=True)

# Use PROJECT_ROOT found in step 1
gen_script = os.path.join(PROJECT_ROOT, 'src', 'data_prep', 'generate_annotations.py')
print(f"Looking for script at: {gen_script}")

if not os.path.exists(gen_script):
    print(f"ERROR: Script not found at {gen_script}")
    # Fallback search for the file specifically
    print("Searching for generate_annotations.py...")
    # Search with pathlib instead of shelling out to `find`: no output
    # decoding or newline splitting, and sorting makes the chosen match
    # deterministic.
    found_scripts = sorted(
        str(candidate)
        for candidate in Path(REPO_DIR).rglob('generate_annotations.py')
        if candidate.is_file()
    )

    if found_scripts:
        # Take the first match
        found_script = found_scripts[0]
        print(f"Found script at: {found_script}")
        gen_script = found_script
    else:
        print("CRITICAL: generate_annotations.py not found anywhere!")
        # Debug listing of the checkout (hidden paths excluded).
        subprocess.run(['find', REPO_DIR, '-maxdepth', '3', '-not', '-path', '*/.*'], check=False)
        # Fail here with a clear message rather than launching Python on a
        # path that is known not to exist.
        raise FileNotFoundError(f"generate_annotations.py not found under {REPO_DIR}")

cmd_gen_ann = [
    sys.executable, gen_script,  # Run by path to avoid module issues
    '--raw_audio_dirs', *RAW_AUDIO_DIRS,
    '--output_dir', JSON_DIR,
    '--label_strategy', 'folder',  # Change to 'filename' if you prefer
]
# env carries the PYTHONPATH prepared in step 1; check=True aborts on failure.
subprocess.run(cmd_gen_ann, check=True, env=env)
print("Annotation generation complete.")

In [None]:
# 3. Data Preparation: Spectrograms
# Create both output folders up front; FEATURES_OUT is used by the feature
# extraction step that follows.
for out_dir in (SPECT_OUT, FEATURES_OUT):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

print("Generating Spectrograms...")
# Assemble the command piece by piece: module invocation first, then options.
cmd_spect = [sys.executable, '-m', 'src.data_prep.wombat_to_spectrograms']
cmd_spect += ['--raw_audio_dir'] + list(RAW_AUDIO_DIRS)
cmd_spect += ['--json_dir', JSON_DIR]
cmd_spect += ['--out_dir', SPECT_OUT]
cmd_spect += ['--species_key', 'label']
# env supplies the PYTHONPATH set up in step 1; check=True stops on failure.
subprocess.run(cmd_spect, check=True, env=env)
print("Spectrogram generation complete.")

In [None]:
# 4. Data Preparation: Features
# Runs the end-frequency extraction module over the raw audio and generated
# annotations, writing its results to FEATURES_CSV.
print("Extracting Features...")
cmd_feat = [sys.executable, '-m', 'src.data_prep.extract_end_frequency']
cmd_feat.append('--raw_audio_dir')
cmd_feat.extend(RAW_AUDIO_DIRS)
cmd_feat.extend(['--json_dir', JSON_DIR])
cmd_feat.extend(['--out_csv', FEATURES_CSV])
cmd_feat.extend(['--species_key', 'label'])
# env supplies the PYTHONPATH set up in step 1; check=True stops on failure.
subprocess.run(cmd_feat, check=True, env=env)
print(f"Features extracted to {FEATURES_CSV}")

In [None]:
# 5. Create Configuration
# The path is relative, so the file lands in <cwd>/configs; step 1 changed the
# working directory to PROJECT_ROOT.
cfg_dir = Path('configs')
cfg_dir.mkdir(exist_ok=True)
cfg_path = cfg_dir / 'config.yaml'

# learning_rate is written as 1.0e-4 rather than 1e-4: PyYAML follows the
# YAML 1.1 float rules, which need a '.' in the mantissa, so a bare 1e-4 would
# be loaded as the string '1e-4' instead of a float.
# NOTE(review): num_classes is 3 while RAW_AUDIO_DIRS lists two species
# folders -- confirm this matches the labels the training code expects.
cfg_content = f"""
data:
  train_spectrograms: "{SPECT_OUT}"
  features_csv: "{FEATURES_CSV}"
  num_classes: 3

training:
  batch_size: 8
  learning_rate: 1.0e-4
  num_epochs: 10
  model_save_path: "{MODEL_SAVE_PATH}"
  num_workers: 2
"""
# Explicit encoding so the written bytes do not depend on the locale default.
cfg_path.write_text(cfg_content.strip(), encoding='utf-8')
print(f'Wrote config to {cfg_path}')

In [None]:
# 6. Run Training
# The other output folders are created before use; do the same for the model
# folder so saving the checkpoint cannot fail on a missing directory.
# (Harmless if the training script already creates it -- not visible here.)
Path(MODEL_SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)

print("Starting Training...")
# Relative script path: relies on the working directory being PROJECT_ROOT
# (set in step 1). env carries the PYTHONPATH prepared there.
cmd_train = [sys.executable, 'src/train.py']
subprocess.run(cmd_train, check=True, env=env)
print("Training Complete!")

# Confirm that the expected checkpoint file was actually produced.
if os.path.exists(MODEL_SAVE_PATH):
    print(f"Model saved successfully at: {MODEL_SAVE_PATH}")
else:
    print("Warning: Model file not found after training.")