# LARUN TinyML - Cloud Training Notebook

**Train the LARUN exoplanet detection model using free GPU resources**

Created by: Padmanaban Veeraragavalu (Larun Engineering)

---

## Instructions:
1. Open in Google Colab: `File > Open in Colab`
2. Enable GPU: `Runtime > Change runtime type > GPU`
3. Run all cells: `Runtime > Run all`
4. Download the trained model archive (`larun_trained.zip`) once training completes

In [None]:
# Step 1: Check GPU availability
!nvidia-smi
import tensorflow as tf
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

In [None]:
# Step 2: Install dependencies
!pip install -q lightkurve astroquery astropy scikit-learn

In [None]:
# Step 3: Clone LARUN repository
!git clone https://github.com/Paddy1981/larun.git
%cd larun

In [None]:
# Step 4: Configuration
# Tunable settings read by the data-fetching, model-building and training cells.

# --- Dataset size ---
# Number of exoplanet host stars
NUM_PLANETS = 100
# Number of non-planet stars
NUM_NON_PLANETS = 100

# --- Training ---
# Training epochs
EPOCHS = 100
# Batch size (larger with GPU)
BATCH_SIZE = 32

# --- Model input ---
# Light curve length
INPUT_SIZE = 1024

In [None]:
# Step 5: Parallel Data Fetching (faster than sequential)
import random
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import lightkurve as lk
from astroquery.nasa_exoplanet_archive import NasaExoplanetArchive
import warnings
warnings.filterwarnings('ignore')

print("Fetching confirmed exoplanet hosts from NASA...")

# Get confirmed exoplanets whose discovery facility mentions TESS or Kepler
planets_table = NasaExoplanetArchive.query_criteria(
    table="pscomppars",
    select="hostname,pl_name,disc_facility",
    where="disc_facility like '%TESS%' or disc_facility like '%Kepler%'"
)

# Get unique host stars.
# A set of strings has no stable iteration order between Python processes (string
# hashing is randomised per process), so slicing list(set(...)) selected a different
# subset of hosts on every kernel restart. Sort the unique names first, then draw a
# fixed-seed sample so the chosen hosts are the same on every run.
unique_hosts = sorted(set(planets_table['hostname'].data.tolist()))
planet_hosts = random.Random(42).sample(unique_hosts, min(NUM_PLANETS, len(unique_hosts)))
print(f"Found {len(planet_hosts)} exoplanet host stars")

In [None]:
# Step 6: Parallel light curve fetching function
import logging

# Failures are reported at DEBUG level so the default notebook output stays quiet.
# Run `logging.basicConfig(level=logging.DEBUG)` to see why individual targets were skipped.
_fetch_log = logging.getLogger("larun.fetch")


def _resample_flux(flux, size):
    """Return `flux` padded or center-cropped to exactly `size` samples."""
    n_samples = len(flux)
    if n_samples < size:
        # Pad the tail using numpy's 'median' mode (fills with the median of the series).
        return np.pad(flux, (0, size - n_samples), mode='median')
    # Take center portion
    start = (n_samples - size) // 2
    return flux[start:start + size]


def fetch_lightcurve(target, label, timeout=60):
    """Fetch and process a single light curve.

    Searches TESS and Kepler for `target`, downloads the first search result,
    removes NaNs, normalizes, clips upward outliers, and resamples the flux to
    INPUT_SIZE samples.

    Args:
        target: Target identifier passed to `lk.search_lightcurve`.
        label: Class label stored unchanged in the returned record.
        timeout: Accepted for backward compatibility; it is not used and no
            timeout is enforced on the download.

    Returns:
        A dict with keys 'flux' (array of length INPUT_SIZE), 'label' and
        'target', or None when nothing usable could be obtained. This function
        is best-effort: any exception is logged at DEBUG level and turned into
        None so that one bad target cannot abort a parallel batch.
    """
    try:
        search = lk.search_lightcurve(target, mission=['TESS', 'Kepler'])
        if len(search) == 0:
            _fetch_log.debug("No light curve found for %s", target)
            return None

        lc = search[0].download(quality_bitmask='default')
        if lc is None:
            _fetch_log.debug("Download returned nothing for %s", target)
            return None

        # Clip only upward outliers. A symmetric sigma=3 clip also removes points
        # far BELOW the median, which is exactly where transit dips are, so it
        # would erase the signal the classifier is meant to learn.
        lc = lc.remove_nans().normalize().remove_outliers(
            sigma_upper=3, sigma_lower=float('inf')
        )

        flux = lc.flux.value
        if len(flux) == 0:
            _fetch_log.debug("Light curve for %s is empty after cleaning", target)
            return None

        # Resample to fixed size
        flux = _resample_flux(flux, INPUT_SIZE)

        return {'flux': flux, 'label': label, 'target': target}
    except Exception as exc:
        _fetch_log.debug("Skipping %s: %s", target, exc)
        return None

print("Parallel fetch function ready.")

In [None]:
# Step 7: Fetch data in parallel (MUCH faster!)
from tqdm.notebook import tqdm

# Parallel downloads
MAX_WORKERS = 8
data = []

print(f"Fetching {NUM_PLANETS} exoplanet hosts (parallel)...")

# Submit one download per host (label 1), then keep every non-None result
# in the order the downloads complete.
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    jobs = {pool.submit(fetch_lightcurve, host, 1): host for host in planet_hosts}
    progress = tqdm(as_completed(jobs), total=len(jobs), desc="Planet hosts")
    outcomes = (job.result() for job in progress)
    data.extend(record for record in outcomes if record is not None)

print(f"Successfully fetched {len(data)} planet host light curves")

In [None]:
# Step 8: Fetch non-planet stars (negative examples)
print(f"Fetching {NUM_NON_PLANETS} non-planet stars...")

# NOTE(review): these are arbitrary sequential TIC IDs, not a vetted planet-free list.
# They are treated as negatives, but some may host planets; confirm this is acceptable.
#
# Many candidate IDs yield no usable light curve, so request several candidates per
# wanted sample. (The previous list held only NUM_NON_PLANETS IDs, which made the
# intended 3x oversampling slice a no-op.)
NEGATIVE_OVERSAMPLE = 3
non_planet_tics = [
    f"TIC {i}"
    for i in range(100000000, 100000000 + NUM_NON_PLANETS * NEGATIVE_OVERSAMPLE * 10, 10)
]

non_planet_data = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(fetch_lightcurve, tic, 0): tic for tic in non_planet_tics}

    for future in tqdm(as_completed(futures), total=len(futures), desc="Non-planet stars"):
        result = future.result()
        if result is not None:
            non_planet_data.append(result)
        if len(non_planet_data) >= NUM_NON_PLANETS:
            # Cancel downloads that have not started yet. Without this, leaving the
            # `with` block still waits for every queued download to finish.
            for pending in futures:
                pending.cancel()
            break

# Rebuild `data` from the positive samples plus the negatives fetched above, instead
# of extending it in place, so re-running this cell does not append duplicates.
data = [d for d in data if d['label'] == 1] + non_planet_data[:NUM_NON_PLANETS]
print(f"Total samples: {len(data)}")

In [None]:
# Step 9: Prepare training data
import numpy as np
from sklearn.model_selection import train_test_split

# The fetch steps silently drop targets that fail, so the dataset can end up empty
# or missing a class. Fail here with a clear message instead of an opaque reshape
# error or a model trained on a single class.
if not data:
    raise ValueError("No light curves were fetched; re-run the data fetching cells.")

X = np.array([d['flux'] for d in data])
y = np.array([d['label'] for d in data])

class_counts = np.bincount(y, minlength=2)
if class_counts.min() < 2:
    raise ValueError(
        f"Need at least 2 samples per class for a stratified split, got class counts {class_counts}"
    )

# Reshape for CNN: (samples, INPUT_SIZE, 1 channel)
X = X.reshape(-1, INPUT_SIZE, 1).astype(np.float32)

# Split data (80/20, stratified so both splits keep the class ratio)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Class distribution: {np.bincount(y_train)}")

In [None]:
# Step 10: Build the LARUN TinyML Model
from tensorflow import keras
from tensorflow.keras import layers

def build_larun_model(input_shape, num_classes=2):
    """Assemble the LARUN TinyML 1-D CNN classifier.

    The network is three Conv1D + BatchNormalization stages (32, 64 and 128
    filters with kernel sizes 7, 5 and 3). The first two stages are followed by
    MaxPooling1D(4), the last by GlobalAveragePooling1D; each stage ends with
    dropout. A Dense(64) layer with dropout and a softmax output follow.

    Args:
        input_shape: Shape passed to `keras.Input`.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled `keras.Sequential` model named 'larun_tinyml'.
    """
    # (filters, kernel size, dropout rate) for each convolutional stage, in order.
    conv_stages = [(32, 7, 0.25), (64, 5, 0.25), (128, 3, 0.5)]
    final_stage = len(conv_stages) - 1

    stack = [keras.Input(shape=input_shape)]
    for stage_idx, (filters, kernel_size, drop_rate) in enumerate(conv_stages):
        stack.append(layers.Conv1D(filters, kernel_size, padding='same', activation='relu'))
        stack.append(layers.BatchNormalization())
        if stage_idx == final_stage:
            # The last stage collapses the time axis instead of downsampling it.
            stack.append(layers.GlobalAveragePooling1D())
        else:
            stack.append(layers.MaxPooling1D(4))
        stack.append(layers.Dropout(drop_rate))

    # Classification head
    stack.append(layers.Dense(64, activation='relu'))
    stack.append(layers.Dropout(0.3))
    stack.append(layers.Dense(num_classes, activation='softmax'))

    return keras.Sequential(stack, name='larun_tinyml')

# Build the two-class model for single-channel light curves of length INPUT_SIZE
# and print its layer summary.
model = build_larun_model(input_shape=(INPUT_SIZE, 1), num_classes=2)
model.summary()

In [None]:
# Step 11: Compile and train
adam = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training callbacks: stop early and restore the best weights, halve the learning
# rate on plateaus, and checkpoint the best model to disk.
early_stopping = keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True)
reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6)
checkpoint = keras.callbacks.ModelCheckpoint('larun_best.h5', save_best_only=True)
callbacks = [early_stopping, reduce_lr, checkpoint]

print("Starting training...")
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Step 12: Evaluate and plot results
import matplotlib.pyplot as plt

# Score the model on the validation split
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
print(f"\nValidation Accuracy: {val_acc*100:.2f}%")
print(f"Validation Loss: {val_loss:.4f}")

# One panel per metric: (history key, panel title, y-axis label)
panels = (
    ('accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'Model Loss', 'Loss'),
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (metric, title, ylabel) in zip(axes, panels):
    ax.plot(history.history[metric], label='Train')
    ax.plot(history.history['val_' + metric], label='Validation')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()

plt.tight_layout()
plt.savefig('training_history.png', dpi=150)
plt.show()

In [None]:
# Step 13: Export to TFLite (for edge deployment)
import tensorflow as tf

# Standard TFLite: convert the trained Keras model and write it out
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
with open('larun_model.tflite', 'wb') as standard_file:
    standard_file.write(tflite_model)

# Quantized TFLite (smaller, faster): reuse the same converter with default
# optimizations and float16 as the supported type, then convert again
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]
tflite_quant = converter.convert()
with open('larun_model_quant.tflite', 'wb') as quant_file:
    quant_file.write(tflite_quant)

# Size report
param_count = model.count_params()
standard_kb = len(tflite_model) / 1024
quant_kb = len(tflite_quant) / 1024
print(f"Keras model: {param_count} parameters")
print(f"TFLite model: {standard_kb:.2f} KB")
print(f"Quantized TFLite: {quant_kb:.2f} KB")

In [None]:
# Step 14: Save everything for download
import os
os.makedirs('larun_trained', exist_ok=True)

# Save model
model.save('larun_trained/larun_model.h5')

# Save TFLite models
with open('larun_trained/larun_model.tflite', 'wb') as f:
    f.write(tflite_model)
with open('larun_trained/larun_model_quant.tflite', 'wb') as f:
    f.write(tflite_quant)

# Save training data
np.savez('larun_trained/training_data.npz', X=X, y=y)

# Save training history
import json
with open('larun_trained/history.json', 'w') as f:
    json.dump({k: [float(v) for v in vals] for k, vals in history.history.items()}, f)

# Zip for download
!zip -r larun_trained.zip larun_trained/

print("\n" + "="*50)
print("TRAINING COMPLETE!")
print("="*50)
print(f"Validation Accuracy: {val_acc*100:.2f}%")
print(f"Model saved to: larun_trained.zip")
print("\nDownload the zip file to use with LARUN CLI")

In [None]:
# Step 15: Download the trained model
# Uses the Colab-only `google.colab.files` helper, so this cell only works inside Colab.
from google.colab import files

trained_archive = 'larun_trained.zip'
files.download(trained_archive)