# Soil Bioinformatics ML - Fused Model Demo

This notebook demonstrates how to combine sensor data with nanopore sequencing data using our fused model approach.

In [None]:
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from data_pipeline.sensor_simulator import SensorSimulator
from preprocessing.data_processor import DataProcessor
from training.fused_model import FusedModelTrainer

## Generate Simulated Data

For this demo, we generate sensor readings with the sensor simulator and synthesize random nanopore signals to pair with them.

In [None]:
# Initialize simulators
# The same YAML config path is reused by the later cells that build the
# DataProcessor and the FusedModelTrainer.
config_path = "../configs/simulation_config.yaml"
sensor_simulator = SensorSimulator(config_path)

# Generate sensor data
# Requests 24 hours of simulated readings. Later cells call len() on the
# result and index it by column name ('timestamp', 'co2_ppm',
# 'moisture_percent', 'ph'), so it is presumably a pandas DataFrame -- confirm.
sensor_data = sensor_simulator.generate_sensor_data(duration_hours=24)

# Simulate nanopore data (random sequences for demonstration)
def simulate_nanopore_data(num_samples, sequence_length=4000, pattern_length=100):
    """Create synthetic nanopore-like signals for demonstration purposes.

    Each sample is standard-normal noise with one full sine cycle added at a
    randomly chosen position, so every signal contains a small structured
    motif.

    Args:
        num_samples: Number of signals to generate.
        sequence_length: Number of points in each signal.
        pattern_length: Number of points covered by the sine motif. When the
            signal is shorter than this, the motif is shortened to span the
            whole signal instead of raising an error.

    Returns:
        A float array of shape ``(num_samples, 1, sequence_length)``.
    """
    # Baseline: independent N(0, 1) noise for every point of every sample.
    data = np.random.normal(0, 1, (num_samples, 1, sequence_length))

    # Clamp the motif so it always fits inside the signal.
    span = min(pattern_length, sequence_length)
    if span <= 0:
        return data

    # One full sine period; identical for every sample, so build it once.
    pattern = np.sin(np.linspace(0, 2 * np.pi, span))

    # randint's upper bound is exclusive, hence the +1: the motif may start
    # anywhere from 0 up to and including the last position where it fits.
    starts = np.random.randint(0, sequence_length - span + 1, size=num_samples)
    for i, pos in enumerate(starts):
        data[i, 0, pos:pos + span] += pattern
    return data

# Generate one synthetic nanopore signal per sensor reading so the two
# modalities can be paired index-by-index in later cells.
nanopore_data = simulate_nanopore_data(len(sensor_data))

## Visualize Both Data Types

In [None]:
# One figure with two stacked panels: sensors on top, nanopore signal below.
plt.figure(figsize=(15, 10))

# Top panel: the three sensor channels drawn over time on a shared axis.
plt.subplot(2, 1, 1)
for column, label in (
    ('co2_ppm', 'CO2'),
    ('moisture_percent', 'Moisture'),
    ('ph', 'pH'),
):
    plt.plot(sensor_data['timestamp'], sensor_data[column], label=label)
plt.title('Sensor Data')
plt.legend()

# Bottom panel: the opening stretch of the first synthetic nanopore signal.
plt.subplot(2, 1, 2)
first_signal = nanopore_data[0, 0]
plt.plot(first_signal[:1000])
plt.title('Example Nanopore Signal (first 1000 points)')

plt.tight_layout()
plt.show()

## Prepare Data for Fused Model

In [None]:
# Turn the raw sensor CSV into model-ready sequences and their labels.
# NOTE(review): this reads a CSV from disk rather than the in-memory
# `sensor_data` generated earlier; presumably the simulator writes this file
# -- confirm before relying on it.
processor = DataProcessor(config_path)
raw_sensor_csv = "../data/raw/sensor_data.csv"
sensor_sequences, sensor_labels = processor.process_sensor_data(raw_sensor_csv)

# Keep exactly one nanopore signal per sensor sequence.
sequence_count = len(sensor_sequences)
nanopore_sequences = nanopore_data[:sequence_count]

print(f"Prepared {len(sensor_sequences)} sequences for training")

## Train Fused Model

In [None]:
# Initialize the fused trainer from the shared config and fit it on the
# paired sensor sequences and nanopore signals for 50 epochs.
fused_trainer = FusedModelTrainer(config_path)
fused_trainer.train(
    time_series_data=sensor_sequences,
    nanopore_data=nanopore_sequences,
    labels=sensor_labels,
    epochs=50
)

# Save the trained model
# Create the target directory first so saving does not fail on a fresh
# checkout where ../models/fused_model/ does not exist yet.
model_path = "../models/fused_model/fused_model.pth"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
fused_trainer.save_model(model_path)

## Make Predictions with Fused Model

In [None]:
# Generate test data: two hours of fresh sensor readings plus one synthetic
# nanopore signal per reading.
test_sensor_data = sensor_simulator.generate_sensor_data(duration_hours=2)
test_nanopore_data = simulate_nanopore_data(len(test_sensor_data))

# Process test data
# NOTE(review): this reads a CSV from disk rather than the in-memory
# `test_sensor_data`; presumably the simulator writes this file -- confirm.
test_sensor_sequences, _ = processor.process_sensor_data("../data/raw/test_sensor_data.csv")
test_nanopore_sequences = test_nanopore_data[:len(test_sensor_sequences)]

# Make predictions one sequence at a time. The i:i+1 slices pass a
# length-1 slice rather than a bare element (presumably predict() expects a
# batch axis -- confirm).
predictions = []
for i in range(len(test_sensor_sequences)):
    pred = fused_trainer.predict(
        test_sensor_sequences[i:i+1],
        test_nanopore_sequences[i:i+1]
    )
    # Map the model output back through the processor's CO2 inverse
    # transform and keep the first value of the first row.
    predictions.append(processor.inverse_transform_co2(pred)[0][0])

# Plot results
# Derive the timestamp offset from the number of predictions instead of
# hard-coding a window length of 12, so the x and y series always have the
# same length even if the configured sequence length changes.
test_timestamps = test_sensor_data['timestamp']
prediction_offset = max(len(test_timestamps) - len(predictions), 0)
prediction_timestamps = test_timestamps[prediction_offset:]

plt.figure(figsize=(10, 6))
plt.plot(
    prediction_timestamps,
    predictions[:len(prediction_timestamps)],
    label='Predicted CO2'
)
plt.plot(test_timestamps, test_sensor_data['co2_ppm'], label='Actual CO2')
plt.title('Fused Model: CO2 Predictions vs Actual Values')
plt.xlabel('Time')
plt.ylabel('CO2 (ppm)')
plt.legend()
plt.show()