# BioArt Data Exploration

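This notebook walks through exploratory analysis for the BioArt pipeline.
Using synthetic demo DNA sequences together with placeholder haplogroup and microbe features, it shows how biological data is turned into artistic features: color palettes, text prompts, and composition plans.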

**Important**: This notebook is for computational and creative purposes only. It performs no biological function prediction and no sequence optimization.

In [None]:
# Setup and imports
import sys
from pathlib import Path

# Add src to path
def find_src_dir(start=None):
    """Locate the project's ``src`` directory next to or one level above ``start``.

    ``Path().parent`` evaluates to ``Path('.')``, so the previous one-liner
    only ever added the cwd-relative ``'src'`` and silently missed the common
    layout where the notebook lives in a subfolder beside ``src``.

    Args:
        start: Directory to search from. Defaults to the current working
            directory.

    Returns:
        Absolute ``Path`` of the first existing ``src`` directory found in
        ``start`` or its parent, or ``None`` when neither contains one.
    """
    base = (Path.cwd() if start is None else Path(start)).resolve()
    for candidate in (base, base.parent):
        src_dir = candidate / 'src'
        if src_dir.is_dir():
            return src_dir
    return None


_src_dir = find_src_dir()
# Fall back to the original relative entry so behavior is unchanged when no
# ``src`` directory can be located.
_src_entry = str(_src_dir) if _src_dir is not None else 'src'
# Guard against stacking duplicate entries when the cell is re-run.
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# BioArt modules
from data.preprocessing import SequenceProcessor, ImageProcessor, MetadataProcessor
from art.palettes import DNAPaletteGenerator
from art.prompts import BiologicalPromptGenerator
from utils.visualization import plot_gc_distribution, visualize_kmer_spectrum

# Banner: name the notebook and restate its usage restriction.
banner_lines = (
    "🧬 BioArt Data Exploration Notebook",
    "Educational/artistic use only - no biological function",
)
print("\n".join(banner_lines))

## 1. DNA Sequence Analysis

In [None]:
# Processor configured for k-mer sizes 3 through 6.
seq_processor = SequenceProcessor(kmer_sizes=list(range(3, 7)))

# Short motifs; each one is tiled 50 times to form a demo sequence.
demo_motifs = (
    "ATCGATCGATCGATCG",  # Repetitive
    "GCGCGCGCGCGCGCGC",  # GC-rich
    # NOTE(review): 15 bases with a "TT" pair, unlike the 16-base motifs
    # around it -- possibly a typo for "ATATATATATATATAT"; confirm.
    "ATATATATATATTAT",   # AT-rich
    "ACGTACGTACGTACGT",  # Balanced
)
demo_sequences = [motif * 50 for motif in demo_motifs]

print(f"Analyzing {len(demo_sequences)} demo sequences...")

# Run feature extraction over all demo sequences in one call.
features = seq_processor.extract_dna_features(demo_sequences)

# Report headline numbers from the extracted features.
print("\nSequence Analysis Results:")
total_sequences = features['metadata']['total_sequences']
print(f"Total sequences: {total_sequences}")
gc_low = features['gc_content_distribution'].min()
gc_high = features['gc_content_distribution'].max()
print(f"GC content range: {gc_low:.3f} - {gc_high:.3f}")
mean_complexity = features['complexity_scores'].mean()
print(f"Average complexity: {mean_complexity:.3f}")

In [None]:
# Plot the GC content values extracted from the demo sequences.
gc_plot_title = "Demo Sequences GC Content"
gc_plot_values = features['gc_content_distribution']
plot_gc_distribution(gc_plot_values, gc_plot_title)

In [None]:
# Show the 15 top entries of the 4-mer spectrum.
spectrum_k = 4
kmer_4_counts = features['kmer_spectra'][spectrum_k]
visualize_kmer_spectrum(kmer_4_counts, k=spectrum_k, top_n=15)

## 2. Color Palette Generation

In [None]:
# Initialize palette generator
palette_gen = DNAPaletteGenerator()

# Generate palettes for different GC contents
gc_values = [0.25, 0.50, 0.75]  # AT-rich, balanced, GC-rich
palette_names = ['AT-rich', 'Balanced', 'GC-rich']

for gc, name in zip(gc_values, palette_names):
    palette = palette_gen.gc_palette.generate_palette(gc, num_colors=6)
    print(f"\n{name} (GC={gc:.1%}) palette:")
    print(f"Colors: {palette[:3]}...")  # Show first 3 colors

    # Visualization is best-effort: a failure must not stop the loop, but the
    # real cause is reported instead of always blaming matplotlib. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    try:
        palette_gen.visualize_palette(palette, f"{name} DNA Palette")
    except Exception as exc:
        print(f"Visualization not available ({type(exc).__name__}: {exc})")

## 3. Biological Prompt Generation

In [None]:
# Build the prompt generator used by the prompt cells.
prompt_gen = BiologicalPromptGenerator()

# Every sequence gets one prompt per style, all about the same subject.
styles = ['scientific', 'artistic', 'organic']
prompt_subject = 'cellular structure'

for seq_number, gc in enumerate(features['gc_content_distribution'], start=1):
    print(f"\nSequence {seq_number} (GC={gc:.1%}):")

    for style in styles:
        prompt = prompt_gen.generate_from_gc_content(gc, style, prompt_subject)
        print(f"  {style.capitalize()}: {prompt}")

In [None]:
# Generate comprehensive prompts
print("\nComprehensive Prompts:")
print("=" * 50)

# Upper bound on how many sequences get a comprehensive prompt.
MAX_COMPREHENSIVE_PROMPTS = 3

gc_distribution = features['gc_content_distribution']
kmer_spectrum = features['kmer_spectra'][4]

# Sample the first five 4-mers and keep the values stored for them in the
# spectrum. Building a Counter from the bare keys would set every count to 1.
# NOTE(review): assumes the spectrum maps k-mer -> count -- confirm against
# SequenceProcessor.extract_dna_features.
sampled_kmers = list(kmer_spectrum.keys())[:5]
kmer_counts = Counter({kmer: kmer_spectrum[kmer] for kmer in sampled_kmers})

# Clamp to the number of available sequences so a smaller demo set does not
# raise IndexError.
for i in range(min(MAX_COMPREHENSIVE_PROMPTS, len(gc_distribution))):
    gc = gc_distribution[i]

    comprehensive_prompt = prompt_gen.generate_comprehensive_prompt(
        gc_content=gc,
        kmer_counts=kmer_counts,
        haplogroup='R1a',
        style='scientific'
    )

    print(f"\nPrompt {i+1}: {comprehensive_prompt}")

## 4. Multi-modal Feature Integration

In [None]:
# Walk the demo features through the composition stage of the pipeline.
from art.composition import BioArtComposer

composer = BioArtComposer()

# DNA-derived inputs come from the extracted features.
dna_features = {
    'gc_content_mean': features['gc_content_distribution'].mean(),
    'kmer_counts': features['kmer_spectra'][4],
    'complexity_mean': features['complexity_scores'].mean(),
}

# Haplogroup and microbe inputs are hard-coded placeholder values.
haplogroup_data = {
    'primary_haplogroup': 'R1a',
    'distribution': {'R1a': 60, 'R1b': 30, 'I1': 10},
}
microbe_features = {
    'type': 'bacteria',
    'dominant_colors': [(120, 80, 200), (200, 150, 100)],
    'texture_energy': 45.0,
}

# Bundle the three feature groups into the package the composer consumes.
biological_data = {
    'dna_features': dna_features,
    'haplogroup_data': haplogroup_data,
    'microbe_features': microbe_features,
}

print("Creating composition plan...")
composition_plan = composer.create_composition_plan(biological_data)

print("\nComposition Plan Summary:")
palette_plan = composition_plan['palette_plan']
print(f"Color scheme: {palette_plan['color_scheme']}")
layout_plan = composition_plan['layout_plan']
layout_fields = (
    ("Layout", 'composition_type'),
    ("Grid size", 'grid_size'),
    ("Symmetry", 'symmetry'),
)
for label, layout_key in layout_fields:
    print(f"{label}: {layout_plan[layout_key]}")

In [None]:
# Turn the composition plan into a concrete art specification.
print("Generating art specification...")
art_spec = composer.generate_art_specification(composition_plan)

print("\nArt Specification:")
spec_palettes = art_spec['palettes']
print(f"Palette types: {list(spec_palettes.keys())}")
spec_prompts = art_spec['prompts']
print(f"Number of prompts: {len(spec_prompts)}")

# Preview at most the first two prompts.
print("\nExample prompts:")
for number, prompt in enumerate(spec_prompts[:2], start=1):
    print(f"  {number}: {prompt}")

# The combined palette is optional; print a preview only when it is non-empty.
print("\nExample palette (combined):")
combined_palette = spec_palettes.get('combined', [])
if combined_palette:
    print(f"  Colors: {combined_palette[:3]}...")  # Show first 3

## 5. Summary and Next Steps

This notebook demonstrated:

1. **DNA Sequence Analysis**: Extracting GC content, k-mer spectra, and complexity scores for artistic mapping
2. **Color Palette Generation**: Creating biologically-informed color schemes
3. **Prompt Generation**: Converting biological features into descriptive text for generative models
4. **Multi-modal Integration**: Combining genomic, visual, and metadata features

### Next Steps:

1. **Download Real Data**: Use `python scripts/download_data.py --all` to get Kaggle datasets
2. **Process Data**: Run preprocessing on real biological datasets
3. **Generate Art**: Use `python scripts/generate_art.py` to create bioart
4. **Experiment**: Try different artistic parameters and mappings

### Ethical Reminder:

- This is for **educational and artistic purposes only**
- No biological function prediction or sequence optimization
- Use only aggregated, non-identifiable features
- Respect all dataset licenses and terms of use

In [None]:
# Closing demo: run the simulated generation step on the art specification.
print("🎨 Final Pipeline Simulation")
print("=" * 40)

simulation_results = composer.simulate_generation(art_spec)

print("\nSimulation Results:")
run_status = simulation_results['status']
print(f"Status: {run_status}")
piece_count = len(simulation_results['generated_pieces'])
print(f"Generated pieces: {piece_count}")

# List every entry of the processing summary, one per line.
print("Processing summary:")
for field, value in simulation_results['processing_summary'].items():
    print(f"  {field}: {value}")

print("\n✅ Data exploration completed!")
print("Ready to move to the next phase of bioart generation.")