# Phase 2: DL Birefringence Prediction from POM Images

**One-click training pipeline for Google Colab (Free GPU)**

This notebook trains a hybrid CNN model to predict birefringence (Δn) from:
- Polarized optical microscopy (POM) images
- Scalar inputs: temperature (°C), thickness (µm), and order

## Setup

In [None]:
# 1. Mount Google Drive (upload your project folder there)
# The Drive contents are mounted under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Set your project path (update this to match your Drive folder)
# A later cell changes the working directory to PROJECT_ROOT, and the
# relative paths used in the rest of the notebook are resolved from there.
PROJECT_ROOT = '/content/drive/MyDrive/phase2_birefringence'

# Or clone from GitHub:
# !git clone https://github.com/YOUR_USERNAME/phase2_birefringence.git
# PROJECT_ROOT = '/content/phase2_birefringence'

In [None]:
# 2. Install dependencies
!pip install -q torch torchvision timm xgboost scikit-learn openpyxl pillow pandas numpy matplotlib seaborn tqdm

# Verify GPU
import torch
print(f'PyTorch: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')

In [None]:
import os
os.chdir(PROJECT_ROOT)
import sys
sys.path.insert(0, 'src')

# Check project structure
!ls -la
!ls -la data/ 2>/dev/null || echo 'data/ not found yet'
!ls -la src/

## Step 1: Prepare Dataset

In [None]:
# If your dataset zip is in the project folder:
# -o overwrites already-extracted files without prompting. unzip's stderr is
# discarded, so any failure (missing or unreadable archive) only shows the
# fallback message printed by echo.
!unzip -o dataset_physics_SOP_liquid_crystal.zip -d dataset/ 2>/dev/null || echo 'Dataset already extracted or not found'

# Prepare unified CSV
# Runs src/prepare_dataset.py on the extracted folder, passing a thickness and
# an order value for each of the --p100-* and --pch-* flag groups, and writes
# the result to data/dataset_unified.csv (read by the following cells).
# NOTE(review): thickness is presumably in µm (the CSV column is thickness_um) - confirm.
!python src/prepare_dataset.py \
    --data-root dataset/dataset_physics_SOP_liquid_crystal \
    --p100-thickness 9.415 \
    --p100-order 1 \
    --pch-thickness 10.0 \
    --pch-order 1 \
    --output data/dataset_unified.csv

In [None]:
# Load the unified CSV and summarise it per material
import pandas as pd

df = pd.read_csv('data/dataset_unified.csv')
print(f'Total samples: {len(df)}')

# Per-material temperature range / sample count and delta_n range / mean
summary_spec = {
    'temperature_C': ['min', 'max', 'count'],
    'delta_n': ['min', 'max', 'mean'],
}
per_material = df.groupby('material').agg(summary_spec)
print(per_material)

# Last expression of the cell: rendered as a table by the notebook
df.head(10)

In [None]:
# Visualize delta_n vs temperature, one panel per material
import matplotlib.pyplot as plt

materials = df['material'].unique()

# Size the grid from the data instead of hard-coding two panels: a fixed 1x2
# grid raises IndexError as soon as a third material appears and leaves an
# empty panel when there is only one. max(..., 1) keeps plt.subplots valid
# even if the frame is empty.
n_panels = max(len(materials), 1)

# squeeze=False always returns a 2-D array of axes, so the single-panel case
# is indexed the same way as the multi-panel case. 7 inches per panel keeps
# the original 14x5 figure for two materials.
fig, axes = plt.subplots(1, n_panels, figsize=(7 * n_panels, 5), squeeze=False)

for ax, mat in zip(axes[0], materials):
    sub = df[df['material'] == mat].sort_values('temperature_C')
    ax.plot(sub['temperature_C'], sub['delta_n'], 'o-', markersize=6)
    ax.set_xlabel('Temperature (°C)')
    ax.set_ylabel('Δn')
    ax.set_title(mat)
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Step 2: Run Baseline (XGBoost + Color Features)

In [None]:
# Run the baseline script on the unified CSV produced in Step 1.
# NOTE(review): the comparison cell in Step 4 reads results/baseline_results_*.json,
# presumably written by this script - confirm against src/baseline_xgboost.py.
!python src/baseline_xgboost.py --csv data/dataset_unified.csv

## Step 3: Train DL Model (Hybrid CNN)

This step uses a pretrained EfficientNet-B0 backbone combined with scalar-feature fusion.

In [None]:
# 5-Fold Cross-Validation (recommended first)
# Trains src/train.py on the unified CSV with the efficientnet_b0 backbone.
# NOTE(review): --save-final presumably writes the checkpoint under models/
# that Step 5 loads - confirm against src/train.py.
!python src/train.py \
    --csv data/dataset_unified.csv \
    --backbone efficientnet_b0 \
    --epochs 60 \
    --warmup-epochs 10 \
    --batch-size 16 \
    --lr 1e-3 \
    --n-patches 25 \
    --cv-mode kfold \
    --n-folds 5 \
    --save-final

In [None]:
# (Optional) LOOCV for more rigorous evaluation
# This takes longer but uses every image for validation once
# To run it, uncomment the command below (remove the leading "# " from each line).
# NOTE(review): flags not listed here (e.g. --batch-size, --lr) presumably fall
# back to the defaults defined in src/train.py - confirm.
# !python src/train.py \
#     --csv data/dataset_unified.csv \
#     --backbone efficientnet_b0 \
#     --epochs 60 \
#     --cv-mode loocv \
#     --save-final

## Step 4: Visualize Results

In [None]:
import glob

# Find latest results file
dl_results = sorted(glob.glob('results/cv_results_*.json'))[-1]
print(f'Latest DL results: {dl_results}')

!python src/visualize.py --results {dl_results} --type dl

# Display plots
from IPython.display import Image as IPImage, display
for plot in sorted(glob.glob('results/plots/*.png')):
    print(f'\n{plot}:')
    display(IPImage(plot, width=600))

In [None]:
# Compare baseline vs DL
# Relies on `glob` and `dl_results` from the previous visualization cell.
import json

# Pick the last baseline results file by name (lexicographic order; assumes the
# part of the filename matched by * sorts chronologically).
baseline_result_files = sorted(glob.glob('results/baseline_results_*.json'))
if not baseline_result_files:
    # Fail with an actionable message instead of a bare IndexError from [-1].
    raise FileNotFoundError(
        'No results/baseline_results_*.json found - run Step 2 (baseline) first.'
    )
baseline_results = baseline_result_files[-1]

# JSON is UTF-8 by specification; be explicit rather than rely on the locale.
with open(baseline_results, encoding='utf-8') as f:
    bl = json.load(f)
with open(dl_results, encoding='utf-8') as f:
    dl = json.load(f)

# Side-by-side table: baseline metrics vs. the overall CNN metrics
print('='*50)
print('MODEL COMPARISON')
print('='*50)
print(f'{"Metric":<12} {"XGBoost":>12} {"CNN":>12}')
print('-'*36)
print(f'{"MAE":<12} {bl["mae"]:>12.6f} {dl["overall_mae"]:>12.6f}')
print(f'{"RMSE":<12} {bl["rmse"]:>12.6f} {dl["overall_rmse"]:>12.6f}')
print(f'{"R²":<12} {bl["r2"]:>12.4f} {dl["overall_r2"]:>12.4f}')

## Step 5: Predict on New Images

In [None]:
# Predict birefringence for a new POM image
# The image path is single-quoted because the folder name contains spaces and '+'.
# --thickness and --order repeat the --p100-* values passed to prepare_dataset.py in Step 1.
# NOTE(review): the checkpoint under models/ is presumably the one written by
# train.py --save-final in Step 3 - confirm.
!python src/predict.py \
    --image 'dataset/dataset_physics_SOP_liquid_crystal/P100-7CB+ Str +t3318/35.jpg' \
    --temperature 35.0 \
    --thickness 9.415 \
    --order 1 \
    --model models/birefringence_model_efficientnet_b0.pth

In [None]:
# Sanity check: run the saved model over every row of the dataset and print
# the prediction next to the ground-truth delta_n.
from src.predict import load_model, predict_birefringence
import torch

# Prefer the GPU when one is attached, otherwise fall back to CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = load_model('models/birefringence_model_efficientnet_b0.pth', device)

for _, sample in df.iterrows():
    # Only the first two of the four returned values are used here
    mean_dn, std_dn, _unused_a, _unused_b = predict_birefringence(
        model,
        sample['image_path'],
        sample['temperature_C'],
        sample['thickness_um'],
        sample['order'],
        device,
        n_patches=10,
    )
    abs_err = abs(mean_dn - sample['delta_n'])
    report = (
        f"T={sample['temperature_C']:5.1f}°C | True={sample['delta_n']:.5f} | "
        f"Pred={mean_dn:.5f} ± {std_dn:.5f} | Err={abs_err:.5f} | {sample['material']}"
    )
    print(report)