In [1]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

print(project_root)

/workspace/dna-neural-network-coi-


In [2]:
import torch
print(f"¬øGPU disponible? {torch.cuda.is_available()}")
print(f"N√∫mero de GPUs: {torch.cuda.device_count()}")
print(f"Nombre GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}")

¬øGPU disponible? False
N√∫mero de GPUs: 0
Nombre GPU: No GPU


  return torch._C._cuda_getDeviceCount() > 0


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
import json

# Tus imports
from src.combined_model.combined_model_embedding import *
from src.combined_model.combined_models import *
from src.decoders.decoder_simple import *
from src.encoders_model.DNABERT_Embedder import *
from src.encoders_model.embdeeding_encoders import *
from src.encoders_model.simple_encoders import *
from src.evaluators.linear_evaluator import *
from src.decoders.sequence_decoder import *


In [4]:
from src.encoders_model.simple_encoders import *
from src.utils.load_fastaDataset import *
from src.training.experimentRunner import *
from torch.utils.data import DataLoader

In [5]:
def load_hierarchy_from_json(json_path):
    """Carga la jerarqu√≠a taxon√≥mica desde JSON"""
    with open(json_path, 'r') as f:
        hierarchy_raw = json.load(f)
    
    # Convertir claves string a int
    hierarchy = {}
    for child_taxon, parent_dict in hierarchy_raw.items():
        hierarchy[child_taxon] = {}
        for parent_key, children_list in parent_dict.items():
            parent_int = int(float(parent_key))
            children_int = [int(c) for c in children_list]
            hierarchy[child_taxon][parent_int] = children_int
    
    print("‚úÖ Jerarqu√≠a cargada desde JSON")
    for taxon, mapping in hierarchy.items():
        n_parents = len(mapping)
        n_children = sum(len(v) for v in mapping.values())
        print(f"  {taxon:10s}: {n_parents:4d} padres ‚Üí {n_children:5d} hijos")
    
    return hierarchy

In [6]:
hierarchy_path = os.path.join(project_root, "src", "data", "taxonomy_hierarchy_fixed_with_class.json")
hierarchy = load_hierarchy_from_json(hierarchy_path)

‚úÖ Jerarqu√≠a cargada desde JSON
  class     :   49 padres ‚Üí   187 hijos
  order     :  173 padres ‚Üí   831 hijos
  family    :  797 padres ‚Üí  5446 hijos
  genus     : 5393 padres ‚Üí 50568 hijos
  species   : 50510 padres ‚Üí 205075 hijos


In [7]:
df = os.path.join(project_root, "src", "data", "all_taxa_numeric.csv")
df = pd.read_csv(df)

In [8]:
taxon_order = ['phylum', 'class','order', 'family', 'genus', 'species']
total_classes = {}
for taxon in taxon_order:
    n_classes = df[taxon].nunique()
    total_classes[taxon] = n_classes
    print(f"  {taxon:10s}: {n_classes:6d} clases")

  phylum    :     49 clases
  class     :    173 clases
  order     :    797 clases
  family    :   5393 clases
  genus     :  50510 clases
  species   : 205075 clases


In [9]:
from sklearn.model_selection import train_test_split

# Primero separar test (20%)
df_temp, df_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['phylum']
)

# Luego separar train/val (80/20 del 80% restante = 64/16 del total)
df_train, df_val = train_test_split(
    df_temp, test_size=0.2, random_state=42, stratify=df_temp['phylum']
)

In [10]:
max_length = 750

train_dataset = MultiTaxaFastaDataset(
    df_train.reset_index(drop=True), 
    max_length=max_length,
    taxon_cols=taxon_order
)

val_dataset = MultiTaxaFastaDataset(
    df_val.reset_index(drop=True),
    max_length=max_length,
    taxon_cols=taxon_order
)

test_dataset = MultiTaxaFastaDataset(
    df_test.reset_index(drop=True),
    max_length=max_length,
    taxon_cols=taxon_order
)

In [11]:
def collate_multitask(batch, taxon_cols=['phylum', 'class','order','family','genus','species'], max_length=900):
    sequences, labels_dict_list, recon_targets_list, true_tokens_list = zip(*batch)

    # Labels: dict de tensors
    labels_dict = {taxon: torch.stack([d[taxon] for d in labels_dict_list]) for taxon in taxon_cols}

    # Recon targets: dict de tensors
    recon_targets_dict = {taxon: torch.stack([d[taxon] for d in recon_targets_list]) for taxon in taxon_cols}

    # True tokens
    true_tokens = torch.stack(true_tokens_list)

    return sequences, labels_dict, recon_targets_dict, true_tokens


In [12]:


train_loader = DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
    drop_last=True,
    collate_fn=lambda b: collate_multitask(b, taxon_cols=val_dataset.taxon_cols, max_length=val_dataset.max_length),
    num_workers=8
)



val_loader = DataLoader(
    val_dataset,
    batch_size=256,
    shuffle=False,
    drop_last=True,
    collate_fn=lambda b: collate_multitask(b, taxon_cols=val_dataset.taxon_cols, max_length=val_dataset.max_length),
    num_workers=8
)

test_loader  = DataLoader(
    test_dataset,
    batch_size=256,
    shuffle=False,
    drop_last=True,
    collate_fn=lambda b: collate_multitask(b, taxon_cols=val_dataset.taxon_cols, max_length=val_dataset.max_length),
    num_workers=8
)


In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üñ•Ô∏è  Dispositivo: {device}")

# DNABERT
dnabert_path = os.path.join(project_root, "src", "data", "archives")
dnabert = DNABERTEmbedder(
    model_name=dnabert_path,
    max_length=max_length,
    device=device
)
embed_dim = dnabert.get_embedding_dim()

# Encoder
latent_dim = 256
encoder = SimpleEmbeddingEncoder(
    embed_dim=embed_dim,
    latent_dim=latent_dim,
    dropout=0.1
)

# Decoders (uno por tax√≥n)
decoders_dict = {}
for taxon in taxon_order:
    decoders_dict[taxon] = SequenceDecoder(
        latent_dim=latent_dim,
        seq_len=max_length,
        vocab_size=4,
        dropout=0.1
    )

# Global decoder
global_decoder = SequenceDecoder(
    latent_dim=latent_dim,
    seq_len=max_length,
    vocab_size=4,
    dropout=0.1
)

# Classifiers (uno por tax√≥n)
classifiers_dict = {}
print(f"\nüîß Creando classifiers con n√∫mero TOTAL de clases:")
for taxon in taxon_order:
    # ‚úÖ CORRECTO: Usar total_classes (del dataset completo)
    # ‚ùå INCORRECTO: n_classes = df_train[taxon].nunique()
    n_classes = total_classes[taxon]
    
    classifiers_dict[taxon] = CosineClassifier(
        latent_dim=latent_dim,
        num_classes=n_classes,
        scale=20.0  # ‚≠ê Ajustable si es necesario
    )
    print(f"  {taxon:10s}: {n_classes:6d} clases")

# ‚≠ê CREAR MODELO JER√ÅRQUICO
model = HierarchicalCombinedModelFixed(
    dnabert=dnabert,
    encoder=encoder,
    decoders_dict=decoders_dict,
    classifiers_dict=classifiers_dict,
    global_decoder=global_decoder,
    taxonomy_hierarchy=hierarchy  # ‚≠ê Aqu√≠ usas la jerarqu√≠a
)

print(f"\n‚úÖ Modelo jer√°rquico creado")

Some weights of BertModel were not initialized from the model checkpoint at /workspace/dna-neural-network-coi-/src/data/archives and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üñ•Ô∏è  Dispositivo: cpu

üîß Creando classifiers con n√∫mero TOTAL de clases:
  phylum    :     49 clases
  class     :    173 clases
  order     :    797 clases
  family    :   5393 clases
  genus     :  50510 clases
  species   : 205075 clases
‚úÖ Modelo jer√°rquico creado con m√°scaras suaves (no -inf)

‚úÖ Modelo jer√°rquico creado


In [14]:
runner = ExperimentRunner(model, device=device)

TARGET_ACCURACY = 0.85  # 85% para avanzar al siguiente nivel
MAX_EPOCHS_PER_LEVEL = 2  # M√°ximo 3 √©pocas por nivel
TOTAL_EPOCHS = 12  # Total de √©pocas


curriculum_trainer = AdaptiveCurriculumTrainer(
    model=model,
    runner=runner,
    device=device,
    target_accuracy=TARGET_ACCURACY,
    max_epochs_per_level=MAX_EPOCHS_PER_LEVEL
)

‚Üí Modo MULTITASK ACTIVADO (clasificaci√≥n + reconstrucci√≥n)
üéì Curriculum Learning Adaptativo Inicializado
   Target accuracy: 85%
   Max epochs por nivel: 2


In [15]:
history = curriculum_trainer.train(
    train_loader=train_loader,
    val_loader=val_loader,
    total_epochs=8,  
    alpha=1.0,
    beta=1.0,
    lr=1e-3,
    mixed_precision=True,
    enable_early_stop=True  
)




üéì INICIANDO CURRICULUM LEARNING ADAPTATIVO
üéØ Threshold objetivo: 85.0%
‚è±Ô∏è  √âpocas m√°ximas por nivel: 2
üìä √âpocas m√°ximas totales: 8
üõë Early stopping: Activado

EPOCH 1/8
üìö Nivel actual: 1/6 - phylum
   √âpocas en este nivel: 1/2
   Pesos: {'phylum': 5.0, 'class': 1.0, 'order': 1.0, 'family': 1.0, 'genus': 1.0, 'species': 1.0}
üìä Pesos por tax√≥n: {'phylum': 5.0, 'class': 1.0, 'order': 1.0, 'family': 1.0, 'genus': 1.0, 'species': 1.0}
üöÄ Mixed Precision: True
üì¶ Batches por √©poca: 3861


[TRAIN] Epoch 1/1:   0%|                                                                                 | 0/3861 [00:00<?, ?it/s]


AssertionError: 

In [None]:
test_results = runner.evaluate_multitask(
    test_loader=test_loader,
    heads=taxon_order
)

In [None]:
print(f"\nüìä Resultados finales en test:")
print(f"{'='*70}")
for taxon in taxon_order:
    acc = test_results[taxon]['acc']
    f1 = test_results[taxon]['f1']
    print(f"{taxon:12s} ‚Üí Acc: {acc:6.2%} | F1: {f1:.4f}")
print(f"{'='*70}\n")

print("\n‚úÖ Script completado")

In [None]:
torch.save({
    'epoch': 10,  # √öltima √©poca completada
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'history': history,
    'config': {
        'latent_dim': 128,
        'max_length': 900,
        'taxon_cols': taxon_cols,
        'num_classes': {taxon: df[taxon].nunique() for taxon in taxon_cols}
    }
}, 'checkpoint_epoch10_completo.pt')

print("‚úÖ Checkpoint completo guardado")

In [None]:
# ===== GUARDAR MODELO COMPLETO =====
import torch
import pickle

# Despu√©s de terminar el entrenamiento
checkpoint = {
    'epoch': 10,  # o las √©pocas que completaste
    'model_state_dict': model.state_dict(),
    'history': history,
    'config': {
        'latent_dim': 128,
        'max_length': 900,
        'taxon_cols': taxon_cols,
        'num_classes': {taxon: df[taxon].nunique() for taxon in taxon_cols},
        'dnabert_path': DNABERT_MODEL
    },
    # Opcional: guardar mapeos label‚Üínombre
    'label_mappings': {
        taxon: {idx: name for idx, name in enumerate(sorted(df[taxon].unique()))}
        for taxon in taxon_cols
    }
}

torch.save(checkpoint, 'modelo_coi_10epochs.pt')
print("‚úÖ Modelo guardado en 'modelo_coi_10epochs.pt'")

# Tambi√©n guardar historial separado para an√°lisis
with open('training_history.pkl', 'wb') as f:
    pickle.dump(history, f)