In [6]:
# First, let's check PyTorch and CUDA compatibility
try:
    import torch
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU device: {torch.cuda.get_device_name(0)}")
        print(f"Number of GPUs: {torch.cuda.device_count()}")
except Exception as e:
    print("PyTorch is not installed or failed to import.")
    print(f"Import error: {e}")
    print("")
    print("1) Conda (recommended):")
    print("   conda install pytorch==2.8.0 torchvision torchaudio pytorch-cuda=12.8 -c pytorch -c nvidia -y")
    print("2) Pip (example using cu128 wheel index):")
    print("   python -m pip install --upgrade pip")
    print("   python -m pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128")
    print("3) CPU-only (if you don't have a compatible GPU):")
    print("   python -m pip install --upgrade pip; python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu")

    print("Make sure your NVIDIA driver supports CUDA 12.8 (check with 'nvidia-smi'). See https://pytorch.org/get-started/locally/ for other options.")    
    print("   conda install pytorch torchvision torchaudio pytorch-cuda=12.2 -c pytorch -c nvidia")
    print("\nChoose the command that matches your CUDA version or use the PyTorch Get Started selector: https://pytorch.org/get-started/locally/")

PyTorch version: 2.8.0+cu128
CUDA available: True
PyTorch is not installed or failed to import.
Import error: module 'torch' has no attribute 'version'

1) Conda (recommended):
   conda install pytorch==2.8.0 torchvision torchaudio pytorch-cuda=12.8 -c pytorch -c nvidia -y
2) Pip (example using cu128 wheel index):
   python -m pip install --upgrade pip
   python -m pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128
3) CPU-only (if you don't have a compatible GPU):
   python -m pip install --upgrade pip; python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
Make sure your NVIDIA driver supports CUDA 12.8 (check with 'nvidia-smi'). See https://pytorch.org/get-started/locally/ for other options.
   conda install pytorch torchvision torchaudio pytorch-cuda=12.2 -c pytorch -c nvidia

Choose the command that matches your CUDA version or use the PyTorch Get Started selector: https://pytorch.org/get-started/locally/


In [10]:
import os
import pandas as pd
import ast
pd.set_option('display.max_columns', None)

import numpy as np

# Set device - with fallback to CPU if GPU not available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# If using CUDA, set memory management
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # Enable TF32 for better performance on newer GPUs
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

Using device: cuda


In [11]:
# CONSTANTS
data_path = r"./data/interm/data_to_train_meantime.csv"

In [13]:
# LOAD THE ESSENTIALS
df = pd.read_csv("job_table.csv")

In [None]:
# Drop helper/index columns if present (use errors='ignore' to avoid KeyError)
df = df.drop(columns=["Unnamed: 0", "submit_time"], errors='ignore')

In [14]:
df

Unnamed: 0,cores_alloc_layout,cores_allocated,cores_per_task,derived_ec,eligible_time,end_time,group_id,job_id,job_state,nodes,num_cores_req,num_cores_alloc,num_nodes_req,num_nodes_alloc,num_tasks,partition,priority,qos,req_nodes,req_switch,run_time,shared,start_time,state_reason,submit_time,threads_per_core,time_limit,num_gpus_req,num_gpus_alloc,mem_req,mem_alloc,user_id,node_power_consumption,mem_power_consumption,cpu_power_consumption
0,"{900: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","{900: 128, 915: 128, 902: 128, 901: 128, 904: ...",4,1:0,2020-05-31 22:09:29+00:00,2020-05-31 22:21:33+00:00,25200,2913594,CANCELLED,[900 901 902 903 904 905 906 907 908 909 910 9...,256,2048,16,16,64.0,1,330603,1,,0,723,0,2020-05-31 22:09:30+00:00,,2020-05-31 22:09:29+00:00,,270,64,64,475,3800,310,[7970 8450 8460 8470 7440 8470 8460 8470 7910 ...,[418 724 724 678 556 654 606 600 600 488 606 4...,[ 948 1628 1650 1544 1260 1532 1418 1700 1710 ...
1,"{687: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","{687: 128, 688: 128, 681: 128, 682: 128, 680: ...",4,1:0,2020-05-31 22:22:08+00:00,2020-05-31 22:41:25+00:00,25200,4063066,CANCELLED,[680 681 682 683 684 685 686 687 688 689 690 6...,256,2048,16,16,64.0,1,81394,1,,0,1157,0,2020-05-31 22:22:08+00:00,,2020-05-31 22:22:08+00:00,,270,64,64,475,3800,310,[7970 8430 7940 8480 7460 8490 6890 8480 8480 ...,[720 738 736 614 720 642 632 524 598 628 616 5...,[1640 1604 1592 1364 1532 1508 1528 1476 1674 ...
2,"{687: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","{687: 128, 688: 128, 681: 128, 682: 128, 680: ...",4,1:0,2020-05-31 22:41:38+00:00,2020-05-31 22:56:20+00:00,25200,5730196,CANCELLED,[680 681 682 683 684 685 686 687 688 689 690 6...,256,2048,16,16,64.0,1,80615,1,,0,881,0,2020-05-31 22:41:39+00:00,,2020-05-31 22:41:38+00:00,,270,64,64,475,3800,310,[7950 7970 8500 8480 8470 6900 8460 8450 8470 ...,[672 720 716 630 674 474 644 606 602 564 650 5...,[1654 1600 1606 1438 1506 1108 1496 1670 1680 ...
3,"{416: [0, 1, 2, 3, 4, 5, 6, 7]}",{416: 32},32,1:0,2020-05-31 23:26:23+00:00,2020-05-31 23:45:16+00:00,25200,3047960,FAILED,[416],32,32,1,1,0.0,1,206885,1,,0,1133,OK,2020-05-31 23:26:23+00:00,NonZeroExitCode,2020-05-31 23:26:23+00:00,,30,4,4,59,59,379,[860 860 860 860 860 860 860 860 860 860 860 8...,[38 40 46 44 48 40 44 46 42 40 40 40 38 46 44 ...,[108 182 178 182 190 174 188 186 190 176 176 1...
4,"{416: [0, 1, 2, 3, 4, 5, 6, 7]}",{416: 32},32,1:0,2020-05-31 23:08:01+00:00,2020-05-31 23:25:27+00:00,25200,5749077,FAILED,[416],32,32,1,1,0.0,1,330339,1,,0,1046,OK,2020-05-31 23:08:01+00:00,NonZeroExitCode,2020-05-31 23:08:01+00:00,,30,4,4,59,59,379,[860 860 860 860 860 860 860 860 860 860 860 8...,[36 44 42 42 44 40 44 42 50 42 42 46 48 42 44 ...,[ 82 182 178 180 170 168 168 192 196 166 196 1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231233,"{609: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",{609: 128},32,0:0,2020-10-07 05:38:18+00:00,2020-10-07 05:38:21+00:00,25200,680209,COMPLETED,[609],128,128,1,1,4.0,1,150415,1,,0,2,OK,2020-10-07 05:38:19+00:00,,2020-10-07 05:38:18+00:00,,1440,4,4,237,237,8,[920],[36],[90]
231234,"{386: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",{386: 128},1,0:125,2020-10-07 07:38:03+00:00,2020-10-07 07:57:22+00:00,25200,2306759,COMPLETED,[386],4,128,1,1,4.0,1,306792,1,[386],0,387,0,2020-10-07 07:50:55+00:00,,2020-10-07 07:38:03+00:00,,30,4,4,7,237,182,[870 860 860 860 860 870 860 870 860 860 860 8...,[44 36 36 36 36 36 36 36 52 36 36 36 36 36 36 ...,[ 96 92 100 90 92 94 98 94 96 90 90 ...
231235,"{55: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...",{55: 128},16,0:0,2020-10-07 09:56:10+00:00,2020-10-07 13:02:03+00:00,25200,2276256,COMPLETED,[55],128,128,1,1,8.0,1,1847,11,,0,5494,OK,2020-10-07 11:30:29+00:00,,2020-10-07 09:56:10+00:00,,540,4,4,234,234,666,[ 620 620 610 630 630 630 620 620 630 ...,[38 40 38 38 38 42 38 38 40 38 38 38 38 38 40 ...,[282 202 246 234 274 288 274 288 250 252 284 2...
231236,"{858: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",{858: 128},32,0:0,2020-10-07 06:04:00+00:00,2020-10-07 06:04:02+00:00,25200,4921388,COMPLETED,[858],128,128,1,1,4.0,1,141682,1,,0,2,OK,2020-10-07 06:04:00+00:00,,2020-10-07 06:04:00+00:00,,1440,4,4,237,237,8,[860],[36],[46]


In [15]:
columns_set = set(df.columns.values.tolist())
target_set = set(["job_mean_power_consumption","job_min_power_consumption","job_max_power_consumption"])
Y_columns = list(target_set)
X_columns = list(columns_set - target_set)

In [16]:
df[X_columns]

Unnamed: 0,qos,num_cores_req,num_nodes_alloc,num_tasks,num_gpus_alloc,priority,user_id,cores_alloc_layout,submit_time,num_nodes_req,state_reason,mem_req,end_time,start_time,mem_power_consumption,group_id,nodes,num_cores_alloc,partition,run_time,cpu_power_consumption,node_power_consumption,job_id,job_state,threads_per_core,eligible_time,req_switch,derived_ec,time_limit,num_gpus_req,cores_allocated,mem_alloc,cores_per_task,req_nodes,shared
0,1,256,16,64.0,64,330603,310,"{900: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",2020-05-31 22:09:29+00:00,16,,475,2020-05-31 22:21:33+00:00,2020-05-31 22:09:30+00:00,[418 724 724 678 556 654 606 600 600 488 606 4...,25200,[900 901 902 903 904 905 906 907 908 909 910 9...,2048,1,723,[ 948 1628 1650 1544 1260 1532 1418 1700 1710 ...,[7970 8450 8460 8470 7440 8470 8460 8470 7910 ...,2913594,CANCELLED,,2020-05-31 22:09:29+00:00,0,1:0,270,64,"{900: 128, 915: 128, 902: 128, 901: 128, 904: ...",3800,4,,0
1,1,256,16,64.0,64,81394,310,"{687: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",2020-05-31 22:22:08+00:00,16,,475,2020-05-31 22:41:25+00:00,2020-05-31 22:22:08+00:00,[720 738 736 614 720 642 632 524 598 628 616 5...,25200,[680 681 682 683 684 685 686 687 688 689 690 6...,2048,1,1157,[1640 1604 1592 1364 1532 1508 1528 1476 1674 ...,[7970 8430 7940 8480 7460 8490 6890 8480 8480 ...,4063066,CANCELLED,,2020-05-31 22:22:08+00:00,0,1:0,270,64,"{687: 128, 688: 128, 681: 128, 682: 128, 680: ...",3800,4,,0
2,1,256,16,64.0,64,80615,310,"{687: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",2020-05-31 22:41:38+00:00,16,,475,2020-05-31 22:56:20+00:00,2020-05-31 22:41:39+00:00,[672 720 716 630 674 474 644 606 602 564 650 5...,25200,[680 681 682 683 684 685 686 687 688 689 690 6...,2048,1,881,[1654 1600 1606 1438 1506 1108 1496 1670 1680 ...,[7950 7970 8500 8480 8470 6900 8460 8450 8470 ...,5730196,CANCELLED,,2020-05-31 22:41:38+00:00,0,1:0,270,64,"{687: 128, 688: 128, 681: 128, 682: 128, 680: ...",3800,4,,0
3,1,32,1,0.0,4,206885,379,"{416: [0, 1, 2, 3, 4, 5, 6, 7]}",2020-05-31 23:26:23+00:00,1,NonZeroExitCode,59,2020-05-31 23:45:16+00:00,2020-05-31 23:26:23+00:00,[38 40 46 44 48 40 44 46 42 40 40 40 38 46 44 ...,25200,[416],32,1,1133,[108 182 178 182 190 174 188 186 190 176 176 1...,[860 860 860 860 860 860 860 860 860 860 860 8...,3047960,FAILED,,2020-05-31 23:26:23+00:00,0,1:0,30,4,{416: 32},59,32,,OK
4,1,32,1,0.0,4,330339,379,"{416: [0, 1, 2, 3, 4, 5, 6, 7]}",2020-05-31 23:08:01+00:00,1,NonZeroExitCode,59,2020-05-31 23:25:27+00:00,2020-05-31 23:08:01+00:00,[36 44 42 42 44 40 44 42 50 42 42 46 48 42 44 ...,25200,[416],32,1,1046,[ 82 182 178 180 170 168 168 192 196 166 196 1...,[860 860 860 860 860 860 860 860 860 860 860 8...,5749077,FAILED,,2020-05-31 23:08:01+00:00,0,1:0,30,4,{416: 32},59,32,,OK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231233,1,128,1,4.0,4,150415,8,"{609: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",2020-10-07 05:38:18+00:00,1,,237,2020-10-07 05:38:21+00:00,2020-10-07 05:38:19+00:00,[36],25200,[609],128,1,2,[90],[920],680209,COMPLETED,,2020-10-07 05:38:18+00:00,0,0:0,1440,4,{609: 128},237,32,,OK
231234,1,4,1,4.0,4,306792,182,"{386: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",2020-10-07 07:38:03+00:00,1,,7,2020-10-07 07:57:22+00:00,2020-10-07 07:50:55+00:00,[44 36 36 36 36 36 36 36 52 36 36 36 36 36 36 ...,25200,[386],128,1,387,[ 96 92 100 90 92 94 98 94 96 90 90 ...,[870 860 860 860 860 870 860 870 860 860 860 8...,2306759,COMPLETED,,2020-10-07 07:38:03+00:00,0,0:125,30,4,{386: 128},237,1,[386],0
231235,11,128,1,8.0,4,1847,666,"{55: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12...",2020-10-07 09:56:10+00:00,1,,234,2020-10-07 13:02:03+00:00,2020-10-07 11:30:29+00:00,[38 40 38 38 38 42 38 38 40 38 38 38 38 38 40 ...,25200,[55],128,1,5494,[282 202 246 234 274 288 274 288 250 252 284 2...,[ 620 620 610 630 630 630 620 620 630 ...,2276256,COMPLETED,,2020-10-07 09:56:10+00:00,0,0:0,540,4,{55: 128},234,16,,OK
231236,1,128,1,4.0,4,141682,8,"{858: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...",2020-10-07 06:04:00+00:00,1,,237,2020-10-07 06:04:02+00:00,2020-10-07 06:04:00+00:00,[36],25200,[858],128,1,2,[46],[860],4921388,COMPLETED,,2020-10-07 06:04:00+00:00,0,0:0,1440,4,{858: 128},237,32,,OK


In [17]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from torch.utils.data import TensorDataset, DataLoader

ModuleNotFoundError: No module named 'sklearn'

In [19]:
# MAKE THE SPLITS

X, y = df[X_columns], df[Y_columns]

### FOR CLASSIC MODELS

# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.7, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit only on train!
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)


### FOR TORCH MODELS

# Convert to tensors and move to device
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create datasets and loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Adjust batch size based on GPU memory
batch_size = 2048 if torch.cuda.is_available() else 1028

# Use pin_memory for faster data transfer to GPU
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True,
    pin_memory=torch.cuda.is_available(),
    num_workers=0  # Set to 0 to avoid multiprocessing issues on Windows
)
val_loader = DataLoader(
    val_dataset, 
    batch_size=batch_size, 
    shuffle=False,
    pin_memory=torch.cuda.is_available(),
    num_workers=0
)
test_loader = DataLoader(
    test_dataset, 
    batch_size=batch_size, 
    shuffle=False,
    pin_memory=torch.cuda.is_available(),
    num_workers=0
)

KeyError: "None of [Index(['job_max_power_consumption', 'job_min_power_consumption',\n       'job_mean_power_consumption'],\n      dtype='object')] are in the [columns]"

In [None]:
print(f"Number of batches - Train: {len(train_loader)}, Val: {len(val_loader)}, Test: {len(test_loader)}")
print(f"Batch size: {batch_size}")

In [None]:
import sys
sys.path.append('./models/code_models/')

import sklearn_models
import torch_models
import training_utils

# SKLEARN

In [None]:
rf_model = sklearn_models.get_random_forest()

rf_trainer = training_utils.SklearnTrainer(
    model=rf_model,
    model_name="RandomForest_Default",
    project_name="Test",
    entity="iqbalch-universidad-carlos-iii-de-madrid" 
)

rf_model, rf_metrics = rf_trainer.train(
    X_train_scaled, y_train,
    X_val_scaled, y_val,
    config=rf_model.get_params()
)

print("\nValidation Metrics:")
for key, value in rf_metrics.items():
    if 'val' in key:
        print(f"{key}: {value:.4f}")

test_metrics, test_predictions = training_utils.evaluate_model(
    rf_model, X_test_scaled, y_test, model_type="sklearn"
)

print("\nTest Set Metrics:")
for key, value in test_metrics.items():
    print(f"{key}: {value:.4f}")

# TORCH

In [18]:
input_dim = X_train_scaled.shape[1]
simple_mlp_model = torch_models.SimpleMLP(input_dim=input_dim)

# Move model to device
simple_mlp_model = simple_mlp_model.to(device)
print(f"Model moved to: {next(simple_mlp_model.parameters()).device}")

# Update trainer to use device
mlp_trainer = training_utils.PyTorchTrainer(
    model=simple_mlp_model,
    model_name="MLP Model",
    project_name="Test",
    entity="iqbalch-universidad-carlos-iii-de-madrid",
    device=device  # Pass device to trainer
)

mlp_model, mlp_best_metrics = mlp_trainer.train(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=150,
    lr=0.001,
    weight_decay=1e-5,
    patience=15,
)

# Evaluate on test
mlp_test_metrics, _ = training_utils.evaluate_model(
    mlp_model, X_test_scaled, y_test, model_type="pytorch", device=device
)

print("\nTest Set Metrics:")
print(f"RMSE: {mlp_test_metrics['test_rmse_mean']:.4f}")
print(f"MAE: {mlp_test_metrics['test_mae_mean']:.4f}")
print(f"R2: {mlp_test_metrics['test_r2_mean']:.4f}")

NameError: name 'X_train_scaled' is not defined

In [None]:
# Clean up GPU memory after training
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU cache cleared")