In [None]:
import torch

# Report which CUDA devices PyTorch can see: the device count followed by
# one line per device with its model name.
if not torch.cuda.is_available():
    # No usable CUDA runtime/device was detected.
    print("CUDA is not available. No GPUs found.")
else:
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")

    # List every visible device by index.
    for i in range(num_gpus):
        device_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {device_name}")

Number of available GPUs: 1
GPU 0: Tesla T4


In [None]:
# Dependencies of train.py: bitsandbytes backs the 4-bit BitsAndBytesConfig,
# peft supplies the LoRA adapter utilities, and deepchem provides the FreeSolv
# loader. `--pre` lets pip pick a pre-release build of deepchem.
# NOTE(review): versions are unpinned, so a re-run may resolve different releases.
! pip install bitsandbytes
! pip install peft
! pip install --pre deepchem


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1
Collecting deepchem
  Downloading deepchem-2.8.1.dev20260121224927-py3-none-any.whl.metadata (2.2 kB)
Collecting numpy<2 (from deepchem)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit (from deepchem)
  Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Downloading deepchem-2.8.1.dev20260121224927-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# pytorch_lightning supplies the Trainer / LightningModule used by train.py.
# NOTE(review): ai2-olmo is presumably required by the OLMo checkpoint's
# remote code (train.py never imports it directly) - confirm it is still needed.
! pip install ai2-olmo
! pip install pytorch_lightning

Collecting ai2-olmo
  Downloading ai2_olmo-0.6.0-py3-none-any.whl.metadata (25 kB)
Collecting ai2-olmo-core==0.1.0 (from ai2-olmo)
  Downloading ai2_olmo_core-0.1.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3 (from ai2-olmo)
  Downloading boto3-1.42.36-py3-none-any.whl.metadata (6.8 kB)
Collecting cached_path>=1.6.2 (from ai2-olmo)
  Downloading cached_path-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting botocore<1.43.0,>=1.42.36 (from boto3->ai2-olmo)
  Downloading botocore-1.42.36-py3-none-any.whl.metadata (5.9 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->ai2-olmo)
  Downloading jmespath-1.1.0-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3->ai2-olmo)
  Downloading s3transfer-0.16.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ai2_olmo-0.6.0-py3-none-any.whl (144.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.9/144.9 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ai2_olmo_core-0.1.0-py3-n

In [None]:
%%writefile train.py
import torch
import pytorch_lightning as pl
import deepchem as dc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
from pytorch_lightning.callbacks import ModelCheckpoint
from deepchem.molnet import load_freesolv
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
from sklearn.metrics import mean_squared_error
import re
from math import sqrt

class OlmoDataset(Dataset):
    """FreeSolv samples rendered as instruction/response text for causal-LM tuning.

    Every (molecule, task) pair with a positive weight in the selected scaffold
    split becomes one text sample of the form::

        ### Instruction:
        Predict the hydration free energy (in kcal/mol) for the following molecule:
        <SMILES>

        ### Response:
        <label with 5 decimals><eos>

    ``__getitem__`` tokenizes a sample to a fixed length and returns labels in
    which the instruction part and the padding positions are set to -100 (the
    value ignored by the language-model loss), leaving the answer and its
    closing EOS token as the training target.

    NOTE(review): ``load_freesolv`` is called with its default ``transformers``
    argument. If that default normalizes ``y``, the labels embedded in the
    prompts are standardized values rather than raw kcal/mol - confirm.

    Args:
        mode: Split to expose: "train", "valid" or "test" (case-insensitive).
        max_length: Token length every sample is padded/truncated to.

    Raises:
        ValueError: If ``mode`` is not one of the supported split names.
    """

    def __init__(self, mode="Train", max_length=300):
        # Validate first so a typo fails immediately instead of surfacing as a
        # missing ``self.data`` attribute after the downloads have happened.
        self.mode = mode.lower()
        if self.mode not in ("train", "valid", "test"):
            raise ValueError(
                f"Unknown mode {mode!r}; expected 'train', 'valid' or 'test'"
            )

        self.tokenizer = AutoTokenizer.from_pretrained(
            "Codemaster67/OLMo-7B-ZINC20-10k",
            trust_remote_code=True,
            padding_side="right"
        )
        # Padding reuses the EOS token, so pad positions cannot be recognised
        # by token id alone; __getitem__ relies on the attention mask instead.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        tasks, datasets, _transformers = load_freesolv(featurizer="raw", splitter='scaffold')
        train, valid, test = datasets

        self.task_names = tasks
        self.data = {"train": train, "valid": valid, "test": test}[self.mode]

        self.max_length = max_length
        self.samples = []
        self._filldataset()

    def _filldataset(self):
        """Build one prompt string per positively weighted (molecule, task) label."""
        # Descriptive name used in the prompt, since the original task name is just "y".
        task_name = "hydration free energy"

        # Read each array once instead of re-fetching it on every iteration.
        all_smiles, all_labels, all_weights = self.data.ids, self.data.y, self.data.w

        for smiles, labels, weights in zip(all_smiles, all_labels, all_weights):
            for task_idx, label in enumerate(labels):
                # A non-positive weight marks a label that should not be used.
                if weights[task_idx] > 0:
                    self.samples.append(self._create_prompt(smiles, task_name, label))

        print(f"[{self.mode.upper()}] Number of samples: {len(self.samples)}")

    def _create_prompt(self, smiles, task_name, label):
        """Return the full training text (instruction, SMILES, answer, EOS) for one label."""
        eos_token = self.tokenizer.eos_token
        answer = f"{label:.5f}"

        full_prompt = (
            "### Instruction:\n"
            f"Predict the {task_name} (in kcal/mol) for the following molecule:\n"
            f"{smiles}\n\n"
            "### Response:\n"
            f"{answer}{eos_token}"
        )
        return full_prompt

    def __len__(self):
        """Number of prompt samples in this split."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and build loss labels for the answer span.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``, each of length ``max_length``. ``labels`` equals
            ``input_ids`` on the answer tokens (including the closing EOS) and
            -100 on the instruction part and on padding.
        """
        text = self.samples[idx]
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = encodings["input_ids"].squeeze(0)
        attention_mask = encodings["attention_mask"].squeeze(0)
        labels = input_ids.clone()

        separator = "### Response:\n"
        instruction, found, _answer = text.partition(separator)

        if found:
            # Tokenize the instruction part on its own to learn how many
            # leading tokens belong to it.
            prompt_encodings = self.tokenizer(
                instruction + separator,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            prompt_len = prompt_encodings["input_ids"].shape[1]

            # If the instruction alone fills the whole window, nothing is
            # masked here (unchanged from the previous behaviour).
            if prompt_len < len(labels):
                labels[:prompt_len] = -100

        # Mask padding via the attention mask, not via the pad token id: the
        # pad token is the EOS token, so an id comparison would also hide the
        # genuine EOS that ends the answer and the model would never be
        # trained to stop after the number.
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

class OLMO_QLoRA(pl.LightningModule):
    """QLoRA fine-tuning module for the ``Codemaster67/OLMo-7B-ZINC20-10k`` checkpoint.

    The base model is loaded with a 4-bit NF4 quantization config and wrapped
    with LoRA adapters on the modules listed in ``peft_config.target_modules``.
    Training minimizes the loss returned by the causal-LM forward pass. When
    fitting ends, ``on_train_end`` generates an answer for each test prompt,
    parses the first number out of the generated text and prints the RMSE
    against the reference values.
    """

    # Signed decimal or integer ("-1.23", ".5", "+7", "-12"). The optional sign
    # sits outside the alternation so that it also applies to plain integers.
    _NUMBER_RE = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")

    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(
            "Codemaster67/OLMo-7B-ZINC20-10k",
            trust_remote_code=True,
            padding_side="right"
        )
        # Reuse EOS as the padding token.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # 4-bit NF4 weights with double quantization; float16 compute dtype.
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16
        )

        # Rank-32 LoRA adapters on the listed projection modules.
        self.peft_config = LoraConfig(
            r=32,
            lora_alpha=64,
            target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.CAUSAL_LM
        )

    def configure_model(self):
        """Load the quantized base model and attach the LoRA adapters (once)."""
        # Lightning may invoke this hook more than once per process; do not
        # reload and re-wrap an already prepared model.
        if getattr(self, "model", None) is not None:
            return

        self.model = AutoModelForCausalLM.from_pretrained(
            "Codemaster67/OLMo-7B-ZINC20-10k",
            quantization_config=self.bnb_config,
            trust_remote_code=True,
        )
        self.model = prepare_model_for_kbit_training(self.model)
        self.model = get_peft_model(self.model, self.peft_config)
        self.model.print_trainable_parameters()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; the output carries ``.loss`` when ``labels`` is given."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def training_step(self, batch, batch_idx):
        """Compute and log the language-model loss for one batch."""
        outputs = self(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )
        loss = outputs.loss
        self.log("Train_loss", loss, prog_bar=True, on_step=True, on_epoch=True, logger=True)
        return loss

    @classmethod
    def _parse_first_float(cls, text):
        """Return the first number found in ``text`` as a float, or None if there is none."""
        match = cls._NUMBER_RE.search(text)
        return float(match.group()) if match else None

    def on_train_end(self):
        """Generate answers for the test split and print the RMSE (global rank zero only).

        Samples whose reference value or generated text cannot be parsed as a
        number are reported and left out of the metric rather than aborting
        the evaluation.
        """
        if not self.trainer.is_global_zero:
            return

        print("\nStarting test set evaluation (RMSE)...")

        test_dataset = OlmoDataset(mode="test", max_length=300)
        test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

        self.model.eval()

        y_true = []
        y_pred = []
        unparsed_predictions = 0

        print(f"Evaluating on {len(test_loader)} samples using Text Generation...")

        with torch.no_grad():
            for i, batch in enumerate(test_loader):
                batch = {k: v.to(self.device) for k, v in batch.items()}
                input_ids = batch["input_ids"]
                labels = batch["labels"]
                attention_mask = batch["attention_mask"]

                # Find where the prompt ends (where labels switch from -100 to valid)
                response_mask = (labels != -100)
                answer_start_index = response_mask.int().argmax(dim=1).item()

                # Recover the ground-truth value by decoding the unmasked label tokens.
                valid_label_ids = labels[0][answer_start_index:]
                valid_label_ids = valid_label_ids[valid_label_ids != -100]
                true_text = self.tokenizer.decode(valid_label_ids, skip_special_tokens=True)

                try:
                    true_val = float(true_text.strip())
                except ValueError:
                    print(f"Skipping sample {i}: Could not parse ground truth '{true_text}'")
                    continue

                # Without a prompt there is nothing to condition generation on.
                if answer_start_index <= 0:
                    continue

                # Feed only the prompt tokens to generation.
                prompt_ids = input_ids[:, :answer_start_index]
                prompt_mask = attention_mask[:, :answer_start_index]

                # Max new tokens is small because we only expect a number (e.g., "-12.34")
                outputs = self.model.generate(
                    input_ids=prompt_ids,
                    attention_mask=prompt_mask,
                    max_new_tokens=15,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

                # Decode only the newly generated part (everything after the prompt).
                generated_ids = outputs[0][answer_start_index:]
                generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

                # The model may emit text without any number; skip instead of crashing.
                pred_val = self._parse_first_float(generated_text)
                if pred_val is None:
                    unparsed_predictions += 1
                    print(f"Skipping sample {i}: Could not parse prediction '{generated_text.strip()}'")
                    continue

                y_true.append(true_val)
                y_pred.append(pred_val)

                if i % 20 == 0:
                    print(f"Sample {i}: True={true_val}, Pred={pred_val:.5f}, Raw='{generated_text.strip()}'")

        print("\n=== Test Set Metrics ===")
        if unparsed_predictions:
            print(f"Unparsable predictions excluded from RMSE: {unparsed_predictions}")

        # mean_squared_error rejects empty inputs, so report that case explicitly.
        if not y_true:
            print("RMSE: n/a (no sample produced a parsable prediction)")
            return

        # Calculate RMSE
        mse = mean_squared_error(y_true, y_pred)
        rmse = sqrt(mse)
        print(f"RMSE: {rmse:.4f}")

    def configure_optimizers(self):
        """AdamW with a linear warm-up (first 15% of steps) followed by cosine decay."""
        # Only the parameters left trainable receive gradients; hand the
        # optimizer just those instead of every frozen base weight as well.
        trainable_params = [p for p in self.parameters() if p.requires_grad]
        optimizer = torch.optim.AdamW(trainable_params, lr=1e-4, weight_decay=1e-4)

        total_steps = self.trainer.estimated_stepping_batches

        # Clamp both phases to at least one step so very short runs do not
        # produce a zero-length warm-up or a cosine schedule with T_max == 0.
        warmup_steps = max(1, int(0.15*total_steps))
        cosine_steps = max(1, total_steps - warmup_steps)

        scheduler_warmup = LinearLR(optimizer, start_factor=0.001, end_factor=1.0, total_iters=warmup_steps)
        scheduler_cosine = CosineAnnealingLR(optimizer, T_max=cosine_steps)
        scheduler = SequentialLR(optimizer, schedulers=[scheduler_warmup, scheduler_cosine], milestones=[warmup_steps])
        # "interval": "step" makes Lightning advance the schedule every optimizer step.
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "step"}}

if __name__ == "__main__":
    # Seed the Python, NumPy and PyTorch random generators so that repeated
    # runs of this script draw the same random numbers (e.g. batch shuffling).
    pl.seed_everything(42, workers=True)

    # Default mode of OlmoDataset is the training split.
    dataset = OlmoDataset()

    train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

    # 2 samples per batch x 8 accumulated batches = 16 samples per optimizer step.
    # NOTE(review): strategy="ddp" is kept as-is although only one device is used.
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=1,
        strategy="ddp",
        max_epochs=16,
        precision="16-mixed",
        accumulate_grad_batches=8,
        enable_checkpointing=False,  # no checkpoints are written by this run
        gradient_clip_val=0.5,
        log_every_n_steps=100,
    )

    model = OLMO_QLoRA()

    # Test-set evaluation runs from the module's on_train_end hook once fitting finishes.
    trainer.fit(model, train_loader)

Writing train.py


In [None]:

# Run the train.py file written by the %%writefile cell as a separate Python
# process; its console output is streamed into this cell.
!python train.py

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2K[1A[2K[1A[2K[1A[2K[1A[2KEpoch 11/15 [35m━━━[0m[35m╸[0m[90m━━━━━━━━━━━━[0m 64/257 [2m0:01:33 • 0:04:44[0m [2;4m0.68it/s[0m [3mv_num: 0.000     [0m
                                                               [3mTrain_loss_step: [0m
                                                               [3m0.002            [0m
                                                               [3mTrain_loss_epoch:[0m
[2K[1A[2K[1A[2K[1A[2K[1A[2KEpoch 11/15 [35m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m 65/257 [2m0:01:34 • 0:04:41[0m [2;4m0.68it/s[0m [3mv_num: 0.000     [0m
                                                               [3mTrain_loss_step: [0m
                                                               [3m0.008            [0m
                                                               [3mTrain_loss_epoch:[0m
[2K[1A[2K[1A[2K[1A[2K[1A[2KEpoch 11/15 [35m━━━━[0m