In [None]:
# import pandas as pd
# from tqdm import tqdm

# df_polarizability = pd.read_csv('../polygraphpy/data/polarizability_data.csv')

# for _, row in tqdm(df_polarizability.iterrows()):
#     id_val = row['id']
#     chain_size = row['chain_size']
#     value = row['static_polarizability']
    
#     if chain_size == 0:
#         filename = f"xyz_files/monomer_{id_val}.pdb"
#     else:
#         filename = f"xyz_files/homopoly_{id_val}_chain_{chain_size}.pdb"
    
#     try:
#         with open(filename, 'r') as f:
#             content = f.read()
        
#         remark = f"REMARK static_polarizability {value}\n"
#         new_content = remark + content
        
#         with open(filename, 'w') as f:
#             f.write(new_content)
#     except:
#         pass

26582it [00:10, 2598.49it/s]


# Memory issues

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
import os

from huggingface_hub import login

class PDBDataset(Dataset):
    def __init__(self, pdb_dir, tokenizer):
        self.tokenizer = tokenizer
        self.data = []
        pdb_files = [f for f in os.listdir(pdb_dir) if f.startswith('monomer_') and f.endswith('.pdb')][:5000]
        print(f'Dataset len: {len(pdb_files)}')
        for file in pdb_files:
            full_path = os.path.join(pdb_dir, file)
            with open(full_path, 'r') as f:
                lines = f.readlines()
                prop_line = next((line for line in lines if line.startswith('REMARK')), None)
                if prop_line:
                    parts = prop_line.split()
                    prop_name = parts[1]
                    prop_value = parts[2]
                    prop = f"{prop_name}: {prop_value}"
                else:
                    prop = "Unknown: 0.0"
                pdb_text = ''.join(lines)
            text = f"Property: {prop}\n{pdb_text}"
            tokens = tokenizer(text, truncation=True, max_length=tokenizer.model_max_length, return_tensors='pt')['input_ids'].squeeze(0)
            self.data.append(tokens.tolist())

    def __len__(self): return len(self.data)
    def __getitem__(self, idx):
        input_ids = self.data[idx]
        return {'input_ids': input_ids}

gpt = '/home/jgduarte/Downloads/Llama-3.1-8B'

tokenizer = AutoTokenizer.from_pretrained(gpt)
tokenizer.pad_token = tokenizer.eos_token
quant_config = BitsAndBytesConfig(load_in_4bit=True, llm_int8_enable_fp32_cpu_offload=True, bnb_4bit_quant_type="nf4")
model = AutoModelForCausalLM.from_pretrained(gpt, quantization_config=quant_config, device_map={'': 0})

# Add LoRA adapters
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

dataset = PDBDataset('xyz_files', tokenizer)
dataloader = DataLoader(dataset, batch_size=4)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [2]:
# Train
training_args = TrainingArguments(output_dir='./results', num_train_epochs=3, per_device_train_batch_size=1, gradient_accumulation_steps=16, fp16=True, gradient_checkpointing=True)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
trainer = Trainer(model=model, args=training_args, train_dataset=dataset, data_collator=data_collator)
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TypeError: device() received an invalid combination of arguments - got (NoneType), but expected one of:
 * (torch.device device)
      didn't match because some of the arguments have invalid types: (!NoneType!)
 * (str type, int index)


# Nano GPT

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
import math
from tqdm import tqdm
from transformers import AutoTokenizer
import torch.nn.functional as F

# Dataset
class PDBDataset(Dataset):
    def __init__(self, pdb_dir, tokenizer, block_size=1024):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.data = []
        pdb_files = [f for f in os.listdir(pdb_dir) if f.startswith('monomer') and f.endswith('.pdb')]
        for file in pdb_files:
            full_path = os.path.join(pdb_dir, file)
            with open(full_path, 'r') as f:
                lines = f.readlines()
                prop_line = next((line for line in lines if line.startswith('REMARK')), None)
                if prop_line:
                    parts = prop_line.split()
                    prop_name = parts[1]
                    prop_value = parts[2]
                    prop = f"{prop_name}: {prop_value}"
                else:
                    prop = "Unknown: 0.0"
                pdb_text = ''.join(lines)
            text = f"Property: {prop}\n{pdb_text}<|endoftext|>"
            tokens = self.tokenizer.encode(text, truncation=True, max_length=block_size + 1, return_tensors='pt')[0]
            self.data.append(tokens)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx]
        if tokens.size(0) > self.block_size + 1:
            i = torch.randint(0, tokens.size(0) - self.block_size - 1, (1,)).item()
            x = tokens[i:i+self.block_size]
            y = tokens[i+1:i+self.block_size+1]
        else:
            x = tokens[:-1]
            y = tokens[1:]
        pad_id = self.tokenizer.pad_token_id
        ignore = -100
        x = F.pad(x, (0, self.block_size - x.size(0)), value=pad_id)
        y = F.pad(y, (0, self.block_size - y.size(0)), value=ignore)
        return x, y

# NanoGPT Model
class NanoGPT(nn.Module):
    def __init__(self, vocab_size, n_embd=256, n_head=4, n_layer=6, block_size=1024):
        super().__init__()
        self.block_size = block_size
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Parameter(torch.zeros(1, block_size, n_embd))
        self.layers = nn.ModuleList([TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, x, targets=None):
        B, T = x.shape
        tok_emb = self.token_embedding(x)
        pos_emb = self.pos_embedding[:, :T, :]
        x = tok_emb + pos_emb
        for layer in self.layers:
            x = layer(x)
        x = self.ln_f(x)
        logits = self.head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = nn.CrossEntropyLoss(ignore_index=-100)(logits, targets)
        return logits, loss

class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, n_head, head_size, n_embd):
        super().__init__()
        self.key = nn.Linear(n_embd, n_embd)
        self.query = nn.Linear(n_embd, n_embd)
        self.value = nn.Linear(n_embd, n_embd)
        self.proj = nn.Linear(n_embd, n_embd)
        self.n_head = n_head
        self.head_size = head_size
        self.register_buffer('tril', torch.tril(torch.ones(1024, 1024)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x).view(B, T, self.n_head, self.head_size).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.head_size).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.head_size).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_size))
        att = att.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        att = torch.softmax(att, dim=-1)

        out = att @ v
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.proj(out)

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
        )

    def forward(self, x):
        return self.net(x)

# Training
def train():
    pdb_dir = 'xyz_files'
    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'eos_token': '<|endoftext|>', 'pad_token': '[PAD]'})
    dataset = PDBDataset(pdb_dir, tokenizer, block_size=1024)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

    model = NanoGPT(len(tokenizer), block_size=1024)
    optimizer = optim.AdamW(model.parameters(), lr=3e-4)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    epochs = 3
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x, y in tqdm(dataloader):
            x, y = x.to(device), y.to(device)
            _, loss = model(x, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}: Loss {total_loss / len(dataloader)}")

    torch.save(model, 'nanogpt_pdb.pth')

# Generation
def generate(model, tokenizer, prompt, max_len=2000, temperature=1.0):
    model.eval()
    tokens = tokenizer.encode(prompt, return_tensors='pt')[0].unsqueeze(0).to(next(model.parameters()).device)
    for _ in range(max_len):
        logits, _ = model(tokens)
        logits = logits[:, -1, :] / temperature
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        tokens = torch.cat((tokens, next_token), dim=1)
        if next_token.item() == tokenizer.eos_token_id:
            break
    return tokenizer.decode(tokens[0])

In [None]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
train()

100%|██████████| 4808/4808 [10:23<00:00,  7.71it/s]


Epoch 1: Loss 0.72398286278107


 55%|█████▍    | 2640/4808 [05:40<04:26,  8.12it/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'eos_token': '<|endoftext|>', 'pad_token': '[PAD]'})
model = NanoGPT(len(tokenizer), block_size=1024)
model = torch.load('nanogpt_pdb.pth')
prompt = "Property: static_polarizability: 184.754578883\n"
generated = generate(model, tokenizer, prompt)
print(generated)