In [1]:
from datasets import load_dataset
import torch
from dotenv import load_dotenv
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import os
from huggingface_hub import login

In [2]:
load_dotenv()
login_token = os.getenv('LOGIN_TOKEN')
login(login_token)

In [3]:
ds = load_dataset("Abdulrhman37/metallurgy-qa")

Found cached dataset json (/home/ykandik/.cache/huggingface/datasets/Abdulrhman37___json/Abdulrhman37--metallurgy-qa-19dc6936f18e311d/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
model_path = 'google/gemma-3-1b-pt'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.bfloat16)

2025-03-25 11:53:29.702557: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-03-25 11:53:29.702582: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
def process_text(example):
     # return [
     #     [{'role': "user", 
     #       'content': [{"type": "text", "text": example['instruction']},]
     #      },
     #     {'role': "assistant", "content": [{"type": "text", "text": example['output']}, ]
     #     }
     #     ]
     # ]
    return f"question: {example['instruction']} \n response: {example['output']}"
def tokenize_function(examples):
    text = process_text(examples)
    inputs = tokenizer(text,
    return_tensors="pt", padding='max_length', max_length=1024, truncation=True)
    return inputs

In [6]:
train_test_split = ds["train"].train_test_split(test_size=0.3)
train_ds = train_test_split['train']
test_ds = train_test_split['test'].train_test_split(test_size=0.4)
val_ds = test_ds['test']
test_ds = test_ds['train']
train_ds = train_ds.map(lambda x: {"inputs": tokenize_function(x)})
val_ds = val_ds.map(lambda x: {"inputs": tokenize_function(x)})

Map:   0%|          | 0/3648 [00:00<?, ? examples/s]

Map:   0%|          | 0/626 [00:00<?, ? examples/s]

In [7]:
class TextDataset(Dataset):

    def __init__(self, ds):
        self.ds = ds
        self.samples = ds.shape[0]

    def __getitem__(self, idx):
        tokens = self.ds[idx]['inputs']
        return {'input_ids': torch.tensor(tokens['input_ids']), 
               'attention_mask': torch.tensor(tokens['attention_mask'])}
        
    def __len__(self):
        return self.samples

In [8]:
train_dataset = TextDataset(train_ds)
val_dataset = TextDataset(val_ds)
test_dataset = TextDataset(test_ds)

In [9]:
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)
test_dataset = DataLoader(test_dataset, batch_size=2)

In [10]:
def get_validation_loss(val_loader, model):
    model.eval()
    with torch.no_grad():
        val_loss = []
        for dic in val_loader:
            input_ids = dic['input_ids'].squeeze(1).to(model.device)
            attention_mask = dic['attention_mask'].squeeze(1).to(model.device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            val_loss.append(outputs.loss)
        return sum(val_loss)/len(val_loss)

In [11]:
class EarlyStopper():
    def __init__(self, patience=15, min_delta=0.01):
        self.patience = patience
        self.min_delta = min_delta
        self.min_validation_loss = float('inf')
        self.counter = 0


    def early_stop(self, model, validation_loss, path):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            print('Saving model') 
            torch.save(model.state_dict(), f'{path}.pth')
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [12]:
def train(train_loader, val_loader, model, train_params, path='model'):
    stopper = EarlyStopper()
    epochs = train_params['epochs']
    optimizer = train_params['optimizer']
    val_loss = None
    train_loss = None
    for epoch in range(epochs):
        loss_at_epoch = []
        for dic in tqdm(train_loader, desc=f'Epoch:{epoch}/{epochs} train_loss: {train_loss} val_loss: {val_loss}'):
            input_ids = dic['input_ids'].squeeze(1).to(model.device)
            attention_mask = dic['attention_mask'].squeeze(1).to(model.device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            loss_val = outputs.loss
            loss_at_epoch.append(loss_val)
            optimizer.zero_grad()
            loss_val.backward()
            optimizer.step()
        train_loss = sum(loss_at_epoch)/len(loss_at_epoch)
        val_loss = get_validation_loss(val_loader, model)
        stop = stopper.early_stop(model, val_loss, path)
        if stop:
            print(f'Stopping early at epoch {epoch}')
            break

In [13]:
train_params = {'epochs': 0, 'optimizer': optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)}

In [14]:
for name, weights in model.named_parameters():
    if 'layers' in name:
        name_toks = name.split('.')
        num = int(name_toks[2])
        if num < 10:
            weights.requires_grad = False  # keeping the first 10 layers frozen

In [15]:
train(train_loader, val_loader, model, train_params)

Epoch:0/5 train_loss: None val_loss: None: 100%|██████████████████████████████| 1824/1824 [06:34<00:00,  4.62it/s]


Saving model


Epoch:1/5 train_loss: 0.7636858224868774 val_loss: 0.5677177906036377: 100%|██| 1824/1824 [06:35<00:00,  4.62it/s]


Saving model


Epoch:2/5 train_loss: 0.5057775974273682 val_loss: 0.528381884098053: 100%|███| 1824/1824 [06:37<00:00,  4.59it/s]


Saving model


Epoch:3/5 train_loss: 0.4105050563812256 val_loss: 0.5251997709274292: 100%|██| 1824/1824 [06:39<00:00,  4.57it/s]


Saving model


Epoch:4/5 train_loss: 0.3305712342262268 val_loss: 0.5130376219749451: 100%|██| 1824/1824 [06:37<00:00,  4.59it/s]


In [16]:
model.load_state_dict(torch.load('./model.pth', weights_only=True))

<All keys matched successfully>

In [18]:
def process_text(example):
    return f"question: {example['instruction']} \n response: "

In [19]:
test_ds = test_ds.map(lambda x: {"inputs": tokenize_function(x)})

Map:   0%|          | 0/938 [00:00<?, ? examples/s]

In [None]:
model.eval()
with torch.no_grad():
    out_tokens = model.generate(inputs=torch.tensor(test_ds['inputs'][2]['input_ids'].to(model.device)), attention_mask=torch.tensor(test_ds['inputs'][2]['attention_mask'].to(model.device)), max_new_tokens=10)



In [None]:
print(tokenizer.decode(out_tokens[0], skip_special_tokens=True))

In [None]:
test_ds['output'][2]