In [None]:
from datasets import load_dataset
import pandas as pd


ds = load_dataset("maxscha/commitbench")


cols = ["diff", "message"]


ds["train"].select_columns(cols) \
    .to_pandas() \
    .to_csv("commitbench_train.csv", index=False)


ds["validation"].select_columns(cols) \
    .to_pandas() \
    .to_csv("commitbench_validation.csv", index=False)


ds["test"].select_columns(cols) \
    .to_pandas() \
    .to_csv("commitbench_test.csv", index=False)


In [None]:
import pandas as pd
train_df = pd.read_csv("commitbench_train.csv")
import pandas as pd

input_csv = "commitbench_train.csv"
output_csv = "commitbench_train_10pct.csv"

chunk_size = 100_000
keep_frac = 0.10
first = True

for chunk in pd.read_csv(input_csv, chunksize=chunk_size):
    sampled = chunk.sample(frac=keep_frac, random_state=42)
    sampled.to_csv(
        output_csv,
        mode="w",
        index=False,
        header=first
    )
    first = False


In [8]:
import pandas as pd
train_df = pd.read_csv("commitbench_train_10pct.csv") 

In [10]:
def format_input(entry):
    
    input_text = f"{entry['diff']}\nCommit Message:\n{entry['message']}"

    return input_text
print(train_df.iloc[0])
print(format_input(train_df.iloc[0]))


diff       diff --git a/lib/is_translatable.rb b/lib/is_t...
message    Adding some failing specs to get the translati...
Name: 0, dtype: object
diff --git a/lib/is_translatable.rb b/lib/is_translatable.rb
index <HASH>..<HASH> 100644
--- a/lib/is_translatable.rb
+++ b/lib/is_translatable.rb
@@ -13,6 +13,9 @@ module IsTranslatable
 
     def set_translation(kind, t, locale_override=nil)
     end
+
+	def get_translation(kind, locale_override=nil)
+	end
   end
 end
 
diff --git a/spec/lib/is_translatable_spec.rb b/spec/lib/is_translatable_spec.rb
index <HASH>..<HASH> 100644
--- a/spec/lib/is_translatable_spec.rb
+++ b/spec/lib/is_translatable_spec.rb
@@ -24,7 +24,20 @@ describe IsTranslatable do
       context 'with translated title' do
         before {@article.set_translation(:title, @titles[:es])}
         it {should be_valid}
-        it "should check that it's actually translated"
+
+        it {subject.get_translation(:title).should == @titles[:es]}
+
+		context 'loaded from db' do


In [60]:
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token


def token_len(t: str) -> int:
    return len(tokenizer.encode(t, add_special_tokens=True))


rng = np.random.default_rng(42)
texts = train_df.apply(format_input, axis=1).tolist()
sample_size = min(50_000, len(texts))
sample_texts = rng.choice(texts, size=sample_size, replace=False)

lengths = np.array([token_len(t) for t in sample_texts], dtype=np.int32)

p = [50, 75, 90, 95, 97, 98, 99, 99.5]
qs = np.percentile(lengths, p)

for perc, q in zip(p, qs):
    print(f"{perc:>5}%  -> {int(q)} tokens")

# Example choice:
max_len = int(np.percentile(lengths, 95)) 
print("Suggested max_len:", max_len)


Token indices sequence length is longer than the specified maximum sequence length for this model (1054 > 1024). Running this sequence through the model will result in indexing errors


   50%  -> 325 tokens
   75%  -> 430 tokens
   90%  -> 528 tokens
   95%  -> 591 tokens
   97%  -> 636 tokens
   98%  -> 671 tokens
   99%  -> 741 tokens
 99.5%  -> 811 tokens
Suggested max_len: 591


In [11]:
import torch
from torch.utils.data import Dataset
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

class CodeDiffMessageDataset(Dataset):
    def __init__(self, data, tokenizer,max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = 1024

        
    def __getitem__(self, index):
        self.encoded_text = None
        entry = self.data.iloc[index]
        code_diff_plus_message = format_input(entry)
        self.encoded_text = tokenizer.encode(code_diff_plus_message)
        return self.encoded_text

    def __len__(self):
        return len(self.data)

train_dataset = CodeDiffMessageDataset(train_df, tokenizer)



[50256]


In [13]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    device = "cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # New: Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index


        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [14]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]

batch = (
    inputs_1,
    inputs_2,
    inputs_3
)

print(custom_collate_fn(batch))

(tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]]), tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]]))


In [15]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Device:", device)

Device: cpu


In [16]:
from torch.utils.data import DataLoader


num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_dataset = CodeDiffMessageDataset(train_df, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

In [17]:
print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

Train loader:
torch.Size([8, 510]) torch.Size([8, 510])
torch.Size([8, 572]) torch.Size([8, 572])
torch.Size([8, 694]) torch.Size([8, 694])
torch.Size([8, 511]) torch.Size([8, 511])
torch.Size([8, 544]) torch.Size([8, 544])
torch.Size([8, 561]) torch.Size([8, 561])
torch.Size([8, 553]) torch.Size([8, 553])
torch.Size([8, 560]) torch.Size([8, 560])
torch.Size([8, 496]) torch.Size([8, 496])
torch.Size([8, 390]) torch.Size([8, 390])
torch.Size([8, 460]) torch.Size([8, 460])
torch.Size([8, 628]) torch.Size([8, 628])
torch.Size([8, 567]) torch.Size([8, 567])
torch.Size([8, 384]) torch.Size([8, 384])
torch.Size([8, 656]) torch.Size([8, 656])
torch.Size([8, 615]) torch.Size([8, 615])
torch.Size([8, 439]) torch.Size([8, 439])
torch.Size([8, 591]) torch.Size([8, 591])
torch.Size([8, 687]) torch.Size([8, 687])
torch.Size([8, 460]) torch.Size([8, 460])
torch.Size([8, 542]) torch.Size([8, 542])
torch.Size([8, 699]) torch.Size([8, 699])
torch.Size([8, 552]) torch.Size([8, 552])
torch.Size([8, 665])

In [18]:

print(inputs[5])

tensor([26069,  1377, 18300,   257,    14, 10677,    14, 19411,    14,  8610,
        44526,  5653, 11792,    13, 12355,   275,    14, 10677,    14, 19411,
           14,  8610, 44526,  5653, 11792,    13, 12355,   198,  9630,  1279,
           39, 11211,    29,   492,    27,    39, 11211,    29,  1802, 38172,
          198,  6329,   257,    14, 10677,    14, 19411,    14,  8610, 44526,
         5653, 11792,    13, 12355,   198, 45340,   275,    14, 10677,    14,
        19411,    14,  8610, 44526,  5653, 11792,    13, 12355,   198, 12404,
          532, 23188,    11,    22,  1343, 23188,    11,    22, 25248,  1171,
         1398,  6458, 44526,  5653, 11792, 14582, 21119, 10071, 37233,  1391,
          198,   220,   198,   220,   198,   220,   220,   220,   220,  3373,
        47649,  3299, 33252,  1398,   326, 17105,   262,  6459,   422,   262,
         4382,   198,    12,   220,   220,   220,  2839,  1398, 18628, 47649,
         3299, 33252, 23986, 48191, 33252,  1391,   198,    10, 

In [77]:

print(targets[5])

tensor([ 1377, 18300,   257,    14, 10677,    14, 19411,    14,  8610, 44526,
         5653, 11792,    13, 12355,   275,    14, 10677,    14, 19411,    14,
         8610, 44526,  5653, 11792,    13, 12355,   198,  9630,  1279,    39,
        11211,    29,   492,    27,    39, 11211,    29,  1802, 38172,   198,
         6329,   257,    14, 10677,    14, 19411,    14,  8610, 44526,  5653,
        11792,    13, 12355,   198, 45340,   275,    14, 10677,    14, 19411,
           14,  8610, 44526,  5653, 11792,    13, 12355,   198, 12404,   532,
        23188,    11,    22,  1343, 23188,    11,    22, 25248,  1171,  1398,
         6458, 44526,  5653, 11792, 14582, 21119, 10071, 37233,  1391,   198,
          220,   198,   220,   198,   220,   220,   220,   220,  3373, 47649,
         3299, 33252,  1398,   326, 17105,   262,  6459,   422,   262,  4382,
          198,    12,   220,   220,   220,  2839,  1398, 18628, 47649,  3299,
        33252, 23986, 48191, 33252,  1391,   198,    10, 15211, 

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "gpt2"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token 

model = AutoModelForCausalLM.from_pretrained(model_name)

model.config.pad_token_id = tokenizer.eos_token_id
model.config.use_cache = False  # important for training





  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(model.config)

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "dtype": "float32",
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.57.1",
  "use_cache": false,
  "vocab_size": 50257
}



In [1]:
import torch
import torch.nn as nn
import time
from transformers import pipeline

def train_one_epoch(model, dataloader, optimizer, device,freq_print):
    model.train()
    total_loss = 0
    
    for i, (x, y) in enumerate(dataloader):
        if True:
            print(f"steps:{i+1}/{len(dataloader)}")
            sequence = "The quick brown fox jumps over the lazy\nCommit Message:\n"

# Encode the sequence into tokens the model can understand
            inputs = torch.tensor(tokenizer.encode(sequence)).unsqueeze(0).to(device)  # Batch size 1

# Generate text using the model
            outputs = model.generate(inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)

# Decode the generated tokens back into human-readable text
            text = tokenizer.decode(outputs[0].tolist())

# Print the generated text
            print(text)
        #if i % 100 == 0 and i > 0:
        #    time.sleep(5)
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        torch.cuda.empty_cache()

        logits = model(x).logits                    # (B, T, vocab)
        loss = nn.CrossEntropyLoss()( 
            logits.view(-1, logits.size(-1)), 
            y.view(-1)
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)

            logits = model(x).logits
            loss = nn.CrossEntropyLoss()( 
                logits.view(-1, logits.size(-1)),
                y.view(-1)
            )
            total_loss += loss.item()

    return total_loss / len(dataloader)


def train(model, train_loader, optimizer, device):

    # -----------------------------
    # Training Loop
    # -----------------------------
    best_val_loss = float("inf")
    patience = 2            # stop after 2 bad epochs
    bad_epochs = 0
    epochs = 2
    train_losses, val_losses = [],[]
    for epoch in range(epochs):
        
        train_loss = train_one_epoch(model, train_loader, optimizer, device,freq_print=1)
        #val_loss = evaluate(model, val_loader, device)
        train_losses.append(train_loss)
        #val_losses.append(val_loss)
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss:   {val_loss:.4f}")
        torch.save(model.state_dict(), f"models/gpt2_epoch{epoch+1}.pt")
        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            bad_epochs = 0
            torch.save(model.state_dict(), "models/best_model.pt")
            print("New best model saved!")
        else:
            bad_epochs += 1
            print(f"No improvement (bad epochs: {bad_epochs})")

        if bad_epochs >= patience:
            print("EARLY STOPPING TRIGGERED.")
            break

    epochs_tensor = torch.linspace(0, epochs, len(train_losses))
    for i,(val_loss,train_loss)  in enumerate(zip(val_losses,train_losses)):
        print(f"Epoch {i+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")
   
   
   
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)   
train(model, train_loader, optimizer, device)


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'model' is not defined