In [1]:
!pip install evaluate rouge_score accelerate -U # accelerate -U is needed for seq2seq training

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downl

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# from datasets import Dataset

In [2]:
texts = [
    {
        # https://github.com/pydantic/pydantic/pull/9111
        "code": """
Pull Request Diff:
```
diff --git a/pydantic/_internal/_fields.py b/pydantic/_internal/_fields.py
index 3f78a920d5..6e5e933061 100644
--- a/pydantic/_internal/_fields.py
+++ b/pydantic/_internal/_fields.py
@@ -177,7 +177,7 @@ def collect_model_fields(  # noqa: C901
             )

         # when building a generic model with `MyModel[int]`, the generic_origin check makes sure we don't get
-        # "... shadows an attribute" errors
+        # "... shadows an attribute" warnings
         generic_origin = getattr(cls, '__pydantic_generic_metadata__', {}).get('origin')
         for base in bases:
             dataclass_fields = {
#  Do not warn about shadowed fields if they are not redefined in a child class
```
""",
        "description": """
        # Change Summary

        Adds another early exit condition when evaluating whether to log a warning message during detection for shadowed fields.

In the case where a field is defined in a parent class, but it has not been defined at all in a child class, it is technically not a shadowed field, and so shouldn't be warned as such.

Note this is very different from the case where a child class does redefine a field but with a narrower type or even defined as the same type but with a different default. Conceptually this is probably ok, but checking for that is quite complex and this PR does not attempt to try. So this is about checking if a field is defined or not defined - if it is, regardless of type or default value, the warning message is still logged.
        """,
    },
    {
        # https://github.com/pydantic/pydantic/pull/9144
        "code": """
        --- a/pydantic/main.py
+++ b/pydantic/main.py
@@ -222,9 +222,12 @@ def model_construct(cls: type[Model], _fields_set: set[str] | None = None, **val
         fields_set = set()

         for name, field in cls.model_fields.items():
-            if field.alias and field.alias in values:
+            if field.alias is not None and field.alias in values:
                 fields_values[name] = values.pop(field.alias)
                 fields_set.add(name)
+            elif field.validation_alias is not None and field.validation_alias in values:
+                fields_values[name] = values.pop(field.validation_alias)
+                fields_set.add(name)
             elif name in values:
                 fields_values[name] = values.pop(name)
                 fields_set.add(name)
        """,
        "description": """
        # Change Summary

        just like you can construct a model using a field alias, this PR fixes constructing a model using validation_alias.
        """,
    },
    {
        "code": """
        diff --git a/pydantic/json_schema.py b/pydantic/json_schema.py
index 9f0ceb3e36..3e63ecc08d 100644
--- a/pydantic/json_schema.py
+++ b/pydantic/json_schema.py
@@ -751,6 +751,8 @@ def literal_schema(self, schema: core_schema.LiteralSchema) -> JsonSchemaValue:
             result['type'] = 'boolean'
         elif types == {list}:
             result['type'] = 'array'
+        elif types == {type(None)}:
+            result['type'] = 'null'
         return result

     def enum_schema(self, schema: core_schema.EnumSchema) -> JsonSchemaValue:
        """,
        "description": """
        # Change Summary

        This PR aims to complete #8944 and #8905 by also handling null types when generating a json-schema from a pydantic model.

For instance, the following model:
```python
from pydantic import BaseModel
from typing import Literal


class Foo(BaseModel):
    bar: Literal["Bar"] = 'Bar'
    baz: Literal[None] = None
    foo: str = 'Foo'
```
leads to:
```
{'properties': {'bar': {'const': 'Bar', 'default': 'Bar', 'enum': ['Bar'], 'title': 'Bar', 'type': 'string'}, 'baz': {'const': None, 'default': None, 'enum': [None], 'title': 'Baz', 'type': 'null'}, 'foo': {'default': 'Foo', 'title': 'Foo', 'type': 'string'}}, 'title': 'Foo', 'type': 'object'}
```
        """,
    },
]

In [42]:
class CodeDescriptionDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

        # Prepare the dataset
        self.inputs = []
        self.targets = []

        for entry in texts:
            # Tokenize input and target texts
            input_encodings = tokenizer(entry["code"], truncation=True, max_length=max_length, padding="max_length")
            target_encodings = tokenizer(
                entry["description"], truncation=True, max_length=max_length, padding="max_length"
            )

            self.inputs.append(torch.tensor(input_encodings["input_ids"]))
            self.targets.append(torch.tensor(target_encodings["input_ids"]))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        input_ids = self.inputs[idx]
        target_ids = self.targets[idx]

        # We can create a mask for the targets with -100 (ignored by loss functions in HuggingFace) where the input is padding
        target_mask = (target_ids != self.tokenizer.pad_token_id).long()

        return {
            "input_ids": input_ids,
            "labels": target_ids * target_mask
            + (tokenizer.pad_token_id * (1 - target_mask)),  # Mask out pad tokens in labels
        }  # 50257 -- [PAD] token

In [43]:
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

In [44]:
# tokenizer("[PAD]")["input_ids"][0] # 50257

In [45]:
# dataset = CodeDescriptionDataset(texts, tokenizer)
# train_dataset, val_dataset = random_split(dataset, [2, 2])

In [46]:
train_dataset = CodeDescriptionDataset(texts, tokenizer)
eval_dataset = CodeDescriptionDataset(texts, tokenizer)

In [60]:
import evaluate

metrics_name = ["bleu", "rouge", "exact_match"]
for name in metrics_name:
    evaluate.load(name)
metrics = evaluate.combine(metrics_name)
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
exact_match = evaluate.load("exact_match")


def compute_metrics(eval_pred):
    predictions, label_ids = eval_pred.predictions, eval_pred.label_ids
    predictions = np.argmax(predictions, axis=-1)

    decoded_preds = [
        tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=True) for pred in predictions
    ]
    decoded_labels = [
        tokenizer.decode(label, skip_special_tokens=True, clean_up_tokenization_spaces=True) for label in label_ids
    ]

    # Prepare data for BLEU
    # formatted_predictions = [pred.strip() for pred in decoded_preds]
    # formatted_references = [ref.strip() for ref in decoded_labels]  # BLEU expects list of list of references for each prediction

    return metrics.compute(predictions=decoded_preds, references=decoded_labels)

In [61]:
training_args = TrainingArguments(
    output_dir="./results",  # output directory
    num_train_epochs=5,  # total number of training epochs
    # warmup_steps=500,                # number of warmup steps for learning rate scheduler
    # weight_decay=0.01,               # strength of weight decay
    # logging_dir='./logs',            # directory for storing logs
    logging_steps=1,
    evaluation_strategy="epoch",  # perform evaluation each epoch
    # eval_steps=2000,
    # # per_device_train_batch_size=8,
    # # per_device_eval_batch_size=64,
    # auto_find_batch_size=True,
    # save_strategy="epoch",
    # lr_scheduler_type=SchedulerType.COSINE_WITH_RESTARTS,
    # predict_with_generate=True,
    fp16=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [62]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length,Rouge1,Rouge2,Rougel,Rougelsum,Exact Match
1,27.2743,60.178959,0.0,"[0.25925925925925924, 0.07692307692307693, 0.0, 0.0]",0.037945,0.234104,81,346,0.042693,0.0,0.023645,0.042693,0.0
2,28.9275,60.178959,0.0,"[0.25925925925925924, 0.07692307692307693, 0.0, 0.0]",0.037945,0.234104,81,346,0.042693,0.0,0.023645,0.042693,0.0
3,28.128,60.178959,0.0,"[0.25925925925925924, 0.07692307692307693, 0.0, 0.0]",0.037945,0.234104,81,346,0.042693,0.0,0.023645,0.042693,0.0
4,26.431,60.178959,0.0,"[0.25925925925925924, 0.07692307692307693, 0.0, 0.0]",0.037945,0.234104,81,346,0.042693,0.0,0.023645,0.042693,0.0
5,27.5333,60.178959,0.0,"[0.25925925925925924, 0.07692307692307693, 0.0, 0.0]",0.037945,0.234104,81,346,0.042693,0.0,0.023645,0.042693,0.0


Trainer is attempting to log a value of "[0.25925925925925924, 0.07692307692307693, 0.0, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.25925925925925924, 0.07692307692307693, 0.0, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.25925925925925924, 0.07692307692307693, 0.0, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.25925925925925924, 0.07692307692307693, 0.0, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this

TrainOutput(global_step=5, training_loss=27.658814239501954, metrics={'train_runtime': 5.4833, 'train_samples_per_second': 2.736, 'train_steps_per_second': 0.912, 'total_flos': 3919380480000.0, 'train_loss': 27.658814239501954, 'epoch': 5.0})