Install the correct dependencies on HuggingFace transformer and ternsorflow

In [59]:
# # We won't need TensorFlow here
# !pip uninstall -y tensorflow
# # Install `transformers` from master
# !pip install git+https://github.com/huggingface/transformers
# !pip list | grep -E 'transformers|tokenizers'
# !pip install nlp==0.2.0
# !pip install datasets
# !pip install git+https://github.com/huggingface/nlp
# 
# # transformers version at notebook update --- 2.11.0
# # tokenizers version at notebook update --- 0.8.0rc1

Fetch datasets

In [60]:
import os
import tokenize
import dis
import sys
import re
import keyword
import pandas as pd
import ast
import torch
import signal
from functools import wraps

def multireplace(string, replacements, ignore_case=False):
    """
    Given a string and a replacement map, it returns the replaced string.
    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :param bool ignore_case: whether the match should be case insensitive
    :rtype: str
    """
    # If case insensitive, we need to normalize the old string so that later a replacement
    # can be found. For instance with {"HEY": "lol"} we should match and find a replacement for "hey",
    # "HEY", "hEy", etc.
    if ignore_case:
        def normalize_old(s):
            return s.lower()
        re_mode = re.IGNORECASE
    else:
        def normalize_old(s):
            return s
        re_mode = 0

    replacements = {normalize_old(key): val for key, val in replacements.items()}
    
    # Place longer ones first to keep shorter substrings from matching where the longer ones should take place
    # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce
    # 'hey ABC' and not 'hey ABc'
    rep_sorted = sorted(replacements, key=len, reverse=True)
    rep_escaped = map(re.escape, rep_sorted)
    
    # Create a big OR regex that matches any of the substrings to replace
    pattern = re.compile("|".join(rep_escaped), re_mode)
    
    # For each match, look up the new string in the replacements, being the key the normalized old string
    return pattern.sub(lambda match: replacements[normalize_old(match.group(0))], string)


def convert(file, output_file):
    with open (file, "r") as f:
        text = f.read()  

    replacements = {}
    for node in ast.iter_child_nodes(ast.parse(text)):
        if isinstance(node, ast.ImportFrom):
            replacements.update({node.module: 'MODULE'})
        if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):
            for i, v in enumerate(node.names):
                if(node.names[i].asname):
                    replacements.update({node.names[i].name: 'LIB'})                
                    replacements.update({node.names[i].asname: 'ALIAS'})
                else:
                    replacements.update({node.names[i].name: 'LIBRARY'})


    # reomve * from the dictionary (handle from module import * statement)
    replacements.pop('*', None)
    print('List of modules and libraries to replace:\n', replacements)

    with open('med.py','w') as f:
        f.write(multireplace(text, replacements, ignore_case = True))

    file = 'med.py'
    with open(file,'rb') as f:
        tokens = list(tokenize.tokenize(f.readline))
        
    ### extract important data from the output of tokenize package
    toks = pd.DataFrame(columns = ['original','type','text', 'line','pos'])

    last_line = 0
    last_pos = 0

    for token in tokens:
        
        tok_org = token.string
        tok_text = token.string    
        tok_type = str(token).split('(')[2].split(')')[0]

        # convert keywords to upper
        if keyword.iskeyword(tok_text):
            tok_type = str.upper(tok_text)
        
        #extract operations
        # if tok_type == 'OP':
        #     tok_type = tok_text


        # getting rid of comments and empty lines
        if tok_type in ['NL','NEWLINE','COMMENT']:
            continue
        
        #retrieve the position
        tok_line = token.start[0]
        
        if last_line == tok_line:
            last_pos +=  1
        else:
            last_pos = 1
        tok_pos = last_pos
        last_line = tok_line
        
        new_row = pd.DataFrame([{'type':tok_type,
                         'original':tok_org,
                         'text':tok_text,
                         'line':tok_line,
                         'pos':tok_pos}])
        
        toks = pd.concat([toks, new_row], ignore_index=True)



    # remove encoding lines and end of file
    toks.line = toks.line.astype('int')
    toks.pos = toks.pos.astype('int')
    toks = toks.loc[~((toks.type == 'ENCODING') | (toks.type == 'ENDMARKER'))]
    toks['doc'] = (toks.text.str.contains('"""') | toks.text.str.contains("'''"))
    toks = toks.loc[~(toks.doc)].drop(['doc'],axis=1)

    toks.head(20)

    indent = 0
    last_line = 0

    for index,row in toks.iterrows():
        if row.type == "INDENT":
            indent +=1
            continue
        if row.type == "DEDENT":
            indent -=1
            continue
        if row.line != last_line:
            last_line = row.line            
            new_row = pd.DataFrame([{'type':'\n'+indent*'\t',
                                'text':'\n'+indent*'\t',
                                'line':row.line,
                                'pos':row.pos-1}])
            toks = pd.concat([toks, new_row], ignore_index=True)


    toks = toks.loc[~((toks.type=='INDENT') | (toks.type=='DEDENT'))]
    toks = toks.sort_values(['line','pos']).reset_index(drop=True)


    # drop the first row (empty line)
    toks.drop(toks.index[:1], inplace=True)

    toks.head(20)

    with open(file,'r') as f:
        src = f.read()

    stdout_backup = sys.stdout
    sys.stdout = open('dis.txt','w')
    dis.dis(src)
    sys.stdout = stdout_backup

    with open('dis.txt','r') as f:
        lines = f.readlines()

    # find global variables
    glbls = [].copy()    
    for l in lines:
        clean = l.replace('>>',' ').strip().split()
        if len(clean):
            try:
                int(clean[1])
                line = int(clean[0])
            except:
                clean = [str(line)]+clean
            if 'LOAD_GLOBAL' in clean:
                print('found a global!')
                glbls.append((int(clean[0]),clean[-1].replace('(','').replace(')','')))

    for l,n in glbls:
        toks.loc[(toks.line==l) & (toks.text==n),'type'] = 'GLOBAL_VARIABLE'

    toks .head(10) 

    text_imports = ' '.join(list(toks.text)).replace('\n ','\n').replace(' \n','\n').replace('\t ','\t').replace(' . ','.').replace(' (','(')
    text_imports = multireplace(text_imports, replacements, ignore_case = True)

    with open('normalized_textual_file.py','w') as f:
        f.write(text_imports)

    toks.type = toks.apply(lambda x: x['text'] if str(x['text']) in ['LIBRARY','LIB','ALIAS','MODULE'] else x['type'], axis = 1)
    code_converted = ' '.join(list(toks.type)).replace('\n ','\n').replace(' \n','\n').replace('\t ','\t').replace(' . ','.').replace(' (','(')

    final_replacements = {'GLOBAL_VARIABLE(':'FUNCTION_CALL(',                      
    #                       'NAME.NAME':'NAME',
                          'NAME(':'FUNCTION_CALL(',
                          'NAME':'LOCAL_VARIABLE'}

    code_converted = multireplace(code_converted, final_replacements, ignore_case = False)

    with open(output_file,'w') as f:
        f.write(code_converted)


WEIGHT_MATRIX = {
        'NUMBER' : [1.625, 1.25, 1.125],
        'NAME' : [1.625, 1.125, 1.5],
        'LOCAL_VARIABLE' : [1.625, 1.125, 1.5],
        'FUNCTION_NAME' : [1.625, 1.25, 1.5]
    }


input_file = "/tmp/input_file.txt"
output_file = "/tmp/output_file.txt"


def reranking_layer(outputs, context, tokenizer):
  with open(input_file, 'w') as f:
    f.write(context);
  
  convert(file_path=input_file, output_file=output_file)
  with open(output_file, 'rb') as context:
    inputs = list(zip(tokenizer(input_file), tokenizer(output_file)))
    for item in inputs:
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(WEIGHT_MATRIX[item[1]]))


In [61]:
convert("../dataset/sample_data/data/peakfinder.py", 
        "../dataset/sample_data/data/converted_train.txt")

List of modules and libraries to replace:
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!


In [62]:
# pretrain dataset
#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/pretrain_dataset.zip
#!unzip 'pretrain_dataset.zip'

# converted dataset
#! wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/converted_dataset.zip
#! unzip 'converted_dataset.zip'

# test dataset
#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/finetune_eval_dataset.zip
#!unzip 'finetune_eval_dataset.zip'

Train a customised python byte-level Byte-pair encoding tokenizer. 

In [63]:
from pathlib import Path
from transformers import AutoTokenizer,TextDataset,DataCollatorForLanguageModeling
import glob
import random 

# tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("../local_gpt2_tokenizer/", local_files_only=True)

In [64]:
# ../dataset/sample_data/data/peakfinder.py

paths = [str(x) for x in Path(".").glob("../dataset/sample_data/data/*.py")]
converted_paths = []
for path in paths:
  # converted_path = "../dataset/sample_data/converted/"+ path.split("/").pop().split(".")[0] + ".txt"
  converted_path = "../dataset/sample_data/converted/"+ Path(path).stem + ".txt"
  print(converted_path)
  try:
    convert(path, converted_path)
    converted_paths.append(converted_path)
  except:
    pass

with open("./train.txt", "wb") as train_outfile:
  with open("./test.txt", "wb") as test_outfile:
    print("Here is train.txt && test.txt!")
    for f in paths:
        choice = random.random()
        with open(f, "rb") as infile:
            if choice > 0.1:
              train_outfile.write(infile.read())
            else:
              test_outfile.write(infile.read())

with open("./converted_train.txt", "wb") as train_outfile:
  with open("./converted_test.txt", "wb") as test_outfile:
    print("Here is converted_train.txt && converted_test.txt!")
    for f in converted_paths:
        choice = random.random()
        with open(f, "rb") as infile:
            if choice > 0.1:
              train_outfile.write(infile.read())
            else:
              test_outfile.write(infile.read())


../dataset/sample_data/converted/peakfinder.txt
List of modules and libraries to replace:
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
found a global!
Here is train.txt && test.txt!
Here is converted_train.txt && converted_test.txt!


In [65]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=32) # 128->32
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset("./train.txt", "./test.txt", tokenizer)

with open("./train.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
    print("First 5 lines of train.txt:")
    print(lines[:5])
print(f"Train dataset size: {len(train_dataset)}")

converted_train_dataset, converted_test_dataset, converted_datacollator = load_dataset("./converted_train.txt", "./converted_test.txt", tokenizer)

#pretrain_raw_files = glob.glob("./pretrain_dataset" + '/**/*.py', recursive=True)
#pretrain_converted_files = glob.glob("./pretrain_converted_dataset" + '/**/*.py', recursive=True)

First 5 lines of train.txt:
['# Licensed under a 3-clause BSD style license - see LICENSE.rst\n', '"""\n', 'This module provides tools for finding local peaks in an astronomical\n', 'image.\n', '"""\n']
Train dataset size: 116


In [66]:
tokenizer("for i in range(10)")["input_ids"]

[1640, 1312, 287, 2837, 7, 940, 8]

In [67]:
import numpy as np
import torch
import torch.nn as nn
import transformers
# import nlp
import logging
# from datasets import load_dataset
from transformers import TextDataset,DataCollatorForLanguageModeling


logging.basicConfig(level=logging.INFO)

dataset_dict = {
    "token": train_dataset,
    "token_type": train_dataset,
    "line": train_dataset,
}

print(dataset_dict["token"])


<transformers.data.datasets.language_modeling.TextDataset object at 0x000002C2513E4280>


In [68]:
from transformers.utils.dummy_pt_objects import GPT2LMHeadModel
from transformers import GPT2Tokenizer
from transformers import GPT2Config, EncoderDecoderConfig, EncoderDecoderModel


class MultitaskModel(transformers.PreTrainedModel):
    def __init__(self, encoder, taskmodels_dict):
        """
        Setting MultitaskModel up as a PretrainedModel allows us
        to take better advantage of Trainer features
        """
        super().__init__(transformers.PretrainedConfig())

        self.encoder = encoder
        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)

    def _get_models(self):
      return self.taskmodels_dict

    @classmethod
    def create(cls, model_name, model_type_dict, model_config_dict):
        """
        This creates a MultitaskModel using the model class and config objects
        from single-task models. 

        We do this by creating each single-task model, and having them share
        the same encoder transformer.
        """
        shared_encoder = None
        taskmodels_dict = {}
        for task_name, model_type in model_type_dict.items():
            # model = model_type.from_pretrained( "gpt2",
            #     config=model_config_dict[task_name],
            # )
            model = model_type.from_pretrained(
                "../local_gpt2_tokenizer",
                config=model_config_dict[task_name],
            )
            if shared_encoder is None:
                shared_encoder = cls.get_encoder(model)
            else:
                setattr(model, "encoder", shared_encoder)
            taskmodels_dict[task_name] = model
        return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)
    

    @classmethod
    def get_encoder(cls, model):
        """
        The encoder transformer is named differently in each model "architecture".
        This method lets us get the name of the encoder attribute
        """
        model_class_name = model.__class__.__name__
        if model_class_name.startswith("Roberta"):
            return "roberta-base"
        elif model_class_name.startswith("GPT2"):
            config = EncoderDecoderConfig.from_encoder_decoder_configs(model.config, model.config) 
            encoder_decoder = EncoderDecoderModel(config=config)
            return encoder_decoder.config.encoder
        else:
            raise KeyError(f"Add support for new model {model_class_name}")
    
    def forward(self, task_name, **kwargs):
        return self.taskmodels_dict[task_name](**kwargs)

In [69]:
# model_name = "gpt2"
# multitask_model = MultitaskModel.create(
#     model_name=model_name,
#     model_type_dict={
#         "token": transformers.AutoModelWithLMHead,
#         "token_type": transformers.AutoModelWithLMHead,
#         "line": transformers.AutoModelForSequenceClassification,
#     },
#     model_config_dict={
#         "token": transformers.AutoConfig.from_pretrained(model_name),
#         "token_type": transformers.AutoConfig.from_pretrained(model_name),
#         "line": transformers.AutoConfig.from_pretrained(model_name),
#     },
# )

In [70]:
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoConfig

# 设置本地模型路径
local_model_path = "../local_gpt2_tokenizer"

multitask_model = MultitaskModel.create(
    model_name=local_model_path,  # 使用本地路径代替预训练模型名称
    model_type_dict={
        "token": AutoModelForCausalLM,  # 用于生成任务（如GPT-2）
        "token_type": AutoModelForCausalLM,  # 用于token类型的生成任务
        "line": AutoModelForSequenceClassification,  # 用于分类任务
    },
    model_config_dict={
        "token": AutoConfig.from_pretrained(local_model_path),  # 从本地路径加载配置
        "token_type": AutoConfig.from_pretrained(local_model_path),  # 从本地路径加载配置
        "line": AutoConfig.from_pretrained(local_model_path),  # 从本地路径加载配置
    },
)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ../local_gpt2_tokenizer and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
# Check that we have a GPU
!nvidia-smi
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

Thu Dec 19 18:28:35 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.76                 Driver Version: 551.76         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   54C    P8             18W /  120W |    5893MiB /   6144MiB |      9%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

True

In [72]:
import dataclasses
from torch.utils.data.dataloader import DataLoader
from transformers.data.data_collator import DataCollatorForLanguageModeling, InputDataClass, DefaultDataCollator
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from typing import List, Union, Dict
from transformers import Trainer
from random import random


class NLPDataCollator(DataCollatorForLanguageModeling):
    """
    Extending the existing DataCollator to work with NLP dataset batches
    """
    def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:
        first = features[0]
        if isinstance(first, dict):
          # NLP data sets current works presents features as lists of dictionary
          # (one per example), so we  will adapt the collate_batch logic for that
          if "labels" in first and first["labels"] is not None:
              if first["labels"].dtype == torch.int64:
                  labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)
              else:
                  labels = torch.tensor([f["labels"] for f in features], dtype=torch.float)
              batch = {"labels": labels}
          for k, v in first.items():
              if k != "labels" and v is not None and not isinstance(v, str):
                  batch[k] = torch.stack([f[k] for f in features])
          return batch
        else:
          # otherwise, revert to using the default collate_batch
          return DefaultDataCollator().collate_batch(features)


class StrIgnoreDevice(str):
    """
    This is a hack. The Trainer is going call .to(device) on every input
    value, but we need to pass in an additional `task_name` string.
    This prevents it from throwing an error
    """
    def to(self, device):
        return self

class DataLoaderWithTaskname:
    """
    Wrapper around a DataLoader to also yield a task name
    """
    def __init__(self, task_name, data_loader):
        self.task_name = task_name
        self.data_loader = data_loader

        self.batch_size = data_loader.batch_size
        self.dataset = data_loader.dataset

    def __len__(self):
        return len(self.data_loader)
    
    def __iter__(self):
        for batch in self.data_loader:
            batch["task_name"] = StrIgnoreDevice(self.task_name)
            yield batch


class MultitaskDataloader:
    """
    Data loader that combines and samples from multiple single-task
    data loaders.
    """
    def __init__(self, dataloader_dict):
        self.dataloader_dict = dataloader_dict
        self.num_batches_dict = {
            task_name: len(dataloader) 
            for task_name, dataloader in self.dataloader_dict.items()
        }
        self.task_name_list = list(self.dataloader_dict)
        self.dataset = [None] * sum(
            len(dataloader.dataset) 
            for dataloader in self.dataloader_dict.values()
        )

    def __len__(self):
        return sum(self.num_batches_dict.values())

    def __iter__(self):
        """
        For each batch, sample a task, and yield a batch from the respective
        task Dataloader.

        We use size-proportional sampling, but you could easily modify this
        to sample from some-other distribution.
        """
        task_choice_list = []
        for i, task_name in enumerate(self.task_name_list):
            task_choice_list += [i] * self.num_batches_dict[task_name]
        task_choice_list = np.array(task_choice_list)
        np.random.shuffle(task_choice_list)
        dataloader_iter_dict = {
            task_name: iter(dataloader) 
            for task_name, dataloader in self.dataloader_dict.items()
        }
        for task_choice in task_choice_list:
            task_name = self.task_name_list[task_choice]
            yield next(dataloader_iter_dict[task_name]) 

class MultitaskTrainer(transformers.Trainer):

    def get_single_train_dataloader(self, task_name, train_dataset):
        """
        Create a single-task data loader that also yields task names
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        
        train_sampler = (
            RandomSampler(train_dataset)
            if self.args.local_rank == -1
            else DistributedSampler(train_dataset)
        )

        data_loader = DataLoaderWithTaskname(
            task_name=task_name,
            data_loader=DataLoader(
              train_dataset,
              batch_size=self.args.train_batch_size,
              sampler=train_sampler
            ),
        )

        return data_loader

    def get_train_dataloader(self):
        """
        Returns a MultitaskDataloader, which is not actually a Dataloader
        but an iterable that returns a generator that samples from each 
        task Dataloader
        """
        return MultitaskDataloader({
            task_name: self.get_single_train_dataloader(task_name, task_dataset)
            for task_name, task_dataset in self.train_dataset.items()
        })
    
    def train(self):
      # config = transformers.AutoConfig.from_pretrained("gpt2")
      # model = transformers.AutoModelWithLMHead.from_pretrained("gpt2", config=config)
      config = transformers.AutoConfig.from_pretrained("../local_gpt2_tokenizer")
      model = transformers.AutoModelWithLMHead.from_pretrained("../local_gpt2_tokenizer", config=config)
      trainer = Trainer(
        model=model,
        args=transformers.TrainingArguments(
          output_dir="./models/multitask_model",
          overwrite_output_dir=True,
          learning_rate=1e-5,
          do_train=True,
          num_train_epochs=100,
          # Adjust batch size if this doesn't fit on the Colab GPU
          per_device_train_batch_size=8,  
          per_device_eval_batch_size=8,
          save_steps=3000,
        ),
        data_collator=data_collator,
        train_dataset=train_dataset,
      )
      trainer.train()

    def compute_loss(self, model, inputs, return_outputs=True):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        reranking_layer(outputs, inputs._get_value(), tokenizer=tokenizer) #input value is tensor
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0]))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [73]:
trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        output_dir="./models/multitask_model",
        overwrite_output_dir=True,
        learning_rate=1e-5,
        do_train=True,
        num_train_epochs=1, #100
        # Adjust batch size if this doesn't fit on the Colab GPU
        per_device_train_batch_size=8,  
        # per_device_eval_batch_size=8,
        save_steps=3000,
    ),
    data_collator=data_collator,
)



In [74]:
# trainer.train()

In [77]:
# preds_dict = {}
# for task_name in ["token", "token_type", "line"]:
#   eval_dataloader = DataLoaderWithTaskname(
#       task_name,
#       trainer.get_eval_dataloader(eval_dataset=dataset_dict[task_name])
#   )
# 
#   print(f"Eval DataLoader batch_size: {eval_dataloader.batch_size}")
#   print(eval_dataloader.data_loader.collate_fn)
# 
# 
#   preds_dict[task_name] = trainer.prediction_loop(
#       eval_dataloader, 
#       description=f"Validation: {task_name}",
#   )
# 
# 
# print(preds_dict)

Eval DataLoader batch_size: None
<transformers.trainer_utils.RemoveColumnsCollator object at 0x000002C20309A490>


ValueError: Batch size cannot be None. Ensure the dataloader has a valid batch_size or total_batch_size.

In [83]:
from torch.utils.data import DataLoader

# 自定义 DataLoaderWithTaskname 的封装逻辑
class DataLoaderWithTaskname:
    def __init__(self, task_name, data_loader):
        """
        包装一个 DataLoader，同时为每个批次添加任务名称
        """
        self.task_name = task_name
        self.data_loader = data_loader

    def __len__(self):
        return len(self.data_loader)

    def __iter__(self):
        for batch in self.data_loader:
            if not isinstance(batch, dict):
                raise ValueError(f"Batch must be a dictionary, but got {type(batch)}.")
            # 直接添加字符串而非张量，以避免哈希冲突
            batch["task_name"] = self.task_name
            yield batch

    @property
    def batch_size(self):
        """
        从内部的 data_loader 中获取 batch_size
        """
        return self.data_loader.batch_size

def adjust_tensor_shape(tensor, target_shape):
    """
    调整 tensor 的形状以匹配目标形状，必要时进行截断或填充
    """
    current_shape = tensor.shape
    if current_shape == target_shape:
        return tensor
    elif len(current_shape) == len(target_shape):
        slices = tuple(slice(0, min(c, t)) for c, t in zip(current_shape, target_shape))
        padded = torch.zeros(target_shape, dtype=tensor.dtype, device=tensor.device)
        padded[:current_shape[0], :current_shape[1], :current_shape[2]] = tensor[slices]
        return padded
    else:
        raise ValueError(f"Cannot adjust tensor from shape {current_shape} to {target_shape}")

preds_dict = {}

# 遍历任务列表，生成评估 DataLoader 并预测
for task_name in ["token", "token_type", "line"]:
    # 获取 Trainer 的 DataLoader
    trainer_dataloader = trainer.get_eval_dataloader(eval_dataset=dataset_dict[task_name])

    # 手动检查并设置 batch_size（如果为 None）
    if trainer_dataloader.batch_size is None:
        trainer_dataloader = DataLoader(
            dataset=trainer_dataloader.dataset,
            batch_size=1,  # 修改为 batch_size=1，避免因无 pad_token 引发错误
            shuffle=False,
            collate_fn=trainer_dataloader.collate_fn  # 保留原来的 collate_fn
        )

    # 使用自定义包装类 DataLoaderWithTaskname
    eval_dataloader = DataLoaderWithTaskname(task_name, trainer_dataloader)

    # 打印数据加载器信息（用于调试）
    print(f"Eval DataLoader for task '{task_name}' created with batch_size={eval_dataloader.batch_size}")
    print(f"Collate function: {eval_dataloader.data_loader.collate_fn}")

    # 使用封装后的 DataLoader 进行预测
    preds_dict[task_name] = trainer.prediction_loop(
        eval_dataloader,
        description=f"Validation: {task_name}",
    )

    # 调整预测结果的形状
    for key, value in preds_dict[task_name].items():
        if isinstance(value, torch.Tensor):
            preds_dict[task_name][key] = adjust_tensor_shape(value, (32, 32, 50257))

# 打印预测结果
print(preds_dict)


Not all data has been set. Are you sure you passed all values?


AttributeError: 'EvalLoopOutput' object has no attribute 'items'

In [None]:
from sklearn.metrics import accuracy_score, label_ranking_average_precision_score

accuracy_dict = {}
mrr_dict = {}

for task_name in ["token", "token_type", "line"]:
  accuracy_dict[task_name] = accuracy_score(preds_dict[task_name].predictions.flatten(),
    preds_dict[task_name].label_ids)
  
  mrr_dict[task_name] = label_ranking_average_precision_score(preds_dict[task_name].predictions.flatten(),
    preds_dict[task_name].label_ids)
  