In [1]:
from abc import ABC
from pathlib import Path

import pandas as pd
from codetf.models import load_model_pipeline
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from sacrebleu import corpus_bleu, corpus_chrf, corpus_ter
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelWithLMHead, SummarizationPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
root_dir = Path.cwd()

# Create Predictions
Im folgenden wird mit allen Models für den Testdatensatz die Predictions generiert.

In [8]:
class AbstractModel(ABC):
    def predict(self, code: str) -> str:
        raise NotImplementedError()
    
    def model_name(self) -> str:
        raise NotImplementedError()


class CodeTFModel(AbstractModel):
    def __init__(self, model_name: str, model_type: str, task: str) -> None:
        super().__init__()

        self._model = load_model_pipeline(model_name=model_name, model_type=model_type, task=task)
        self._model_name = model_name
        self._model_type = model_type
        self._task = task

    def predict(self, code: str) -> str:
        return self._model.predict([code])[0]
    
    def model_name(self) -> str:
        return f"{self._model_name}-{self._model_type}-{self._task}"


class SebisModel(AbstractModel):
    def __init__(self, model_name: str) -> None:
        super().__init__()

        self._pipeline = SummarizationPipeline(
            model=AutoModelWithLMHead.from_pretrained(model_name),
            tokenizer=AutoTokenizer.from_pretrained(model_name, skip_special_tokens=True),
            device=0
        )
        self._model_name = model_name

    def predict(self, code: str) -> str:
        return self._pipeline([code])[0]["summary_text"]
    
    def model_name(self) -> str:
        return self._model_name.replace("/", "-")


class CodeT5PModel(AbstractModel):
    def __init__(self) -> None:
        super().__init__()

        self._pipeline = SummarizationPipeline(
            model=AutoModelWithLMHead.from_pretrained(root_dir.parent / "modeling" / "models" / "codet5p_220m"),
            tokenizer=AutoTokenizer.from_pretrained("Salesforce/codet5p-220m"),
            device=0
        )
        self._model_name = "codet5p_220m"

    def predict(self, code: str) -> str:
        return self._pipeline([code])[0]["summary_text"]
    
    def model_name(self) -> str:
        return self._model_name.replace("/", "-")

In [4]:
def get_preds(df: pd.DataFrame, model: AbstractModel):
    file_path = root_dir / "data" / "preds" / f"{model.model_name()}.csv"
    
    if file_path.exists():
        return

    df = df.copy()
    df["pred"] = df["code"].map(model.predict)
    df[["ref", "pred"]].to_csv(file_path)

In [5]:
dataset = load_dataset("json", data_files={
    "test": str(root_dir.parent / "data" / "test.jsonl"),
}, cache_dir=root_dir.parent / "data" / "cache")

Found cached dataset json (/mnt/batch/tasks/shared/LS_root/mounts/clusters/mdl/code/Users/Paul.Brauckmann/mdl-ii/src/data/cache/json/default-55e4ec2f4ae22eff/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00,  1.07it/s]


In [6]:
def inference(doc):
    doc["ref"] = [" ".join(docstring) for docstring in doc["docstring_tokens"]]
    return doc    

dataset = dataset.map(inference, batched=True)
dataset.set_format(type="pandas", columns=["ref", "code"])
df = dataset["test"][:]
df.head()

Loading cached processed dataset at /mnt/batch/tasks/shared/LS_root/mounts/clusters/mdl/code/Users/Paul.Brauckmann/mdl-ii/src/data/cache/json/default-55e4ec2f4ae22eff/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-fe04db097f4167f6.arrow


Unnamed: 0,code,ref
0,def sina_xml_to_url_list(xml_data):\n rawur...,str - > list Convert XML to URL List . From Bi...
1,"def dailymotion_download(url, output_dir='.', ...",Downloads Dailymotion videos by URL .
2,"def sina_download(url, output_dir='.', merge=T...",Downloads Sina videos by URL .
3,"def sprint(text, *colors):\n return ""\33[{}...",Format text with color or other effects into A...
4,"def print_log(text, *colors):\n sys.stderr....",Print a log message to standard error .


## CodeT5 Base

In [7]:
get_preds(df, CodeTFModel(model_name="codet5", model_type="base-multi-sum", task="pretrained"))

In [8]:
get_preds(df, CodeTFModel(model_name="codet5", model_type="base", task="sum_python"))

## T5 Small

In [9]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_small_code_documentation_generation_python"))



In [10]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_small_code_documentation_generation_python_transfer_learning_finetune"))

Downloading (…)lve/main/config.json: 100%|██████████| 641/641 [00:00<00:00, 3.57MB/s]
Downloading pytorch_model.bin: 100%|██████████| 242M/242M [00:02<00:00, 85.9MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 24.0/24.0 [00:00<00:00, 147kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 797k/797k [00:00<00:00, 90.4MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 11.6MB/s]


In [11]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_small_code_documentation_generation_python_multitask"))

In [12]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_small_code_documentation_generation_python_multitask_finetune"))

Downloading (…)lve/main/config.json: 100%|██████████| 641/641 [00:00<00:00, 4.10MB/s]
Downloading pytorch_model.bin: 100%|██████████| 242M/242M [00:03<00:00, 65.2MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 24.0/24.0 [00:00<00:00, 165kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 797k/797k [00:00<00:00, 86.6MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 13.1MB/s]


## T5 Base

In [13]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_base_code_documentation_generation_python"))

Downloading (…)lve/main/config.json: 100%|██████████| 644/644 [00:00<00:00, 4.14MB/s]
Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:15<00:00, 57.5MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 24.0/24.0 [00:00<00:00, 160kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 797k/797k [00:00<00:00, 87.9MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 12.5MB/s]


In [14]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_base_code_documentation_generation_python_transfer_learning_finetune"))

Downloading (…)lve/main/config.json: 100%|██████████| 644/644 [00:00<00:00, 4.15MB/s]
Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:09<00:00, 96.5MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 24.0/24.0 [00:00<00:00, 162kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 797k/797k [00:00<00:00, 72.0MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 12.9MB/s]


In [15]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_base_code_documentation_generation_python_multitask"))

Downloading (…)lve/main/config.json: 100%|██████████| 644/644 [00:00<00:00, 3.33MB/s]
Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:10<00:00, 84.7MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 24.0/24.0 [00:00<00:00, 146kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 797k/797k [00:00<00:00, 76.9MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 12.5MB/s]


In [16]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_base_code_documentation_generation_python_multitask_finetune"))

Downloading (…)lve/main/config.json: 100%|██████████| 644/644 [00:00<00:00, 4.19MB/s]
Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:11<00:00, 81.0MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 24.0/24.0 [00:00<00:00, 165kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 797k/797k [00:00<00:00, 83.1MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 12.1MB/s]


## T5 Large

In [17]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_large_code_documentation_generation_python_transfer_learning_finetune"))

Downloading (…)lve/main/config.json: 100%|██████████| 645/645 [00:00<00:00, 3.11MB/s]
Downloading pytorch_model.bin: 100%|██████████| 2.95G/2.95G [00:42<00:00, 68.6MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 24.0/24.0 [00:00<00:00, 85.9kB/s]
Downloading spiece.model: 100%|██████████| 797k/797k [00:00<00:00, 155MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 10.2MB/s]


In [18]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_large_code_documentation_generation_python_multitask"))

Downloading (…)lve/main/config.json: 100%|██████████| 645/645 [00:00<00:00, 4.29MB/s]
Downloading pytorch_model.bin: 100%|██████████| 2.95G/2.95G [00:42<00:00, 69.5MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 24.0/24.0 [00:00<00:00, 137kB/s]
Downloading spiece.model: 100%|██████████| 797k/797k [00:00<00:00, 3.85MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 11.6MB/s]
Bad pipe message: %s [b'\xb7\x8d\xd5\x1ac\xc4U\xbem\xc0\x95', b'\x0f\x7fn\xa3\x0c W\xf3vwf\xd12\xd0\x89Rig\x04\xc8M\xe7\x8e\xa7B\x8d;>Q\x15\xca\xd3>\xc6\x8f.Z\xb5\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00']
Bad pipe message

In [19]:
get_preds(df, SebisModel("SEBIS/code_trans_t5_large_code_documentation_generation_python_multitask_finetune"))

Downloading (…)lve/main/config.json: 100%|██████████| 645/645 [00:00<00:00, 4.16MB/s]
Downloading pytorch_model.bin: 100%|██████████| 2.95G/2.95G [00:52<00:00, 56.6MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 24.0/24.0 [00:00<00:00, 135kB/s]
Downloading spiece.model: 100%|██████████| 797k/797k [00:00<00:00, 3.84MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 1.79k/1.79k [00:00<00:00, 3.73MB/s]


## Fine Tuned Model

In [9]:
get_preds(df, CodeT5PModel())

Token indices sequence length is longer than the specified maximum sequence length for this model (730 > 512). Running this sequence through the model will result in indexing errors


# Auswertung

In [10]:
def get_scores(df: pd.DataFrame, model_name: str):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
    scores = [scorer.score(ref, hyp) for ref, hyp in zip(df["ref"].to_list(), df["pred"].to_list())]
    rouge1 = sum([score["rouge1"].fmeasure for score in scores]) / len(scores)
    rougeL = sum([score["rougeL"].fmeasure for score in scores]) / len(scores)
    # meteor = sum([meteor_score([ref], hyp) for ref, hyp in zip(df["ref"].to_list(), df["pred"].to_list())]) / len(df["pred"].to_list())

    return pd.DataFrame({
        "bleu": corpus_bleu(df["pred"].to_list(), [df["ref"].to_list()]).score,
        "chrf": corpus_chrf(df["pred"].to_list(), [df["ref"].to_list()]).score,
        "ter": corpus_ter(df["pred"].to_list(), [df["ref"].to_list()]).score,
        "rouge1": rouge1, "rougeL": rougeL
    }, index=pd.Index([model_name], name="Model"))

In [13]:
df = pd.DataFrame(columns=["bleu", "chrf", "ter"])

for csv_file in sorted((root_dir / "data" / "preds").glob("*.csv"), key=lambda f: f.name):
    df = pd.concat([df, get_scores(pd.read_csv(csv_file, index_col=0), csv_file.name)])

df



Unnamed: 0,bleu,chrf,ter,rouge1,rougeL
SEBIS-code_trans_t5_base_code_documentation_generation_python.csv,4.637784,23.193613,102.020691,0.263155,0.233785
SEBIS-code_trans_t5_base_code_documentation_generation_python_multitask.csv,2.956659,15.565684,93.159572,0.220844,0.208371
SEBIS-code_trans_t5_base_code_documentation_generation_python_multitask_finetune.csv,13.765888,33.452465,78.74155,0.442934,0.410086
SEBIS-code_trans_t5_base_code_documentation_generation_python_transfer_learning_finetune.csv,21.670814,37.954286,71.5601,0.485289,0.457306
SEBIS-code_trans_t5_large_code_documentation_generation_python_multitask.csv,13.487266,32.52696,79.615003,0.433306,0.401295
SEBIS-code_trans_t5_large_code_documentation_generation_python_multitask_finetune.csv,16.361668,35.033486,80.671383,0.44458,0.412247
SEBIS-code_trans_t5_large_code_documentation_generation_python_transfer_learning_finetune.csv,23.306288,38.984149,69.744748,0.497358,0.470244
SEBIS-code_trans_t5_small_code_documentation_generation_python.csv,5.495179,25.007326,103.980811,0.286502,0.255965
SEBIS-code_trans_t5_small_code_documentation_generation_python_multitask.csv,5.449546,20.280297,89.909868,0.29519,0.276226
SEBIS-code_trans_t5_small_code_documentation_generation_python_multitask_finetune.csv,16.378271,34.69157,76.738425,0.452363,0.42135
