In [1]:
%%capture
# Install the notebook's third-party dependencies into the kernel's environment.
# The %%capture magic above hides pip's output.
# NOTE(review): only `datasets` is pinned; the other packages resolve to whatever
# version is current at install time, so results may not reproduce exactly.
#%pip install protobuf==3.20.1
%pip install transformers[torch]
%pip install -q sentencepiece
%pip install datasets==2.13.1
%pip install evaluate
%pip install rouge_score

In [2]:
# Imported here because this cell runs before the notebook's later `import os`;
# without it a fresh kernel (Restart & Run All) fails with NameError on os.environ.
import os

# Folder of this Quantlet; interpolated into the working-directory path on Colab.
QPATH = "Quantlet/4-qode2desc"

# Environment switches are set before the heavy libraries are imported.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): the training cell's output shows transformers reporting this
# variable as deprecated in favour of `report_to`.
os.environ["WANDB_DISABLED"] = "true"
# Expose only the GPU with index 1 to CUDA in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
import sys

# True when the `google.colab` module is already loaded in this interpreter,
# which is used here as the signal that the notebook is running on Colab.
IN_COLAB = "google.colab" in sys.modules

import os

if IN_COLAB:
    # On Colab, switch the working directory to this Quantlet's folder so the
    # relative data paths used later resolve.
    # NOTE(review): assumes Google Drive is mounted at /content/drive - confirm.
    os.chdir(
        f"/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}"
    )
else:
    # Outside Colab, load the lab_black extension.
    # NOTE(review): this fails if lab_black is not installed in the local environment.
    %load_ext lab_black

In [5]:
%%capture
import pandas as pd
from tqdm import tqdm

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

import torch
import torch, gc  # NOTE(review): torch is already imported on the line above; only gc is new here
import nltk

# Fetch the NLTK "punkt" resource.
# NOTE(review): presumably required by analysis_modules - confirm.
nltk.download("punkt")

import importlib
import analysis_modules

# Reload the project module so edits made to it since the first import
# take effect without restarting the kernel.
importlib.reload(analysis_modules)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
def create_name(analysis_config):
    """Build the run identifier ``<model>_<mode>_<sample>_<date>``.

    Parameters
    ----------
    analysis_config : dict
        Must provide the keys ``"model_name"``, ``"MODE"``, ``"DATE"`` and
        ``"val_data_name"``.

    Returns
    -------
    str
        The four parts joined by underscores. When the model name contains
        ``"checkpoint"`` only the text after its last ``/`` is used. The sample
        part is ``"val"`` if the validation file name starts with ``"val"``,
        otherwise ``"test"``.
    """
    model_label = analysis_config["model_name"]
    if "checkpoint" in model_label:
        # Keep only the final path component of a checkpoint path.
        model_label = model_label.rsplit("/", 1)[-1]

    run_mode = analysis_config["MODE"]
    run_date = analysis_config["DATE"]

    is_validation = analysis_config["val_data_name"].startswith("val")
    sample_label = "val" if is_validation else "test"

    return "{}_{}_{}_{}".format(model_label, run_mode, sample_label, run_date)

In [34]:
# Settings for one analysis run. The whole dict is later unpacked as keyword
# arguments into analysis_modules.scs_analyze, so every key doubles as a
# parameter name.
# NOTE(review): "logging_stes" and "label_smooting" look misspelled; they are left
# untouched because renaming them would change the keyword names passed to
# scs_analyze - confirm against its signature before fixing.
analysis_config = {
    "DATE": "20231024_random",
    "MODE": "no_context",
    "model_name": "CodeT5",
    "encoder_max_length": 512,
    "decoder_max_length": 75,
    "random_state": 42,
    "learning_rate": 5e-4,
    "epochs": 4,
    "train_batch": 16,
    "eval_batch": 4,
    "warmup_steps": 100,
    "weight_decay": 0.1,
    "logging_stes": 100,
    "save_total_lim": 1,
    "save_strategy": "steps",
    "label_smooting": 0.1,
    "predict_generate": True,
    "load_best_model_at_end": False,
    "evaluation_strategy": "epoch",
}

run_mode = analysis_config["MODE"]
run_date = analysis_config["DATE"]

# "domain" runs read the "no_context" preprocessed data; every other mode reads
# from the folder named after the mode. Train and validation share that folder.
data_mode = "no_context" if run_mode == "domain" else run_mode
data_dir = f"../../data/preprocessed/Quantlet/{run_date}/{data_mode}/"

analysis_config["train_data_path"] = data_dir
# Plain string, matching val_data_name. The earlier version wrapped this value
# in a 1-tuple through a stray trailing comma.
analysis_config["train_data_name"] = f"full_train_dataset_{run_date}_sample0.json"
analysis_config["val_data_path"] = data_dir
analysis_config["val_data_name"] = f"test_dataset_{run_date}_sample0.json"

# The run name is derived while model_name is still the short label, i.e.
# before the domain checkpoint path is substituted below.
analysis_config["analysis_name"] = create_name(analysis_config)
print(analysis_config["analysis_name"])

if run_mode == "domain":
    # Domain runs start from a locally stored checkpoint instead of the short
    # model label; unknown labels are left unchanged.
    domain_checkpoints = {
        "CodeT5": "../../data/pretrained/analysis_report_CodeT5-test-12-300-4-2023-09-26-v2/results/checkpoint-88488",
        "CodeTrans": "../../data/pretrained/CodeTrans/results/checkpoint-12290",
    }
    analysis_config["model_name"] = domain_checkpoints.get(
        analysis_config["model_name"], analysis_config["model_name"]
    )

CodeT5_no_context_test_20231024_random


In [35]:
# Free memory before the run: collect unreachable Python objects first, then
# ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [36]:
# Run the analysis with every entry of analysis_config passed as a keyword
# argument. The returned object is kept as `trainer`; parse_logs later reads
# trainer.state.log_history from it.
trainer = analysis_modules.scs_analyze(**analysis_config)

CodeT5_no_context_test_20231024_random
cuda
cuda




  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


   eval_loss  eval_rouge1  eval_rouge2  eval_rougeL  eval_rougeLsum  \
0      6.186         0.15        0.044        0.129           0.134   

   eval_bleu  eval_gen_len  
0      0.007        12.373  




Epoch,Training Loss,Validation Loss


In [20]:
# Free memory after the run: collect unreachable Python objects first, then
# ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [21]:
def parse_logs(trainer):
    """Combine the trainer's training and evaluation log entries into one table.

    Entries of ``trainer.state.log_history`` that contain a ``"loss"`` key are
    treated as training logs; the remaining entries that contain ``"eval_loss"``
    are treated as evaluation logs. Any other entry is ignored. The two groups
    are inner-joined on ``("epoch", "step")``, so only steps that have both a
    training and an evaluation entry appear in the result.

    Parameters
    ----------
    trainer : object
        Anything exposing ``state.log_history`` as a list of dicts.

    Returns
    -------
    pandas.DataFrame
        One row per matched (epoch, step) pair with the columns listed in
        ``columns`` below, in that order. An empty frame with those columns is
        returned when there are no training entries or no evaluation entries.

    Raises
    ------
    KeyError
        If matched entries exist but lack ``"epoch"``/``"step"`` or one of the
        expected metric columns.
    """
    columns = [
        "epoch",
        "loss",
        "step",
        "eval_loss",
        "eval_rouge1",
        "eval_rouge2",
        "eval_rougeL",
        "eval_rougeLsum",
        "eval_gen_len",
        "eval_bleu",
        "eval_brevity_penalty",
        "eval_length_ratio",
        "eval_translation_length",
        "eval_reference_length",
    ]

    # Sort the entries by kind first and build each frame once, instead of
    # concatenating one-row frames inside the loop. Classifying every entry by
    # its own keys also removes the assumption that entry 0 is a training log
    # and entry 1 an evaluation log.
    train_records = []
    eval_records = []
    for entry in trainer.state.log_history:
        if "loss" in entry:
            train_records.append(entry)
        elif "eval_loss" in entry:
            eval_records.append(entry)

    # Nothing to join: return an empty table with the expected columns rather
    # than failing on a missing merge key.
    if not train_records or not eval_records:
        return pd.DataFrame(columns=columns)

    logs = pd.DataFrame(train_records).merge(
        pd.DataFrame(eval_records),
        how="inner",
        on=["epoch", "step"],
    )
    return logs[columns]

In [22]:
# Build the merged train/eval log table and drop rows that are exact duplicates.
logs = parse_logs(trainer).drop_duplicates()

In [23]:
# Display the merged training/evaluation log table.
logs

Unnamed: 0,epoch,loss,step,eval_loss,eval_rouge1,eval_rouge2,eval_rougeL,eval_rougeLsum,eval_gen_len,eval_bleu,eval_brevity_penalty,eval_length_ratio,eval_translation_length,eval_reference_length
0,1.0,5.3125,272.0,4.558359,0.2856,0.1107,0.2417,0.2538,17.5673,0.0354,0.3345,0.4773,6239.0,13072.0
1,2.0,4.3069,544.0,3.833143,0.3162,0.1467,0.2741,0.2849,17.5652,0.055,0.3184,0.4663,6096.0,13072.0
2,3.0,3.6604,816.0,3.497112,0.3382,0.172,0.2997,0.3069,17.94,0.07,0.3386,0.4801,6276.0,13072.0
3,4.0,3.3564,1088.0,3.403933,0.3462,0.1828,0.3082,0.3167,17.9938,0.075,0.3356,0.478,6249.0,13072.0
