In [42]:
%%capture
# Install the third-party packages this notebook needs into the kernel's
# environment. `%%capture` has to stay on the first line of the cell; it
# swallows pip's (long) console output.
# NOTE(review): only `datasets` is version-pinned; the other packages resolve to
# whatever is current at install time — consider pinning them for reproducibility.
# Optional protobuf pin, currently disabled:
#%pip install protobuf==3.20.1
%pip install transformers[torch]
%pip install -q sentencepiece
%pip install datasets==2.13.1
%pip install evaluate
%pip install rouge_score

In [4]:
# Sub-path of this notebook's folder; it is appended to the Google Drive project
# path when the notebook changes its working directory on Colab.
QPATH = "Quantlet/4-qode2desc"

In [5]:
import sys

IN_COLAB = "google.colab" in sys.modules

import os

if IN_COLAB:
    os.chdir(
        f"/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}"
    )
else:
    %load_ext lab_black

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [27]:
%%capture
# Imports and one-time initialisation. `%%capture` (first line of the cell)
# hides the output produced by the NLTK download below.
import pandas as pd
from tqdm import tqdm

# Registers tqdm's `progress_apply` / `progress_map` on pandas objects.
tqdm.pandas()

# NOTE(review): `torch` is imported twice (here and on the next line); the first
# import is redundant.
import torch
import torch, gc
import nltk

# Fetch the "punkt" tokenizer data (no-op if it is already present locally).
nltk.download("punkt")

import importlib
import analysis_modules

# Re-execute the local module so edits made to analysis_modules.py since the
# kernel started are picked up without a restart.
importlib.reload(analysis_modules)

In [39]:
# Run one complete analysis per context mode. Each iteration builds its own
# configuration, hands it to `analysis_modules.scs_analyze`, and stores the
# parsed trainer logs as `reports/analysis_report_<analysis_name>/logs.csv`.
for MODE in ["no_context", "repo", "domain", "author"]:
    # Keyword arguments for `analysis_modules.scs_analyze`. The key spellings
    # (including "logging_stes" and "label_smooting") are part of that
    # function's interface and must not be "corrected" here.
    analysis_config = {
        "DATE": "20231027",
        "MODE": MODE,
        "model_name": "CodeT5",
        "encoder_max_length": 512,
        "decoder_max_length": 75,
        "random_state": 42,
        "learning_rate": 5e-4,
        "epochs": 15,
        "train_batch": 16,
        "eval_batch": 4,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "logging_stes": 100,
        "save_total_lim": 1,
        "save_strategy": "steps",
        "label_smooting": 0.1,
        "predict_generate": True,
        "load_best_model_at_end": False,
        "evaluation_strategy": "epoch",
        "freeze": True,
    }

    # The "domain" run reads the no_context dataset; every other mode reads the
    # dataset stored under its own name. Train and validation files live in the
    # same directory.
    data_mode = "no_context" if analysis_config["MODE"] == "domain" else analysis_config["MODE"]
    data_dir = f"../../data/preprocessed/Quantlet/{analysis_config['DATE']}/{data_mode}/"

    analysis_config["train_data_path"] = data_dir
    # Must be a plain string, like val_data_name below — a trailing comma after
    # the f-string would silently turn it into a 1-tuple.
    analysis_config["train_data_name"] = (
        f"full_train_dataset_{analysis_config['DATE']}_sample0.json"
    )
    analysis_config["val_data_path"] = data_dir
    analysis_config["val_data_name"] = (
        f"test_dataset_{analysis_config['DATE']}_sample0.json"
    )

    # NOTE(review): `create_name` and `parse_logs` (used below) are not defined
    # or imported in the cells visible here — presumably they come from
    # `analysis_modules` or another cell; confirm they are in scope on a fresh
    # kernel.
    analysis_config["analysis_name"] = create_name(analysis_config)
    print(analysis_config["analysis_name"])

    # The analysis name is derived *before* this swap, so it is built from the
    # short model name. For the "domain" run the short name is then replaced by
    # the path of a locally stored pretrained checkpoint.
    if analysis_config["MODE"] == "domain":
        if analysis_config["model_name"] == "CodeT5":
            analysis_config[
                "model_name"
            ] = "../../data/pretrained/analysis_report_CodeT5-test-12-300-4-2023-09-26-v2/results/checkpoint-88488"
        elif analysis_config["model_name"] == "CodeTrans":
            analysis_config[
                "model_name"
            ] = "../../data/pretrained/CodeTrans/results/checkpoint-12290"

    # Drop unreachable Python objects and release cached CUDA memory before the
    # next run starts.
    gc.collect()
    torch.cuda.empty_cache()

    trainer = analysis_modules.scs_analyze(**analysis_config)

    gc.collect()
    torch.cuda.empty_cache()

    logs = parse_logs(trainer).drop_duplicates()

    # Use this iteration's own name so each mode writes to its own report
    # folder (a bare `analysis_name` is not defined in this cell).
    ANALYSIS_FOLDER = f"reports/analysis_report_{analysis_config['analysis_name']}"
    # Make sure the folder exists so the logs of a long run cannot be lost to a
    # missing-directory error.
    os.makedirs(ANALYSIS_FOLDER, exist_ok=True)

    logs.to_csv(f"{ANALYSIS_FOLDER}/logs.csv", index=False)

    print("Analysis finished")

CodeT5_repo_test_20231027
