### 1. Settings

In [10]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy

from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42

MODEL = "CodeT5"
TRAIN_N = 330
BATCH_SIZE = 15
DECODER_LENGTH = 20
ENCODER_LENGTH = 15

FULL_TRAIN_ARGS = {
    "TRAIN_N": TRAIN_N,
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": [0, 1, 4, 5 , 7],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 5e-4,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
    },
}
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["output_dir"] = f'reports/results'
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["logging_dir"] = f'reports/logs'

model_name="Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



### 2. Conala data. Preprocessing. Sampling as in the paper (further, random sampling)

In [6]:
dataset = load_dataset("neulab/conala")

train_data = pr.preprocess_dataset(dataset["train"], tokenizer=tokenizer)

test_data = pr.preprocess_dataset(dataset["test"], tokenizer=tokenizer)
test_df = pd.DataFrame(test_data)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [12]:
results = {}
latest_run_epoch = 0

for epoch_i, epoch_set in enumerate(sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])):
    print(f"TRAINING EPOCH SET {epoch_set}")

    TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
    MODEL_PATH = f"./models/{epoch_set}_epoch_set"
    

    results[epoch_set] = {}

    if epoch_set > 1: 
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set - latest_run_epoch
    else:
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set
    
    print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')

    if epoch_set > 1: 
        model_name = f"./models/{latest_run_epoch}_epoch_set"

    print(f"LOADING MODEL {model_name}")

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    print(device)
    model.to(device)

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    compute_metrics = ev.compute_metric_with_params(tokenizer) 

    if not os.path.exists(f'reports/'): 
        os.mkdir(f'reports/')

    training_args = Seq2SeqTrainingArguments(
            **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
        )
    
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    if epoch_set!=0:
        trainer.train()

    text = list(test_df["input_sequence"].values)
    summaries = infer.generate_summary(text, model, tokenizer, TRAIN_ARGS["ENCODER_LENGTH"], TRAIN_ARGS["DECODER_LENGTH"])
    
    test_df["prediction"] = summaries[1]
    test_df["rouge"] = rouge.compute(predictions=test_df["prediction"], 
                references=test_df["output_sequence"],
                use_stemmer=True, 
                use_aggregator=False,
                rouge_types=["rouge1"])["rouge1"]
    
    results[epoch_set] = test_df
    
    ########## SAVE EPOCH SET MODEL
    if not os.path.exists(MODEL_PATH): 
        os.mkdir(MODEL_PATH)

    trainer.save_model(MODEL_PATH)

    latest_run_epoch = epoch_set

########## CONVERT TO DATAFRAME

for epoch_i, (epoch_set) in enumerate(results.keys()): 
    
    e_df = results[epoch_set]   
    e_df['epoch_set'] = epoch_set

    if epoch_i==0: 
        test_results_df = e_df.copy()
    else: 
        test_results_df = pd.concat([test_results_df, e_df])

########## SAVE THE FILE

with open('test_results_df.pickle', 'wb') as handle:
    pickle.dump(test_results_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  return dynamo.is_compiling()


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.5936,3.208277,0.4633,0.2218,0.4228,0.4226,14.7883,0.2192,0.9619,0.9626,5821,6047


  return dynamo.is_compiling()


TRAINING EPOCH SET 4
TRAINING EPOCHS 3
LOADING MODEL ./models/1_epoch_set
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.8575,3.459214,0.4108,0.1752,0.3697,0.3699,14.8805,0.1697,0.9545,0.9555,5778,6047
2,2.7097,3.388699,0.4359,0.1898,0.3931,0.3931,14.87,0.1926,0.9559,0.9568,5786,6047
3,2.0885,3.424679,0.4464,0.2015,0.4026,0.4021,15.1593,0.2093,0.9694,0.9699,5865,6047


  return dynamo.is_compiling()


TRAINING EPOCH SET 5
TRAINING EPOCHS 1
LOADING MODEL ./models/4_epoch_set
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.8982,3.565015,0.4314,0.1878,0.3871,0.3868,15.26,0.1988,0.9823,0.9825,5941,6047


  return dynamo.is_compiling()


TRAINING EPOCH SET 7
TRAINING EPOCHS 2
LOADING MODEL ./models/5_epoch_set
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.895,3.712929,0.4091,0.1674,0.3649,0.365,14.6457,0.1705,0.9044,0.9087,5495,6047
2,1.8364,3.656302,0.4344,0.2003,0.3929,0.3926,15.4801,0.2134,0.9945,0.9945,6014,6047


  return dynamo.is_compiling()


In [25]:
results[0]

Unnamed: 0,question_id,intent,output_sequence,input_sequence,input_ids,attention_mask,labels,prediction,rouge,epoch_set
0,15080500,How can I send a signal from a python program?,send a signal `signal.SIGUSR1` to the current ...,"os.kill(os.getpid(), signal.SIGUSR1)","[1, 538, 18, 16418, 12, 538, 18, 588, 6610, 93...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 4661, 279, 4277, 1375, 10420, 18, 18513, 2...",kill a process with id `123`,0.266667,7
1,3283984,Decode Hex String in Python 3,decode a hex string '4a4b4c' to UTF-8.,bytes.fromhex('4a4b4c').decode('utf-8'),"[1, 3890, 18, 2080, 7118, 2668, 24, 69, 24, 70...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 3922, 279, 3827, 533, 296, 24, 69, 24, 70,...",convert a hex string `4a4b4c` to a hex string,0.588235,7
2,3844801,check if all elements in a list are identical,check if all elements in list `myList` are ide...,all(x == myList[0] for x in myList),"[1, 454, 12, 92, 422, 3399, 682, 63, 20, 65, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1893, 309, 777, 2186, 316, 666, 1375, 4811...",check if all elements in list `myList` are ide...,1.000000,7
3,4302166,Format string dynamically,format number of spaces between strings `Pytho...,"print('%*s : %*s' % (20, 'Python', 20, 'Very G...","[1, 1188, 29909, 14, 87, 294, 738, 14, 87, 11,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 2139, 1300, 434, 7292, 3086, 2064, 1375, 1...",print a list `20` with multiple spaces,0.200000,7
4,2544710,How I can get rid of None values in dictionary?,get rid of None values in dictionary `kwargs`,"res = {k: v for k, v in list(kwargs.items()) i...","[1, 455, 273, 288, 79, 30, 331, 364, 417, 16, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 588, 10911, 434, 599, 924, 316, 3880, 1375...",Filter dictionary `res` to have items with val...,0.210526,7
...,...,...,...,...,...,...,...,...,...,...
472,30551576,Using Regular Expressions to extract specific ...,match urls whose domain doesn't start with `t`...,"re.findall('http://[^t][^s""]+\\.html', document)","[1, 266, 18, 4720, 454, 2668, 2505, 2207, 5969...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1916, 6903, 8272, 2461, 3302, 1404, 787, 5...",match regex pattern 'http://[^t][^s])' on stri...,0.363636,7
473,113534,Is there a function in Python to split a strin...,split a string `mystring` considering the spac...,"mystring.replace(' ', '! !').split('!')","[1, 4811, 1080, 18, 2079, 2668, 2265, 11817, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 4939, 279, 533, 1375, 4811, 1080, 68, 2445...",Split string `mystring` with comma and questio...,0.315789,7
474,5838735,Open file in Python,open file `path` with mode 'r',"open(path, 'r')","[1, 3190, 12, 803, 16, 296, 86, 6134, 2, 0, 0,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]","[1, 3190, 585, 1375, 803, 68, 598, 1965, 296, ...",read file `path` using python,0.363636,7
475,36003967,Sum of multiple list of lists index wise,sum elements at the same index in list `data`,[[sum(item) for item in zip(*items)] for items...,"[1, 31128, 1364, 12, 1726, 13, 364, 761, 316, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1364, 2186, 622, 326, 1967, 770, 316, 666,...",create a list containing the sum of each value...,0.380952,7


In [20]:
########## ROUGE PER SETTING

print("Mean")
print(test_results_df.groupby("epoch_set")["rouge"].mean())

print("STD")
print(test_results_df.groupby("epoch_set")["rouge"].std())


Mean
epoch_set
0    0.434192
1    0.434192
4    0.434192
5    0.434192
7    0.434192
Name: rouge, dtype: float64
STD
epoch_set
0    0.206415
1    0.206415
4    0.206415
5    0.206415
7    0.206415
Name: rouge, dtype: float64


### Step 2. Learn performance

In [115]:
def step_two(X_train, y_train, X_val, y_val, model): 

    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model =="svm": 
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        reg = RandomForestRegressor.fit(X_train, y_train)
    elif model=="lgbm":
        reg = LGBMRegressor()
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)

    y_pred = reg.predict(X_val)
    y_pred[y_pred<0] = 0

    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [118]:
t_models = ["lr", "svm", "lgbm", "catboost"]

results = {}

cv_df["perf_hat"] = 0


for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.fold!=test_fold, "input_sequence"])
    X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold!=test_fold, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_train = hstack([X_train_column_sparse, X_train_tfidf])
    y_train = cv_df.loc[cv_df.fold!=test_fold, "rouge"]
    
    X_val_tfidf = vectorizer.transform(cv_df.loc[cv_df.fold==test_fold, "input_sequence"])
    X_val_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold==test_fold, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_val = hstack([X_val_column_sparse, X_val_tfidf])
    y_val = cv_df.loc[cv_df.fold==test_fold, "rouge"]

    results[test_fold] = {}
    for model in t_models:
        print(model)
        preds_df = step_two(X_train=X_train,
                            y_train=y_train,
                            X_val=X_val,
                            y_val=y_val,
                            model=model)
        cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
        results[test_fold][model] = preds_df

cv_df = cv_df.reset_index(drop=True)

0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.483859 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4870
[LightGBM] [Info] Number of data points in the train set: 7536, number of used features: 308
[LightGBM] [Info] Start training from score 0.406208
catboost
Learning rate set to 0.056334
0:	learn: 0.1895386	total: 6.88ms	remaining: 6.87s
1:	learn: 0.1884952	total: 11.2ms	remaining: 5.58s
2:	learn: 0.1877014	total: 14.6ms	remaining: 4.87s
3:	learn: 0.1870614	total: 17.9ms	remaining: 4.45s
4:	learn: 0.1862288	total: 20.6ms	remaining: 4.1s
5:	learn: 0.1855628	total: 23.3ms	remaining: 3.85s
6:	learn: 0.1848339	total: 25.7ms	remaining: 3.64s
7:	learn: 0.1842664	total: 28ms	remaining: 3.47s
8:	learn: 0.1837521	total: 30.4ms	remaining: 3.34s
9:	learn: 0.1832613	total: 32.7ms	remaining: 3.23s
10:	learn: 0.1828190	total: 35ms	r

In [None]:
cv_df.groupby("epoch_set")["perf_hat"].mean()

epoch_set
0    0.314135
1    0.439433
4    0.436539
5    0.428805
7    0.421914
Name: perf_hat, dtype: float64

In [None]:
# rearrange the file

model_results = {}

for model in t_models:
    model_results[model]= {}
    model_results[model]["rmse"] = []
    model_results[model]["mae"] = [] 

    for fold in range(3):
    
        model_results[model]["mae"].append(results[fold][model]["mae"])
        model_results[model]["rmse"].append(results[fold][model]["rmse"])
    
    model_results[model]["rmse_avg"] = np.array(model_results[model]["rmse"]).mean()
    model_results[model]["mae_avg"] = np.array(model_results[model]["mae"]).mean()

for model in t_models:
    print(model)
    print("RMSE ", model_results[model]["rmse_avg"])
    print("MAE ",model_results[model]["mae_avg"])
    print("\n")

lr
0.24043027634407277
0.18477271633002026


svm
0.1882577626093708
0.1489360229803929


lgbm
0.18690249770390857
0.14712223049703274


catboost
0.1827869320742425
0.14473383932142433




In [120]:
with open('cd_df_with_predictions.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)