In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainerCallback, TrainingArguments
from datasets import Dataset, load_dataset
from copy import deepcopy

# Process-wide switches, set before any model/tokenizer work happens:
# the first is read by the Hugging Face logging integrations, the second by
# the `tokenizers` library.
os.environ.update({
    "WANDB_DISABLED": "true",
    "TOKENIZERS_PARALLELISM": "false",
})

# Seed passed to the train/validation split further down.
RANDOM_STATE = 42

In [2]:
# Class index -> class name, and the inverse mapping derived from it so the
# two tables cannot drift apart. Both are handed to the model config.
id2label = {0: "CORRECT", 1: "BUGGY"}
label2id = {name: index for index, name in id2label.items()}

In [3]:
# The tokenizer and the encoder weights are loaded from two different
# checkpoints, so the tokenizer's ids do not correspond to the vocabulary the
# DistilBERT embeddings were pre-trained with.
# NOTE(review): this looks intentional (see the output model name) -- confirm.
tokenizer = AutoTokenizer.from_pretrained('dipudl/codet5-base')
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Every id the tokenizer can emit is used as a row index into the model's
# word-embedding matrix. If the tokenizer knows more tokens than the matrix
# has rows, grow the matrix so no id can fall out of range. Nothing happens
# when the tokenizer vocabulary already fits.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/276k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/148k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [4]:
# !pip install huggingface_hub
# Stores a Hugging Face access token through `HfFolder.save_token`, run in a
# separate Python process. The string below is a placeholder: substitute the
# real token only in a private copy and never commit it with the notebook.
# NOTE(review): presumably required because training uses push_to_hub=True -- confirm.
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('YOUR_HUGGINGFACE_TOKEN_HERE')"

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)


-----
## Load Dataset
-----

In [5]:
# Both splits are tab-separated files and are read with identical settings.
train_df, test_df = (
    pd.read_csv(csv_path, sep="\t")
    for csv_path in (
        "/kaggle/input/wrong-binary-operator-positive-negative-samples/wrong_binary_operator_dataset_filtered_train.csv",
        "/kaggle/input/wrong-binary-operator-positive-negative-samples/wrong_binary_operator_dataset_filtered_test.csv",
    )
)

In [6]:
# Preview five randomly drawn training rows (no seed is passed, so the rows
# shown differ from run to run).
train_df.sample(5)

Unnamed: 0,file,left,operator,right,type_left,type_right,parent,grandparent,start_line,start_column,end_line,end_column,labels
273862,cleaned/msm7x30-3.4.x-naa/drivers/net/wireless...,dev->phy.rev,==,0,<dependent type>,int,IF_STMT,COMPOUND_STMT,251,7,251,24,0
502623,cleaned/blensor/source/blender/bmesh/operators...,numcuts,*,2,int,int,BINARY_OPERATOR,FOR_STMT,551,18,551,29,1
286142,cleaned/dd-wrt/src/router/links/view.c,ln->st,||,sl,unsigned char *,int,WHILE_STMT,COMPOUND_STMT,674,11,674,23,1
974200,cleaned/nagios/cgi/histogram.c,(double)DRAWING_WIDTH,/,(double)total_buckets,double,double,PAREN_EXPR,CSTYLE_CAST_EXPR,1582,30,1582,75,0
299479,cleaned/embers_vt/build/scipy/scipy/special/ce...,2.0,+,x,double,double,BINARY_OPERATOR,BINARY_OPERATOR,869,9,869,16,1


In [7]:
# Preview five randomly drawn test rows (no seed is passed, so the rows shown
# differ from run to run).
test_df.sample(5)

Unnamed: 0,file,left,operator,right,type_left,type_right,parent,grandparent,start_line,start_column,end_line,end_column,labels
76215,cleaned/msm7x30-3.4.x-naa/fs/nfs/nfs4filelayout.c,status,!=,0,int,int,CALL_EXPR,COMPOUND_STMT,391,9,391,20,0
109466,cleaned/blensor/source/blender/windowmanager/i...,y,/,15,int,int,CALL_EXPR,COMPOUND_STMT,264,34,264,40,1
29452,cleaned/dd-wrt/src/router/samba3/source/web/st...,"cgi_variable(""nmbd_restart"")",&&,"cgi_variable(""all_restart"")",int,int,IF_STMT,COMPOUND_STMT,269,6,269,65,1
23873,cleaned/binutils-vc4/gprof/corefile.c,"strncmp(name,CONSTPROP_NAME,CONSTPROP_NAME_LEN)",!=,0,int,int,BINARY_OPERATOR,IF_STMT,407,11,407,66,1
67951,cleaned/opensips/modules/registrar/save.c,sock,==,0,struct socket_info *,struct socket_info *,IF_STMT,COMPOUND_STMT,152,6,152,13,0


In [8]:
# (rows, columns) of the training and test frames, in that order.
train_df.shape, test_df.shape

((1000786, 13), (111200, 13))

In [9]:
# Number of missing values per column in the training frame. Only the
# training frame is checked here, not the test frame.
train_df.isna().sum()

file            0
left            0
operator        0
right           0
type_left       0
type_right      0
parent          0
grandparent     0
start_line      0
start_column    0
end_line        0
end_column      0
labels          0
dtype: int64

In [10]:
def _join_columns(frame, columns, separator):
    """Concatenate `columns` of `frame` left to right, with `separator` between neighbours."""
    joined = frame[columns[0]]
    for column in columns[1:]:
        joined = joined + separator + frame[column]
    return joined


# Fields that make up one model input, in the order they are concatenated.
_TEXT_COLUMNS = ['left', 'operator', 'right', 'type_left', 'type_right', 'parent', 'grandparent']

# The tokenizer's separator token marks the boundary between fields.
train_df['full_text'] = _join_columns(train_df, _TEXT_COLUMNS, tokenizer.sep_token)
test_df['full_text'] = _join_columns(test_df, _TEXT_COLUMNS, tokenizer.sep_token)

In [11]:
# These fields now live inside `full_text`, so remove them from both frames.
# The frames are modified in place; running this cell a second time raises
# because the columns are already gone.
raw_feature_columns = ['left', 'operator', 'right', 'type_left', 'type_right', 'parent', 'grandparent']
for frame in (train_df, test_df):
    frame.drop(columns=raw_feature_columns, inplace=True)

In [12]:
# Preview five random training rows after the raw fields were replaced by
# `full_text` (unseeded, so the rows differ between runs).
train_df.sample(5)

Unnamed: 0,file,start_line,start_column,end_line,end_column,labels,full_text
446133,cleaned/gtk-gnutella/src/lib/float-test.c,267,26,267,45,0,"strcmp(s,buf)</s>==</s>0</s>int</s>int</s>BINA..."
201545,cleaned/postgres/src/backend/parser/parse_agg.c,827,40,827,66,1,context->sublevels_up</s>!=</s>0</s>int</s>int...
278147,cleaned/feosmusic/codecs/aac/source/sbrhfadj.c,138,22,138,30,0,m</s><</s>mEnd</s>int</s>int</s>FOR_STMT</s>CO...
120604,cleaned/geda-gaf/libgeda/src/m_basic.c,172,12,172,42,1,(float)width</s>-</s>(float)height</s>float</s...
47043,cleaned/msm7x30-3.4.x-naa/drivers/video/sh_mob...,885,6,885,24,1,found->xres</s>>=</s>640</s><dependent type></...


In [13]:
# Preview five random test rows after the raw fields were replaced by
# `full_text` (unseeded, so the rows differ between runs).
test_df.sample(5)

Unnamed: 0,file,start_line,start_column,end_line,end_column,labels,full_text
78363,cleaned/drone/lib/ffmpeg/libavcodec/faad.c,159,13,159,42,1,buf_size</s>!=</s>(int)bytesconsumed</s>int</s...
9386,cleaned/playbook-dev-tools/bootstrap/gcc/gcc/g...,7,33,7,43,1,n</s>^</s>0xffff</s>unsigned long</s>unsigned ...
41227,cleaned/playbook-dev-tools/bootstrap/gcc/gcc/g...,6,11,6,17,0,x</s>!=</s>0</s>int</s>int</s>PAREN_EXPR</s>RE...
91450,cleaned/krb5/src/lib/krb5/asn.1/asn1_encode.c,742,31,742,67,0,(constchar*)val</s>+</s>counted->dataoff</s>co...
12556,cleaned/libhybris/hybris/common/gingerbread/li...,555,41,555,49,1,c</s>=</s>'i'</s>int</s>int</s>BINARY_OPERATOR...


In [14]:
# Show one training example three ways: the raw joined text, its sub-word
# tokens, and the encoded inputs the model will receive.
example_text = train_df.iloc[10].full_text
divider = "-" * 80

print(example_text)
print(divider)
print(tokenizer.tokenize(example_text))
print(divider)
print(tokenizer(example_text))

usec</s>-</s>1000</s>unsigned long</s>unsigned long</s>BINARY_OPERATOR</s>COMPOUND_STMT
--------------------------------------------------------------------------------
['usec', '</s>', '-', '</s>', '1000', '</s>', 'unsigned', 'Ġlong', '</s>', 'unsigned', 'Ġlong', '</s>', 'BINARY', '_', 'OPER', 'ATOR', '</s>', 'COMP', 'OUND', '_', 'STMT']
--------------------------------------------------------------------------------
{'input_ids': [1, 7380, 2, 117, 2, 2369, 2, 688, 1018, 2, 688, 1018, 2, 11022, 167, 3531, 5747, 2, 2322, 17844, 167, 13997, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [15]:
def tokenize_text(examples):
    """Tokenize a batch of examples for use with `Dataset.map(..., batched=True)`.

    Args:
        examples: mapping with a "full_text" entry holding the texts to encode.

    Returns:
        The tokenizer output for those texts, truncated to at most 128 tokens.

    No padding is applied here. Padding inside a batched `map` pads every
    chunk to its longest member and stores those pad tokens in the dataset.
    Sequences are instead padded per training batch by the collator that
    `Trainer` uses by default when it is given a tokenizer, so each batch is
    only as long as its own longest example.
    """
    return tokenizer(examples["full_text"], truncation=True, max_length=128)

In [16]:
# Hold out 10% of the training rows as a validation set, seeded for a
# repeatable split. `train_df` is rebound to the remaining 90%, so running
# this cell again would split the already-reduced frame.
train_df, validation_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=RANDOM_STATE,
)
(train_df.shape, validation_df.shape)

((900707, 7), (100079, 7))

In [17]:
# Wrap the training frame in a Hugging Face `Dataset`. The frame's index is
# carried over as an extra `__index_level_0__` column (visible in the output),
# which is dropped again during tokenization.
train_dataset = Dataset.from_pandas(train_df)
train_dataset

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'full_text', '__index_level_0__'],
    num_rows: 900707
})

In [18]:
# Wrap the validation frame in a Hugging Face `Dataset`. As with the training
# split, the frame's index shows up as an `__index_level_0__` column.
validation_dataset = Dataset.from_pandas(validation_df)
validation_dataset

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'full_text', '__index_level_0__'],
    num_rows: 100079
})

In [19]:
# Wrap the test frame in a Hugging Face `Dataset`. Unlike the two splits
# above, the resulting dataset has no `__index_level_0__` column (see output).
test_dataset = Dataset.from_pandas(test_df)
test_dataset

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'full_text'],
    num_rows: 111200
})

-----
## Tokenization
-----

In [20]:
# Encode the training texts in batches and drop the columns that are no
# longer needed afterwards (the raw text and the carried-over pandas index).
train_dataset = train_dataset.map(
    tokenize_text,
    batched=True,
    remove_columns=["full_text", "__index_level_0__"],
)
train_dataset

  0%|          | 0/901 [00:00<?, ?ba/s]

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 900707
})

In [21]:
# Encode the validation texts in batches and drop the columns that are no
# longer needed afterwards (the raw text and the carried-over pandas index).
validation_dataset = validation_dataset.map(
    tokenize_text,
    batched=True,
    remove_columns=["full_text", "__index_level_0__"],
)
validation_dataset

  0%|          | 0/101 [00:00<?, ?ba/s]

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 100079
})

In [22]:
# Encode the test texts in batches; only the raw text column has to be
# removed here because this split carries no pandas index column.
test_dataset = test_dataset.map(
    tokenize_text,
    batched=True,
    remove_columns=["full_text"],
)
test_dataset

  0%|          | 0/112 [00:00<?, ?ba/s]

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 111200
})

In [23]:
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: object exposing `label_ids` (reference labels) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall" and "F1 Score".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Metric name -> scoring function, evaluated in this order.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    return {name: scorer(y_true, y_pred) for name, scorer in scorers}

-----
## Model Training
-----

In [24]:
# ! rm -rd /kaggle/working/codeT5-DistilBERT-wrong-binary-operator-bug-model

batch_size = 32  # per device; the effective batch is larger on multi-GPU machines
output_dir = "codeT5-DistilBERT-wrong-binary-operator-bug-model"

# Evaluation, logging and checkpointing all happen once per epoch.
#
# Logging was previously driven by `len(train_dataset) // batch_size` steps.
# That counts steps for a single device, so with more than one device (the
# training log reports a total batch of 64) an epoch has fewer optimizer
# steps than that value and the training loss was reported only every other
# epoch. `logging_strategy="epoch"` is independent of the device count.
#
# NOTE(review): the WANDB_DISABLED environment variable set at the top of the
# notebook is reported as deprecated in favour of `report_to`; it is left
# as-is here because `report_to="none"` would also switch off every other
# logging integration.
training_args = TrainingArguments(
    output_dir,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # save_steps=10000,
    # fp16=True,
    push_to_hub=True,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [25]:
class CustomCallback(TrainerCallback):
    """Adds train-set and test-set evaluation at the end of evaluated epochs.

    Whenever the trainer is about to evaluate at an epoch boundary, this
    callback also evaluates on the trainer's own training dataset (metrics
    prefixed "train") and on the module-level `test_dataset` (metrics
    prefixed "test").
    """

    def __init__(self, trainer) -> None:
        """Keep a handle on the trainer whose `evaluate` method is reused."""
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Run the two extra evaluations; return a pre-evaluation copy of `control`.

        Returns None when no evaluation is scheduled for this epoch.
        """
        if not control.should_evaluate:
            return None

        # Snapshot taken before the extra evaluations and handed back afterwards.
        # NOTE(review): presumably guards against `evaluate` altering the
        # control flags -- confirm.
        control_snapshot = deepcopy(control)

        run_evaluation = self._trainer.evaluate
        run_evaluation(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
        run_evaluation(eval_dataset=test_dataset, metric_key_prefix="test")
        return control_snapshot

In [26]:
# Assemble the trainer: validation data drives the per-epoch evaluation, and
# the tokenizer is passed along with the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# The callback needs the trainer itself, so it can only be attached after
# the trainer has been constructed.
extra_eval_callback = CustomCallback(trainer)
trainer.add_callback(extra_eval_callback)

Cloning https://huggingface.co/dipudl/codeT5-DistilBERT-wrong-binary-operator-bug-model into local empty directory.


In [None]:
# Start fine-tuning with the trainer configured above (10 epochs on roughly
# 900k examples, per the run log below, so this is the long-running cell).
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: file, start_line, end_column, end_line, start_column. If file, start_line, end_column, end_line, start_column are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 900707
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 140740


Epoch,Training Loss,Validation Loss


In [None]:
# trainer.save_model("codeT5-DistilBERT-wrong-binary-operator-bug-model_10ep")