In [112]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainerCallback, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset, load_dataset
from copy import deepcopy
import torch

os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

RANDOM_STATE = 42

In [184]:
id2label = {0: "CORRECT", 1: "BUGGY"}
label2id = {"CORRECT": 0, "BUGGY": 1}

In [185]:
tokenizer = AutoTokenizer.from_pretrained('dipudl/codet5-base')
# model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',
#                                                            num_labels=3,
#                                                            id2label=id2label,
#                                                            label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained('dipudl/codeT5-DistilBERT-wrong-binary-operator-bug-model',
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id)

loading file https://huggingface.co/dipudl/codet5-base/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/ed7e409e6ad247e09a2ca8a19dc9e58d74cb30795f408bcb716cb6756f54b76d.9a35ae57ce66b3a375abfa9a6a2fa53dcd2cd361db8e6478877f08569f69b771
loading file https://huggingface.co/dipudl/codet5-base/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/1f727ef4e0afebcfcf86fe431abd086875e1d80f1da0e5fc0cc33e4077faf0a9.1c2821a0b9a2f62bbeacf7ba0b4c2b2b4dd6f63645fe6681015af90a376d90a1
loading file https://huggingface.co/dipudl/codet5-base/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/792db69f7307dc6c44be61f902366e9c9ca3d6f8daa189f150e4a407f15a0db0.e2e19d1cfdd164eb1be4cf51c10f0fd5abbc13d2d1ab3839ca28f7dc2e392a87
loading file https://huggingface.co/dipudl/codet5-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/dipudl/codet5-base/resolve/main/special_tokens_map.json from cache at /r

In [186]:
input_ids = torch.tensor(tokenizer("printf").input_ids)
input_ids

tensor([  1, 805,   2])

In [197]:
eb = model.distilbert.embeddings.word_embeddings(input_ids)

In [188]:
pe = model.distilbert.embeddings.position_embeddings(torch.tensor(range(len(input_ids))))

In [189]:
norm = model.distilbert.embeddings.LayerNorm(0.9 * eb[1] + 0.1 * pe[1])

In [190]:
do = model.distilbert.embeddings.dropout(norm)

In [192]:
do_tf = model.distilbert.transformer.layer[0].attention.dropout(do)

In [193]:
q = model.distilbert.transformer.layer[0].attention.q_lin(do_tf)
k = model.distilbert.transformer.layer[0].attention.k_lin(do_tf)
v = model.distilbert.transformer.layer[0].attention.v_lin(do_tf)

In [195]:
q

tensor([ 4.7106e-01,  3.0568e-01,  3.2355e-01, -7.8976e-01, -1.3491e+00,
         7.0408e-01, -6.6028e-01, -2.1306e-01,  4.2756e-02,  1.6696e-01,
        -3.5422e-01, -7.7544e-01,  1.8192e-01, -1.8540e-01,  6.6779e-01,
        -4.9712e-01, -1.0806e-02,  4.5290e-01,  3.8951e-01,  8.2806e-01,
         3.1588e-01,  3.2408e-01,  1.0106e+00, -3.1004e-01,  1.4348e+00,
         3.9465e-01, -1.1275e-01, -3.3617e-02,  6.0866e-01, -2.0182e-01,
         1.1027e+00,  7.9142e-01,  2.7995e-01,  3.0194e-01, -1.4521e-01,
         2.1563e-01, -3.4922e-01,  4.4477e-01, -3.5083e-01, -1.1443e-01,
         5.0118e-01,  8.7513e-01, -1.1783e+00,  8.1201e-01, -9.6401e-01,
         5.6507e-01,  2.2717e-01,  5.1867e-01, -4.0802e-01, -3.0123e-02,
         1.7403e-01,  1.0863e+00,  1.8856e+00, -5.3265e-01,  4.4486e-01,
         1.2276e-04,  9.1102e-02,  2.4947e-01,  7.7999e-01,  9.6519e-01,
         1.0176e+00,  2.6061e-01, -9.3483e-01,  4.4609e-01,  1.5797e+00,
        -2.1685e-01,  3.5906e-01,  5.5714e-01,  6.9

In [151]:
model.distilbert.transformer.layer[0].attention

MultiHeadSelfAttention(
  (dropout): Dropout(p=0.1, inplace=False)
  (q_lin): Linear(in_features=768, out_features=768, bias=True)
  (k_lin): Linear(in_features=768, out_features=768, bias=True)
  (v_lin): Linear(in_features=768, out_features=768, bias=True)
  (out_lin): Linear(in_features=768, out_features=768, bias=True)
)

In [14]:
# !pip install huggingface_hub
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)


-----
## Load Dataset
-----

In [48]:
train_df = pd.read_csv("/kaggle/input/wrong-binary-operator-dataset-train-test/wrong_binary_operator_dataset_filtered_train.csv", sep="\t")
test_df = pd.read_csv("/kaggle/input/wrong-binary-operator-dataset-train-test/wrong_binary_operator_dataset_filtered_test.csv", sep="\t")

In [49]:
train_df.sample(5)

Unnamed: 0,file,left,operator,right,type_left,type_right,parent,grandparent,start_line,start_column,end_line,end_column,labels
707482,cleaned/msm7x30-3.4.x-naa/fs/ceph/snap.c,new->ino,>=,r->ino,<dependent type>,<dependent type>,IF_STMT,COMPOUND_STMT,92,7,92,24,1
403443,cleaned/mosync/examples/cpp/wolf3d/wl_main.c,width,>,20,int,int,IF_STMT,COMPOUND_STMT,924,6,924,16,0
7295,cleaned/cvxopt/src/C/dsdp.c,n,/,2,int,int,UNEXPOSED_EXPR,CALL_EXPR,251,38,251,41,1
800096,cleaned/openldap/servers/slapd/back-perl/compa...,avalen,+,1,int,int,CALL_EXPR,UNEXPOSED_EXPR,39,22,39,32,0
533127,cleaned/linux-sensor/src/ringbuf.c,back,-,1,unsigned int,unsigned int,CALL_EXPR,CSTYLE_CAST_EXPR,25,56,25,64,1


In [50]:
test_df.sample(5)

Unnamed: 0,file,left,operator,right,type_left,type_right,parent,grandparent,start_line,start_column,end_line,end_column,labels
36832,cleaned/msm7x30-3.4.x-naa/drivers/scsi/bfa/bfa...,qe,!=,qh,struct list_head *,struct list_head *,PAREN_EXPR,BINARY_OPERATOR,4865,10,4865,18,0
92107,cleaned/bundler/lib/matrix/svd.c,j,*=,n,int,int,BINARY_OPERATOR,BINARY_OPERATOR,370,27,370,30,1
59440,cleaned/git-cheetah/finder/plugin.c,me->num_ref,>,0,<dependent type>,int,IF_STMT,COMPOUND_STMT,95,6,95,21,0
109137,cleaned/freebsd/contrib/gcc/tree-ssa-loop-pref...,!READ_CAN_USE_WRITE_PREFETCH,||,!ref->write_p,int,<dependent type>,BINARY_OPERATOR,IF_STMT,675,11,676,20,1
40563,cleaned/penguinspuzzle/matrix.c,-Q,+,near,float,float,BINARY_OPERATOR,COMPOUND_STMT,231,10,231,17,1


In [51]:
train_df.shape, test_df.shape

((1000786, 13), (111200, 13))

In [52]:
train_df.isna().sum()

file            0
left            0
operator        0
right           0
type_left       0
type_right      0
parent          0
grandparent     0
start_line      0
start_column    0
end_line        0
end_column      0
labels          0
dtype: int64

In [53]:
train_df['full_text'] = train_df['left'] + tokenizer.sep_token + train_df['operator'] + tokenizer.sep_token + train_df['right'] + tokenizer.sep_token + train_df['type_left'] + tokenizer.sep_token + train_df['type_right'] + tokenizer.sep_token + train_df['parent'] + tokenizer.sep_token + train_df['grandparent']
test_df['full_text'] = test_df['left'] + tokenizer.sep_token + test_df['operator'] + tokenizer.sep_token + test_df['right'] + tokenizer.sep_token + test_df['type_left'] + tokenizer.sep_token + test_df['type_right'] + tokenizer.sep_token + test_df['parent'] + tokenizer.sep_token + test_df['grandparent']

In [54]:
train_df.drop(['left','operator','right','type_left','type_right','parent', 'grandparent'], axis=1, inplace=True)
test_df.drop(['left','operator','right','type_left','type_right','parent', 'grandparent'], axis=1, inplace=True)

In [55]:
train_df.sample(5)

Unnamed: 0,file,start_line,start_column,end_line,end_column,labels,full_text
434626,cleaned/playbook-dev-tools/bootstrap/gcc/gcc/g...,74,15,74,33,1,i</s><=</s>ARRAY_SIZE(v)</s>int</s>int</s>FOR_...
309648,cleaned/gempak/gempak/source/programs/gui/nmap...,5251,12,5251,22,0,hhmm</s>%</s>100</s>int</s>int</s>BINARY_OPERA...
112078,cleaned/virtuoso-opensource/libsrc/Wi/sparql_s...,1039,11,1039,20,1,'%'</s><</s>c2</s>int</s>int</s>IF_STMT</s>COM...
113956,cleaned/torque/src/lib/Libifl/enc_JobFile.c,115,13,115,40,0,"diswcs(chan,buf,len)</s>!=</s>0</s>int</s>int<..."
739784,cleaned/msm7x30-3.4.x-naa/fs/buffer.c,1894,9,1894,27,1,block_start</s>>=</s>from</s>unsigned int</s>u...


In [56]:
test_df.sample(5)

Unnamed: 0,file,start_line,start_column,end_line,end_column,labels,full_text
101025,cleaned/asuswrt-merlin/release/src-rt-6.x.4708...,259,8,259,19,1,count</s>*</s>len</s>int</s>int</s>PAREN_EXPR<...
26903,cleaned/presto/src/prepfold_utils.c,277,35,277,68,1,"sscanf(line,""%lf"",&ts[nn])</s>!=</s>1</s>int</..."
58840,cleaned/dd-wrt/src/router/php5/Zend/zend_highl...,131,7,131,31,1,last_color</s>></s>next_color</s>char *</s>cha...
104680,cleaned/gles2framework/examples/invaders.c,417,37,417,70,1,aliens[n].explosion->vel[i*3]</s>-</s>t</s><de...
79986,cleaned/pyclaw/src/pyclaw/limiters/weno/recons...,4766,2,4766,39,1,-0.254365079365079</s>-</s>f[(i+1)*fsi]</s>dou...


In [57]:
# FOR TESTING PURPOSE
train_df = train_df.sample(int(train_df.shape[0] * 0.01))
test_df = test_df.sample(int(test_df.shape[0] * 0.01))

In [58]:
print(train_df.iloc[10].full_text)
print("-" * 80)
print(tokenizer.tokenize(train_df.iloc[10].full_text))
print("-" * 80)
print(tokenizer(train_df.iloc[10].full_text))

dsc</s>|</s>dsc->dbus_signal</s>const struct wpa_dbus_signal_desc *</s><dependent type></s>FOR_STMT</s>COMPOUND_STMT
--------------------------------------------------------------------------------
['dsc', '</s>', '|', '</s>', 'dsc', '->', 'dbus', '_', 'signal', '</s>', 'const', 'Ġstruct', 'Ġwpa', '_', 'dbus', '_', 'signal', '_', 'desc', 'Ġ*', '</s>', '<', 'dependent', 'Ġtype', '>', '</s>', 'FOR', '_', 'STMT', '</s>', 'COMP', 'OUND', '_', 'STMT']
--------------------------------------------------------------------------------
{'input_ids': [1, 18262, 2, 196, 2, 18262, 378, 7971, 167, 3394, 2, 964, 696, 5456, 167, 7971, 167, 3394, 167, 1369, 366, 2, 132, 8773, 1263, 134, 2, 2083, 167, 13997, 2, 2322, 17844, 167, 13997, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [59]:
def tokenize_text(examples):
    return tokenizer(examples["full_text"], truncation=True, max_length=128, padding=True)

In [60]:
train_df, validation_df = train_test_split(train_df, test_size=0.1, random_state=RANDOM_STATE)
train_df.shape, validation_df.shape

((9006, 7), (1001, 7))

In [61]:
train_dataset = Dataset.from_pandas(train_df)
train_dataset

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'full_text', '__index_level_0__'],
    num_rows: 9006
})

In [62]:
validation_dataset = Dataset.from_pandas(validation_df)
validation_dataset

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'full_text', '__index_level_0__'],
    num_rows: 1001
})

In [63]:
test_dataset = Dataset.from_pandas(test_df)
test_dataset

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'full_text', '__index_level_0__'],
    num_rows: 1112
})

-----
## Tokenization
-----

In [64]:
train_dataset = train_dataset.map(tokenize_text, batched=True, remove_columns=["full_text", "__index_level_0__"])
train_dataset

  0%|          | 0/10 [00:00<?, ?ba/s]

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 9006
})

In [65]:
validation_dataset = validation_dataset.map(tokenize_text, batched=True, remove_columns=["full_text", "__index_level_0__"])
validation_dataset

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 1001
})

In [66]:
test_dataset = test_dataset.map(tokenize_text, batched=True, remove_columns=["full_text"])
test_dataset

  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['file', 'start_line', 'start_column', 'end_line', 'end_column', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 1112
})

In [67]:
def softmax(x):
    result = np.zeros_like(x)
    
    for i in range(len(x)):
        result[i] = np.exp(x[i]) / np.sum(np.exp(x[i]), axis=0)
    return result

In [68]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # loss = log_loss(labels, softmax(preds))
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1}

-----
## Model Training
-----

In [69]:
# ! rm -rd /kaggle/working/codeT5-DistilBERT-wrong-binary-operator-bug-model

batch_size = 64
logging_steps = len(train_dataset) // batch_size
output_dir = "codeT5-DistilBERT-wrong-binary-operator-bug-model"

training_args = TrainingArguments(output_dir,
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay = 0.01,
                                  evaluation_strategy="epoch",
                                  logging_steps=logging_steps,
                                  save_strategy="epoch",
                                  # save_steps=10000,
                                  # fp16=True,
                                  push_to_hub=False,
                                  load_best_model_at_end=True,
                                  metric_for_best_model='eval_loss',
                                  greater_is_better=False
                                )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [71]:
class CustomCallback(TrainerCallback):
    
    def __init__(self, trainer, test_dataset) -> None:
        super().__init__()
        self._trainer = trainer
        self.test_dataset = test_dataset

    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            control_copy = deepcopy(control)
            print("Epoch:", state.epoch)
            print(self._trainer.predict(self._trainer.train_dataset, metric_key_prefix="train").metrics)
            print(self._trainer.predict(self._trainer.eval_dataset, metric_key_prefix="eval").metrics)
            print(self._trainer.predict(self.test_dataset, metric_key_prefix="test").metrics)
            
            train = self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            self._trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
            return control_copy

In [72]:
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,      # number of evaluations to wait before stopping
    early_stopping_threshold=0.01,  # threshold for relative improvement in metric
)

In [29]:
learning_rates = [0.001, 0.0001, 0.00001, 0.000001]

# Loop over the learning rates
for lr in learning_rates:
    # Update the learning rate in the TrainingArguments
    training_args.learning_rate = lr

    # Create a new Trainer with the updated TrainingArguments
    trainer = Trainer(
        model=model,                         
        args=training_args,                  
        train_dataset=train_dataset,         
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping]
    )

    # Train the model and evaluate it on the validation set
    trainer.train()
    print(trainer.evaluate())

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: end_line, start_line, start_column, end_column, file. If end_line, start_line, start_column, end_column, file are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 900707
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 70370


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 score
1,No log,0.69315,0.500595,0.0,0.0,0.0
2,0.693600,0.693243,0.499405,0.499405,1.0,0.666138


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: end_line, start_line, start_column, end_column, file. If end_line, start_line, start_column, end_column, file are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100079
  Batch size = 128
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to codeT5-DistilBERT-wrong-binary-operator-bug-model/checkpoint-7037
Configuration saved in codeT5-DistilBERT-wrong-binary-operator-bug-model/checkpoint-7037/config.json
Model weights saved in codeT5-DistilBERT-wrong-binary-operator-bug-model/checkpoint-7037/pytorch_model.bin
tokenizer config file saved in codeT5-DistilBERT-wrong-binary-operator-bug-model/checkpoint-7037/tokenizer_config.json
Special tokens file saved in codeT5-DistilBERT-wrong-binary-operator-bug-model/checkp

KeyboardInterrupt: 

In [None]:
run the code below for final model training [this is for safety as it throws error and prevents further execution of code]

In [77]:
# ! rm -rd /kaggle/working/codeT5-DistilBERT-wrong-binary-operator-bug-model

BEST_LEARNING_RATE = 2e-5
training_args.learning_rate = BEST_LEARNING_RATE
training_args.push_to_hub = False

trainer = CustomTrainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

trainer.add_callback(CustomCallback(trainer, test_dataset)) 

{'model': DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=Fals

In [78]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: end_line, start_column, start_line, end_column, file. If end_line, start_column, start_line, end_column, file are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9006
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 710


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

----
## Model Testing
----

In [32]:
prediction = trainer.predict(test_dataset)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: start_line, file, end_line, start_column, end_column. If start_line, file, end_line, start_column, end_column are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 111200
  Batch size = 128


In [33]:
prediction

PredictionOutput(predictions=array([[ 4.0680594 , -4.4672275 ],
       [ 0.72966844, -0.550992  ],
       [-5.86103   ,  5.4859776 ],
       ...,
       [-5.3612723 ,  4.909022  ],
       [ 0.6064074 , -0.45148978],
       [ 0.1363936 , -0.06151958]], dtype=float32), label_ids=array([0, 0, 1, ..., 1, 0, 0]), metrics={'test_loss': 0.35222548246383667, 'test_Accuracy': 0.8443255395683453, 'test_Precision': 0.8651927589035348, 'test_Recall': 0.8157553956834532, 'test_F1 Score': 0.8397470908973089, 'test_runtime': 149.0413, 'test_samples_per_second': 746.102, 'test_steps_per_second': 5.831})

In [34]:
labels = prediction.label_ids
labels

array([0, 0, 1, ..., 1, 0, 0])

In [35]:
preds = prediction.predictions.argmax(-1)
preds

array([0, 0, 1, ..., 1, 0, 0])

In [36]:
f1_score(labels, preds)

0.8397470908973089

In [39]:
test_df

Unnamed: 0,file,left,operator,right,type_left,type_right,parent,grandparent,start_line,start_column,end_line,end_column,labels
0,cleaned/poropy/poropy/nucleardata/nucleardata_...,cWABA92[11],*,"pow(R,3)",double,double,BINARY_OPERATOR,BINARY_OPERATOR,482,248,482,268,0
1,cleaned/quagga/lib/plist.c,maxseq,<,pentry->seq,int,int,IF_STMT,COMPOUND_STMT,389,11,389,31,0
2,cleaned/c30-linux/pic30-binutils/acme/gas/conf...,"strcasecmp(start,""a2"")",>,0,int,int,IF_STMT,IF_STMT,1221,14,1221,43,1
3,cleaned/fontforge/fontforge/scstylesui.c,100.0,*,hs->counter_percent,double,<dependent type>,CALL_EXPR,COMPOUND_STMT,2729,30,2729,56,0
4,cleaned/open-watcom/bld/wv/c/dlgscomp.c,last,/,first,unsigned int,unsigned int,BINARY_OPERATOR,COMPOUND_STMT,92,15,92,27,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
111195,cleaned/uhub/src/adc/message.c,*str,>=,'\\',int,int,IF_STMT,IF_STMT,913,13,913,25,1
111196,cleaned/blensor/source/blender/editors/space_s...,barh,+,1,int,int,BINARY_OPERATOR,FOR_STMT,431,18,431,26,1
111197,cleaned/freebsd/sys/cddl/contrib/opensolaris/u...,1,&,epbs,int,int,VAR_DECL,DECL_STMT,480,13,480,22,1
111198,cleaned/drone/lib/ffmpeg/libavcodec/jrevdct.c,d00,+,d10,int,int,PAREN_EXPR,BINARY_OPERATOR,1115,25,1115,34,0


In [41]:
test_df["predictions"] = preds

In [42]:
test_df

Unnamed: 0,file,left,operator,right,type_left,type_right,parent,grandparent,start_line,start_column,end_line,end_column,labels,predictions
0,cleaned/poropy/poropy/nucleardata/nucleardata_...,cWABA92[11],*,"pow(R,3)",double,double,BINARY_OPERATOR,BINARY_OPERATOR,482,248,482,268,0,0
1,cleaned/quagga/lib/plist.c,maxseq,<,pentry->seq,int,int,IF_STMT,COMPOUND_STMT,389,11,389,31,0,0
2,cleaned/c30-linux/pic30-binutils/acme/gas/conf...,"strcasecmp(start,""a2"")",>,0,int,int,IF_STMT,IF_STMT,1221,14,1221,43,1,1
3,cleaned/fontforge/fontforge/scstylesui.c,100.0,*,hs->counter_percent,double,<dependent type>,CALL_EXPR,COMPOUND_STMT,2729,30,2729,56,0,0
4,cleaned/open-watcom/bld/wv/c/dlgscomp.c,last,/,first,unsigned int,unsigned int,BINARY_OPERATOR,COMPOUND_STMT,92,15,92,27,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111195,cleaned/uhub/src/adc/message.c,*str,>=,'\\',int,int,IF_STMT,IF_STMT,913,13,913,25,1,1
111196,cleaned/blensor/source/blender/editors/space_s...,barh,+,1,int,int,BINARY_OPERATOR,FOR_STMT,431,18,431,26,1,0
111197,cleaned/freebsd/sys/cddl/contrib/opensolaris/u...,1,&,epbs,int,int,VAR_DECL,DECL_STMT,480,13,480,22,1,1
111198,cleaned/drone/lib/ffmpeg/libavcodec/jrevdct.c,d00,+,d10,int,int,PAREN_EXPR,BINARY_OPERATOR,1115,25,1115,34,0,0


In [46]:
positive_probabilities = softmax(prediction.predictions)[:, 1]
positive_probabilities

array([1.9637523e-04, 2.1743780e-01, 9.9998820e-01, ..., 9.9996531e-01,
       2.5771150e-01, 4.5068255e-01], dtype=float32)

In [47]:
test_df["positive_probabilities"] = positive_probabilities
test_df

Unnamed: 0,file,left,operator,right,type_left,type_right,parent,grandparent,start_line,start_column,end_line,end_column,labels,predictions,positive_probabilities
0,cleaned/poropy/poropy/nucleardata/nucleardata_...,cWABA92[11],*,"pow(R,3)",double,double,BINARY_OPERATOR,BINARY_OPERATOR,482,248,482,268,0,0,0.000196
1,cleaned/quagga/lib/plist.c,maxseq,<,pentry->seq,int,int,IF_STMT,COMPOUND_STMT,389,11,389,31,0,0,0.217438
2,cleaned/c30-linux/pic30-binutils/acme/gas/conf...,"strcasecmp(start,""a2"")",>,0,int,int,IF_STMT,IF_STMT,1221,14,1221,43,1,1,0.999988
3,cleaned/fontforge/fontforge/scstylesui.c,100.0,*,hs->counter_percent,double,<dependent type>,CALL_EXPR,COMPOUND_STMT,2729,30,2729,56,0,0,0.047171
4,cleaned/open-watcom/bld/wv/c/dlgscomp.c,last,/,first,unsigned int,unsigned int,BINARY_OPERATOR,COMPOUND_STMT,92,15,92,27,1,1,0.999975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111195,cleaned/uhub/src/adc/message.c,*str,>=,'\\',int,int,IF_STMT,IF_STMT,913,13,913,25,1,1,0.999978
111196,cleaned/blensor/source/blender/editors/space_s...,barh,+,1,int,int,BINARY_OPERATOR,FOR_STMT,431,18,431,26,1,0,0.169280
111197,cleaned/freebsd/sys/cddl/contrib/opensolaris/u...,1,&,epbs,int,int,VAR_DECL,DECL_STMT,480,13,480,22,1,1,0.999965
111198,cleaned/drone/lib/ffmpeg/libavcodec/jrevdct.c,d00,+,d10,int,int,PAREN_EXPR,BINARY_OPERATOR,1115,25,1115,34,0,0,0.257711


In [48]:
test_df.to_csv("WBO_test_dataset_predictions.csv", sep="\t", index=False)