In [2]:
import os
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Process-wide switches, set here before `transformers` / `datasets` are imported in a later cell.
# NOTE(review): presumably turns off the Trainer's Weights & Biases reporting — confirm.
os.environ["WANDB_DISABLED"] = "true"
# NOTE(review): presumably silences the tokenizers parallelism/fork warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
# ! wget https://zenodo.org/record/3628775/files/c-corpus.tar.gz?download=1
# ! tar -xzf "/kaggle/working/c-corpus.tar.gz?download=1"
# ! rm /kaggle/working/c-corpus.tar.gz?download=1

In [4]:
# ! pip install transformers
# ! pip install datasets

from transformers import AutoModelForSequenceClassification
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer # T5ForConditionalGeneration # RobertaTokenizer

In [5]:
# Class-index <-> class-name lookup tables; both are passed to the model's from_pretrained call.
id2label = dict(enumerate(("CORRECT", "BUGGY")))
label2id = {name: index for index, name in id2label.items()}

In [6]:
# tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-base')
# fine_tuned_tokenizer = AutoTokenizer.from_pretrained('dipudl/codet5-base')
tokenizer = AutoTokenizer.from_pretrained('dipudl/codet5-base')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id)

# The tokenizer and the encoder are loaded from two different checkpoints, so the ids the
# tokenizer emits are unrelated to the encoder's pretrained vocabulary. If the tokenizer can
# produce an id beyond the encoder's embedding table, the embedding lookup fails with an
# index error during the forward pass; grow the table so every tokenizer id is valid.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/276k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/148k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

In [7]:
import os

def get_tokenizer_training_corpus(corpus_dir="/kaggle/working/cleaned", log_every=10000):
    """Lazily yield the text of every ``.c`` file found under ``corpus_dir``.

    Each yielded item is a one-element list ``[source_text]``, i.e. a batch of
    size one.

    Args:
        corpus_dir: Root directory that is walked recursively.
        log_every: Print the running file count whenever it reaches a multiple
            of this value; pass 0 or None to disable the progress output.

    Yields:
        list[str]: A single-element list holding the decoded file contents.
    """
    total = 0
    for root, dirs, files in os.walk(corpus_dir):
        # os.walk honours in-place edits of `dirs`; sorting makes the traversal order deterministic.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".c"):
                continue
            file_path = os.path.join(root, file)

            with open(file_path, 'rb') as f:
                raw = f.read()

            # str() on a bytes object returns its repr ("b'...'" with escaped newlines), not the
            # source text. Decode instead; undecodable bytes become U+FFFD rather than raising.
            content = raw.decode("utf-8", errors="replace")
            total += 1
            if log_every and total % log_every == 0:
                print(total)
            yield [content]
                        
#     dataset = train_dataset
#     for start_idx in range(0, len(dataset), 1000):
#         samples = dataset[start_idx : start_idx + 1000]
#         yield samples["full_text"]

In [8]:
# Calling a generator function only creates the generator object; no file is read until it is
# iterated. A generator can be consumed once — call get_tokenizer_training_corpus() again for
# a second pass over the corpus.
tokenizer_training_corpus = get_tokenizer_training_corpus()
tokenizer_training_corpus

<generator object get_tokenizer_training_corpus at 0x7ff702834050>

In [9]:
!pip install huggingface_hub
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"

# from huggingface_hub import notebook_login

# notebook_login()

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)


In [10]:
import pandas as pd
import missingno as mno

In [11]:
# Load the tab-separated train and test splits from the Kaggle input directory.
train_csv_path = "/kaggle/input/final-code-of-c-code-corpus-train-test-split/func_args_dataset_filtered_train.csv"
test_csv_path = "/kaggle/input/final-code-of-c-code-corpus-train-test-split/func_args_dataset_filtered_test.csv"
df_train = pd.read_csv(train_csv_path, sep="\t")
df_test = pd.read_csv(test_csv_path, sep="\t")

In [12]:
# Keep only the first 20000 training rows.
df_train = df_train.head(20000)

In [13]:
# Validation rows are the first 2000 rows of the loaded test CSV. The test set is taken from
# rows 2000-4000 of the same frame, so the validation slice has to stop at 2000: the previous
# [:20000] slice contained every test row, leaking the test set into validation.
# This cell must run before `df_test` is narrowed to its own slice.
# .copy() gives an independent frame, decoupled from the loaded test data.
df_val = df_test.iloc[:2000].copy()

In [16]:
# Narrow the test frame to rows 2000-3999 of the loaded CSV.
# Caution: this rebinds `df_test` to a slice of itself, so running the cell a second time
# slices the already-narrowed frame and leaves it empty.
df_test = df_test.iloc[2000:4000]

In [18]:
# Replace missing values in each text field of the training frame with the literal '[UNK]'.
for text_col in ('function_name', 'arg1', 'arg2', 'arg_type', 'param1', 'param2'):
    df_train[text_col] = df_train[text_col].fillna('[UNK]')

In [19]:
# Replace missing values in each text field of the validation frame with the literal '[UNK]'.
for text_col in ('function_name', 'arg1', 'arg2', 'arg_type', 'param1', 'param2'):
    df_val[text_col] = df_val[text_col].fillna('[UNK]')

In [20]:
# Replace missing values in each text field of the test frame with the literal '[UNK]'.
for text_col in ('function_name', 'arg1', 'arg2', 'arg_type', 'param1', 'param2'):
    df_test[text_col] = df_test[text_col].fillna('[UNK]')

In [21]:
# Build the model input: the six text fields joined left to right with single spaces.
train_text = df_train['function_name']
for text_col in ('arg1', 'arg2', 'arg_type', 'param1', 'param2'):
    train_text = train_text + ' ' + df_train[text_col]
df_train['full_text'] = train_text

In [22]:
# Build the model input: the six text fields joined left to right with single spaces.
val_text = df_val['function_name']
for text_col in ('arg1', 'arg2', 'arg_type', 'param1', 'param2'):
    val_text = val_text + ' ' + df_val[text_col]
df_val['full_text'] = val_text

In [23]:
# Build the model input: the six text fields joined left to right with single spaces.
test_text = df_test['function_name']
for text_col in ('arg1', 'arg2', 'arg_type', 'param1', 'param2'):
    test_text = test_text + ' ' + df_test[text_col]
df_test['full_text'] = test_text

In [None]:
# Shuffle the training rows. The fixed seed makes the row order — and therefore every result
# derived from it — reproducible across kernel restarts; reset_index(drop=True) discards the
# pre-shuffle row labels.
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_train.head()

In [None]:
# Shuffle the test rows with a fixed seed so the order is reproducible across kernel restarts;
# reset_index(drop=True) discards the pre-shuffle row labels.
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
df_test.head()

In [None]:
# The validation frame is prepared under the name `df_val` in the earlier cells, and
# `df_validation` is not assigned anywhere before this point, so reading it here raised
# NameError on a fresh kernel. Derive it from `df_val` instead.
# The fixed seed keeps the shuffled order reproducible; reset_index(drop=True) discards the
# pre-shuffle row labels.
df_validation = df_val.sample(frac=1, random_state=42).reset_index(drop=True)
df_validation.head()

In [None]:
# Wrap the training DataFrame as a `datasets.Dataset`; the bare name below displays its summary.
train_dataset = Dataset.from_pandas(df_train)
train_dataset

In [None]:
# Wrap the validation DataFrame as a `datasets.Dataset`; the bare name below displays its summary.
# NOTE(review): the fill/concatenate cells build the validation frame as `df_val` — confirm that
# `df_validation` refers to that same prepared frame.
validation_dataset = Dataset.from_pandas(df_validation)
validation_dataset

In [None]:
# Wrap the test DataFrame as a `datasets.Dataset`; the bare name below displays its summary.
test_dataset = Dataset.from_pandas(df_test)
test_dataset

In [None]:
def tokenize_text(examples):
    """Tokenize the ``full_text`` field of ``examples`` with the notebook-level ``tokenizer``.

    Inputs are truncated to at most 100 tokens and padded (``padding=True``).
    Returns whatever the tokenizer call returns.
    """
    texts = examples["full_text"]
    encoded = tokenizer(texts, truncation=True, max_length=100, padding=True)
    return encoded

In [None]:
train_dataset

In [None]:
# Tokenize the training set. batched=True hands tokenize_text many rows per call; the raw
# `full_text` column is dropped from the result.
train_dataset = train_dataset.map(tokenize_text, batched=True, remove_columns=["full_text"])
train_dataset

In [None]:
# Tokenize the validation set. batched=True hands tokenize_text many rows per call; the raw
# `full_text` column is dropped from the result.
validation_dataset = validation_dataset.map(tokenize_text, batched=True, remove_columns=["full_text"])
validation_dataset

In [None]:
# Tokenize the test set. batched=True hands tokenize_text many rows per call; the raw
# `full_text` column is dropped from the result.
test_dataset = test_dataset.map(tokenize_text, batched=True, remove_columns=["full_text"])
test_dataset

In [None]:
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall" and "F1 Score".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scorers = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1 Score": f1_score,
    }
    return {metric_name: scorer(y_true, y_pred) for metric_name, scorer in scorers.items()}

In [None]:
import math

In [None]:
from transformers import TrainingArguments

batch_size = 32

logging_steps = len(train_dataset) // batch_size
output_dir = "function-arg-swap-model-148k-files-365k-samples"
training_args = TrainingArguments(output_dir,
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay = 0.01,
                                  evaluation_strategy="epoch",
                                  logging_steps=logging_steps,
                                  save_strategy="steps",
                                  save_steps=math.ceil(len(train_dataset) / (batch_size * 2)) * 5,
                                  # fp16=True,
                                  push_to_hub=True,
                                )+