In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# os.environ["WANDB_DISABLED"] = "true"
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# ! wget https://zenodo.org/record/3628775/files/c-corpus.tar.gz?download=1
# ! tar -xzf "/kaggle/working/c-corpus.tar.gz?download=1"
# ! rm /kaggle/working/c-corpus.tar.gz?download=1

In [None]:
! pip install transformers
! pip install datasets

from transformers import AutoModelForSequenceClassification
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration # RobertaTokenizer

In [None]:
# Class index -> class name table handed to the classification head.
id2label = dict(enumerate(["CORRECT", "BUGGY"]))
# Inverse table, derived from id2label so the two mappings cannot drift apart.
label2id = {name: index for index, name in id2label.items()}

In [None]:
# tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-base')
# Tokenizer comes from the CodeT5-derived repo on the Hub.
tokenizer = AutoTokenizer.from_pretrained('dipudl/codet5-base')

# The classifier starts from DistilBERT weights with a fresh 2-way head whose
# label names come from id2label / label2id.
# NOTE(review): the tokenizer and the model come from different checkpoints --
# confirm that pairing a CodeT5 tokenizer with DistilBERT weights is intended.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id)

# The model's embedding matrix is sized for DistilBERT's own vocabulary, not for
# this tokenizer. Resize it so every id the tokenizer can emit has an embedding
# row; otherwise ids beyond the original table raise an index error in training.
if len(tokenizer) != model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

In [None]:
import os

def get_tokenizer_training_corpus(corpus_dir="/kaggle/working/cleaned"):
    """Lazily yield the text of every ``.c`` file below ``corpus_dir``.

    Args:
        corpus_dir: Root directory that is walked recursively. Defaults to the
            location previously hard-coded in this function.

    Yields:
        A one-element list holding the decoded text of a single ``.c`` file,
        i.e. each file is its own batch of size one.
    """
    total = 0
    for root, _dirs, files in os.walk(corpus_dir):
        for file in files:
            if not file.endswith(".c"):
                continue
            file_path = os.path.join(root, file)

            # Read raw bytes and decode explicitly. Calling str() on the bytes
            # would produce the "b'...'" repr with escaped newlines instead of
            # the real source text. Undecodable bytes are replaced rather than
            # aborting the whole corpus walk.
            with open(file_path, 'rb') as f:
                content = f.read().decode("utf-8", errors="replace")

            total += 1
            # Light progress indicator for long walks.
            if total % 10000 == 0:
                print(total)
            yield [content]
                        
#     dataset = train_dataset
#     for start_idx in range(0, len(dataset), 1000):
#         samples = dataset[start_idx : start_idx + 1000]
#         yield samples["full_text"]

In [None]:
# get_tokenizer_training_corpus is a generator function, so this call reads no
# files yet; they are only opened as the generator is iterated. The generator
# can be consumed once -- call the function again for a fresh pass.
tokenizer_training_corpus = get_tokenizer_training_corpus()
tokenizer_training_corpus

In [None]:
# fine_tuned_tokenizer = tokenizer.train_new_from_iterator(tokenizer_training_corpus, vocab_size=20_000)

In [None]:
!pip install huggingface_hub
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token(YOUR_HUGGING_FACE_TOKEN_HERE)"

# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
# fine_tuned_tokenizer.save_pretrained("codet5-base")
# fine_tuned_tokenizer.push_to_hub("codet5-base")

In [None]:
# tokenizer = AutoTokenizer.from_pretrained('dipudl/codet5-base')

## Training

In [None]:
# Load the tab-separated dataset into a DataFrame.
# NOTE(review): the file name contains a space before "_swap" -- confirm it
# matches the actual file in the Kaggle input dataset.
df = pd.read_csv("/kaggle/input/c-code-from-c-code-corpus/data_function _swap_final_mid_term.txt", sep="\t")

In [None]:
# Preview the loaded frame.
df

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
import missingno as msno
import pandas as pd

# Visual overview of where values are missing across df's rows and columns.
msno.matrix(df)

In [None]:
# visualize missing data using bar charts
msno.bar(df)

In [None]:
df.drop_duplicates(ignore_index=True, inplace=True)

In [None]:
df

In [None]:
df.isna().sum()

In [None]:
# df['function_name'] = df['function_name'].fillna('[UNK]')
# df['arg1']=df['arg1'].fillna('[UNK]')
# df['arg2']=df['arg2'].fillna('[UNK]')
# df['arg_type']=df['arg_type'].fillna('[UNK]')
# df['param1']=df['param1'].fillna('[UNK]')
# df['param2']=df['param2'].fillna('[UNK]')

df['function_name'] = df['function_name'].fillna('')
df['arg1']=df['arg1'].fillna('')
df['arg2']=df['arg2'].fillna('')
df['arg_type']=df['arg_type'].fillna('')
df['param1']=df['param1'].fillna('')
df['param2']=df['param2'].fillna('')

In [None]:
df.isna().sum()

In [None]:
df['full_text'] = df['function_name'] + ' [SEP] '+ df['arg1'] + ' [SEP] '+ df['arg2'] + ' [SEP] '+ df['arg_type'] + ' [SEP] '+ df['param1'] + ' [SEP] ' + df['param2']
# df['full_text'] = df['function_name'] + ' '+ df['arg1'] + ' '+ df['arg2'] + ' '+ df['arg_type'] + ' '+ df['param1'] + ' ' + df['param2']

In [None]:
df.drop(['function_name','arg1','arg2','arg_type','param1','param2'],axis=1,inplace=True)

In [None]:
df

In [None]:
def tokenize_text(examples):
    """Tokenize the ``full_text`` entries of a batch with the notebook's tokenizer.

    Inputs are truncated to at most 100 tokens and padding is enabled
    (``padding=True``). The tokenizer's output is returned unchanged.
    """
    tokenizer_options = {"truncation": True, "max_length": 100, "padding": True}
    return tokenizer(examples["full_text"], **tokenizer_options)

In [None]:
df = df.sample(frac = 1) # shuffling the dataset
df.reset_index(drop=True,inplace=True)
df

In [None]:
df_train, df_test = train_test_split(df, test_size=0.1)

In [None]:
df_train

In [None]:
df_test

In [None]:
# Wrap the pandas training split in a Hugging Face Dataset.
# NOTE(review): the later map() call removes an "__index_level_0__" column,
# presumably the pandas index carried over by from_pandas -- confirm.
train_dataset = Dataset.from_pandas(df_train)
train_dataset

In [None]:
# Same conversion for the held-out test split.
test_dataset = Dataset.from_pandas(df_test)
test_dataset

In [None]:
train_dataset = train_dataset.map(tokenize_text, batched=True, drop_last_batch=True, remove_columns=["full_text", "__index_level_0__"])
train_dataset

In [None]:
test_dataset = test_dataset.map(tokenize_text, batched=True, drop_last_batch=True, remove_columns=["full_text", "__index_level_0__"])
test_dataset

In [None]:
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
    }

In [None]:
# Carve 10% of the training data off as the per-epoch evaluation set. The
# result has a "train" and a "test" split. A fixed seed keeps the split
# identical across runs so epoch metrics are comparable.
train_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset

In [None]:
from transformers import TrainingArguments

batch_size = 32
output_dir = "function-arg-swap-model"

# Number of full batches in the training split; used as the logging interval.
logging_steps = len(train_dataset["train"]) // batch_size

# Hyper-parameters gathered in one place; unpacked into TrainingArguments below.
# (save_steps=10000 and fp16=True were tried earlier and are left disabled.)
training_options = dict(
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    save_strategy="epoch",
    push_to_hub=False,
    report_to="wandb",
)
training_args = TrainingArguments(output_dir, **training_options)

In [None]:
from transformers import Trainer, DefaultDataCollator

# Name every Trainer input first, then build the trainer from that mapping.
# Training uses the "train" split; per-epoch evaluation uses the "test" split
# produced by train_dataset.train_test_split.
trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset["train"],
    "eval_dataset": train_dataset["test"],
    "tokenizer": tokenizer,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_inputs)

In [None]:
# Run fine-tuning; whatever trainer.train() returns is kept in `history`.
history = trainer.train()

In [None]:
# Save the trained model to a local directory.
# NOTE(review): the directory name hard-codes "codet5" and an F1 of 87.7, while
# the model above is loaded from 'distilbert-base-uncased' -- confirm the name
# still describes what is being saved.
trainer.save_model("codet5-87.7f1-10ep")

-----
## Testing
-----

In [None]:
prediction = trainer.predict(test_dataset)

In [None]:
prediction

In [None]:
predictions, labels, _ = trainer.predict(test_dataset)

In [None]:
predictions = np.argmax(predictions, axis=1)
predictions

In [None]:
f1_score(predictions, test_dataset["labels"])

In [None]:
from sklearn.metrics import confusion_matrix
print("Confusion Matrix: \n", confusion_matrix(predictions, test_dataset["labels"]))

In [None]:
# Import the required libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Data: computed from the current predictions rather than pasted from an old
# run, so the plot cannot go stale. Rows are actual classes and columns are
# predicted classes, matching the axis labels set below.
a = confusion_matrix(test_dataset["labels"], predictions)

# Select Confusion Matrix Size (one figure only; a second plt.figure call
# would leave an empty extra figure in the output)
plt.figure(figsize = (10,8))

# Create Confusion Matrix and show percentages
b = sns.heatmap(a/np.sum(a), annot=True, fmt='.1%')

# Set the Title
b.set(title='Confusion Matrix')

# Set the Labels
b.set(xlabel='Predicted', ylabel='Actual')

# Display the Confusion Matrix
plt.show()

-----
## Inference
-----

In [None]:
# List the contents of the training output directory.
# NOTE(review): assumes the working directory is /kaggle/working, so that the
# relative output_dir "function-arg-swap-model" resolves to this path.
! ls /kaggle/working/function-arg-swap-model

In [None]:
train