In [None]:
# Google Colab setup
!pip install transformers
!pip install torch
!pip install unsloth



In [None]:
import os
# Single source of truth for the Hugging Face cache location; exported through
# HF_HOME before any Hugging Face library is imported in later cells.
cache_dir_path = "/content/hf-cache"
os.environ["HF_HOME"] = cache_dir_path

In [None]:
import wandb
# Authenticate with Weights & Biases (the training configuration later in this
# notebook sets report_to='wandb').
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marpitkaple13[0m ([33marpitkaple13-rbu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
from datasets import Dataset, load_from_disk
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
# Hugging Face Hub checkpoint id used for the tokenizer and the models loaded later.
model_ckpt = "microsoft/codebert-base"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the dataset paths and the training
# output directory used below live there.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load the train / valid / test splits previously saved to Google Drive.
# The generator is consumed in order, so the splits are read in the same
# sequence as before: train, valid, test.
clf_ds, val_clf_ds, test_clf_ds = (
    load_from_disk(split_path)
    for split_path in (
        '/content/drive/MyDrive/major_project_models/dataset_tokenized_distilbert/train',
        '/content/drive/MyDrive/major_project_models/dataset_tokenized_distilbert/valid',
        '/content/drive/MyDrive/major_project_models/dataset_tokenized_distilbert/test',
    )
)

def preprocess_function(examples):
    """Tokenize the ``code`` field of a batch with the notebook-level tokenizer.

    Args:
        examples: A batch mapping that provides a ``"code"`` entry.

    Returns:
        The tokenizer output for ``examples["code"]``, produced with
        ``truncation=True`` and ``padding=True``.
    """
    source_snippets = examples["code"]
    encoded = tokenizer(source_snippets, padding=True, truncation=True)
    return encoded
# Tokenize every split in batched mode (train, then valid, then test).
tokenized_clf, tokenized_val_clf, tokenized_test_clf = (
    split.map(preprocess_function, batched=True)
    for split in (clf_ds, val_clf_ds, test_clf_ds)
)

def rename_cols(example):
    """Expose one record's ``code`` and ``y`` fields as ``text`` and ``label``.

    Args:
        example: A single record providing ``"code"`` and ``"y"`` entries.

    Returns:
        A new dict with exactly two keys: ``"text"`` (the value of ``code``)
        and ``"label"`` (the value of ``y``).
    """
    renamed = dict(text=example['code'], label=example['y'])
    return renamed
# Add the text/label columns to every split and drop the original bookkeeping
# columns in the same pass (train, then valid, then test).
tokenized_clf_rename, val_tokenized_clf_rename, test_tokenized_clf_rename = (
    split.map(rename_cols, remove_columns=['id', 'y', 'idx', 'code', 'seq_len'])
    for split in (tokenized_clf, tokenized_val_clf, tokenized_test_clf)
)

In [None]:
# Display the test split to check its columns and row count after renaming.
test_tokenized_clf_rename

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'label'],
    num_rows: 6023
})

In [None]:
from datasets import DatasetDict

# Bundle the three prepared splits under the keys used by the rest of the notebook.
dataset = DatasetDict({
    'train': tokenized_clf_rename,
    'val': val_tokenized_clf_rename,
    'test': test_tokenized_clf_rename,
})

In [None]:
# Display the assembled DatasetDict (features and row counts per split).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'label'],
        num_rows: 36750
    })
    val: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'label'],
        num_rows: 5956
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'label'],
        num_rows: 6023
    })
})

In [None]:
from transformers import AutoModel
import torch

# Checkpoint id (same value as assigned in the tokenizer cell).
model_ckpt = "microsoft/codebert-base"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the checkpoint through AutoModel and move it onto the selected device;
# extract_hidden_states reads `model` and `device` as globals.
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
def extract_hidden_states(batch):
    """Run the global ``model`` on a batch and return its first-position hidden state.

    Args:
        batch: Mapping of column name to tensor. Only the entries whose name is
            listed in ``tokenizer.model_input_names`` are forwarded to the model.

    Returns:
        ``{"hidden_state": ndarray}`` holding ``last_hidden_state[:, 0]`` for
        every row of the batch, moved to the CPU and converted to NumPy.
    """
    accepted_names = tokenizer.model_input_names

    # Move just the model inputs onto the compute device.
    model_inputs = {}
    for name, tensor in batch.items():
        if name in accepted_names:
            model_inputs[name] = tensor.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**model_inputs)

    # Keep the vector at sequence position 0 (the [CLS]-style token).
    first_position = outputs.last_hidden_state[:, 0]
    return {"hidden_state": first_position.cpu().numpy()}

In [None]:
# Make the listed columns come back as torch tensors; extract_hidden_states
# calls .to(device) on them. This changes `dataset` in place.
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# Add a "hidden_state" column to every split, running the model on 8 rows per call.
# NOTE(review): the fine-tuning cells visible in this notebook do not read
# "hidden_state" — confirm this (expensive) pass is still needed.
dataset_hidden = dataset.map(extract_hidden_states, batched=True, batch_size = 8)
# Show the resulting column names of the test split.
dataset_hidden["test"].column_names

['text', 'input_ids', 'attention_mask', 'label', 'hidden_state']

In [None]:
from transformers import AutoModelForSequenceClassification
# Two target classes. This rebinds `model` from the bare encoder used for
# hidden-state extraction to a model with a sequence-classification head.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer, TrainingArguments

# Per-device micro-batch size and number of micro-batches accumulated per
# optimizer step (8 * 8 = 64 sequences per optimizer step on one device).
batch_size = 8
gradient_accumulation_steps = 8

# Checkpoints are written to Google Drive. model_ckpt contains a "/", so the
# run ends up in a nested "microsoft/" sub-directory of this path.
model_name = f"/content/drive/MyDrive/bert/{model_ckpt}-codebert-1024"

training_args = TrainingArguments(output_dir=model_name,
                                  # max_steps is the only stopping criterion: a
                                  # positive max_steps overrides num_train_epochs,
                                  # so the former num_train_epochs=2 had no effect
                                  # and was dropped (1280 steps is about 2.2 epochs
                                  # on this training split).
                                  max_steps=1280,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  gradient_accumulation_steps=gradient_accumulation_steps,
                                  # Mixed precision needs CUDA; stay consistent with
                                  # the CPU fallback used when choosing `device`.
                                  fp16=torch.cuda.is_available(),
                                  weight_decay=0.01,
                                  # `eval_strategy` is the current name of the
                                  # deprecated `evaluation_strategy` argument.
                                  eval_strategy="steps",
                                  eval_steps=50,
                                  disable_tqdm=False,
                                  logging_steps=50,
                                  report_to='wandb',
                                  push_to_hub=False,
                                  log_level="error",
                                  )



In [None]:
from transformers import Trainer

def compute_metrics(pred):
    """Compute accuracy and F1 for one evaluation pass.

    Without this callback the Trainer only reports the validation loss, so the
    Accuracy / F1 columns of the training table cannot be reproduced from a
    fresh kernel.

    Args:
        pred: The ``EvalPrediction`` handed over by ``Trainer``; ``predictions``
            holds the logits and ``label_ids`` the reference labels.

    Returns:
        ``{"accuracy": float, "f1": float}``.
    """
    # Imported locally so the cell is self-contained.
    from sklearn.metrics import accuracy_score, f1_score

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        # NOTE(review): weighted averaging is assumed — confirm it matches the
        # averaging used for the F1 values logged in earlier runs.
        "f1": f1_score(labels, preds, average="weighted"),
    }


# NOTE(review): newer transformers releases warn that `tokenizer=` is deprecated
# in favour of `processing_class=`; switch once the installed version is pinned.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_hidden["train"],
                  eval_dataset=dataset_hidden["val"],
                  tokenizer=tokenizer)

  trainer = Trainer(model=model, args=training_args,


In [None]:
# Run fine-tuning as configured in training_args; the returned TrainOutput is
# displayed as the cell result.
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,F1
50,0.4362,0.378521,0.735393,0.770128
100,0.3579,0.341454,0.857623,0.805228
150,0.3536,0.317515,0.854097,0.78881
200,0.3359,0.354461,0.772834,0.796895
250,0.3404,0.342641,0.853425,0.813085
300,0.332,0.342822,0.810611,0.815886
350,0.3559,0.324665,0.853929,0.787138
400,0.3343,0.33399,0.828408,0.821572
450,0.3477,0.325292,0.854433,0.794412
500,0.3312,0.336468,0.82908,0.82307


TrainOutput(global_step=1280, training_loss=0.3376327522099018, metrics={'train_runtime': 3099.8338, 'train_samples_per_second': 26.427, 'train_steps_per_second': 0.413, 'total_flos': 2.15277465495552e+16, 'train_loss': 0.3376327522099018, 'epoch': 2.2263822377013494})