<a href="https://colab.research.google.com/github/VellummyilumVinoth/Train_And_Test_Using_ALBERT/blob/main/FinetunedAlbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

print(torch.__version__)


2.0.1+cu118


In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import locale
print(locale.getpreferredencoding())

UTF-8


In [5]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [6]:
!pip install transformers accelerate transformers[onnx]



In [7]:
from transformers import AlbertForMaskedLM,RobertaTokenizerFast
from sklearn.metrics import accuracy_score, f1_score

# Load the Roberta tokenizer and ALBERT model
tokenizer = RobertaTokenizerFast.from_pretrained('huggingface/CodeBERTa-small-v1')

model = AlbertForMaskedLM.from_pretrained('albert-base-v2')

In [8]:
import csv

variable_names = []
statements = []

with open('/content/drive/MyDrive/output.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # Check if the row contains at least two columns
        if len(row) >= 2:
            # Append the variable name and source statement to their respective lists
            variable_names.append(row[0].lower())
            statements.append(row[1].replace(';', '').lower())  # Remove the ';' symbol

In [9]:
import re

def clean_text(line):
    line = re.sub(r'-+',' ',line)
    line = re.sub(r'[^a-zA-Z, ]+'," ",line)
    line = re.sub(r'[ ]+'," " ,line)
    line += ""
    return line

source_statements = []
len_lst = []
for line in statements:
    if len(line.split(" ")) >=0:
        line = clean_text(line)
        source_statements.append(line)
        len_lst.append(len(line.split(" ")))


In [10]:
import pandas as pd
import torch
import os
import re
from transformers import AdamW, Trainer, TrainingArguments, DataCollatorForLanguageModeling

import random

class VariableNamesDataset(torch.utils.data.Dataset):
    def __init__(self, variable_names, source_statements, tokenizer, mask_probability):
        self.variable_names = variable_names
        self.source_statements = source_statements
        self.tokenizer = tokenizer
        self.mask_probability = mask_probability

    def __len__(self):
        return len(self.source_statements)

    def __getitem__(self, idx):
        variable_name = str(self.variable_names[idx])
        source_statement = str(self.source_statements[idx])

        # print("variable names: ",len(variable_name))
        # print("source statements: ",len(source_statement))

        # Tokenize the variable name and source statement
        variable_name_tokens = self.tokenizer.tokenize(variable_name)
        source_statement_tokens = self.tokenizer.tokenize(source_statement)

        variable_name_tokens = [token.replace('Ġ', '').lower() for token in variable_name_tokens]
        source_statement_tokens = [token.replace('Ġ', '').lower() for token in source_statement_tokens]

        # print("Variable name tokens:", variable_name_tokens)
        # print("Source statement tokens:", source_statement_tokens)

        # Select a variable name tokens to mask
        variable_name_indices = [i for i, token in enumerate(source_statement_tokens) if token in variable_name_tokens]

        # print("Variable name indices:", variable_name_indices)

        num_to_mask = int(len(variable_name_indices)* self.mask_probability)
        indices_to_mask = random.sample(variable_name_indices, num_to_mask)

        # print("Num to mask:", num_to_mask)
        # print("Indices to mask:", indices_to_mask)

        # Replace the selected variable name tokens with the [MASK] token
        for i in indices_to_mask:
            source_statement_tokens[i] = '<mask>'

        masked_source_statement = ' '.join(source_statement_tokens)
        # print("Masked source statement:", masked_source_statement)

        # Tokenize the masked source statement
        input_ids = self.tokenizer.encode(
            masked_source_statement,
            add_special_tokens=False,
            truncation=True,
            padding='max_length',
            max_length=128
        )

        # print("Input IDs:", input_ids)

        return torch.tensor(input_ids)


In [11]:
# Split the data into train and test sets
train_size = int(len(source_statements) * 0.9)
train_variable_names = variable_names[:train_size]
train_source_statements = source_statements[:train_size]
test_variable_names = variable_names[train_size:]
test_source_statements = source_statements[train_size:]

In [12]:
# Create the train and test datasets
train_dataset = VariableNamesDataset(train_variable_names, train_source_statements, tokenizer, mask_probability=1)

test_dataset = VariableNamesDataset(test_variable_names, test_source_statements, tokenizer, mask_probability = 0)

In [13]:
train_dataset[3]

tensor([437,   4,   4, 548,   4, 325, 225,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1])

In [14]:
test_dataset[25]

tensor([2128, 2779, 2886,  679, 3671,  772,  679, 1867, 3161, 3671,  772,  679,
         225,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1])

In [15]:
import torch
torch.cuda.empty_cache

<function torch.cuda.memory.empty_cache() -> None>

In [16]:
# Prepare the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.5
)

model.resize_token_embeddings(len(tokenizer))

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy = "steps",   # Evaluation and Save happens every X steps
    eval_steps = 50,                 # Evaluation happens every X steps
    save_steps = 1000,               # Save checkpoint every X steps
    num_train_epochs = 15,           # Total number of training epochs
    learning_rate = 3e-5,            # Learning rate
    per_device_train_batch_size=32,  # Batch size per device during training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=500,
    weight_decay = 0.01,             # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=1000,              # Log every X steps
)

# Instantiate the Trainer class and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

trainer.train()



Step,Training Loss,Validation Loss
50,No log,10.869629
100,No log,9.815125
150,No log,9.213017
200,No log,8.827657
250,No log,8.633486
300,No log,8.033744
350,No log,7.615405
400,No log,7.423807
450,No log,7.332746
500,No log,7.199204


TrainOutput(global_step=1125, training_loss=5.445466281467014, metrics={'train_runtime': 974.0137, 'train_samples_per_second': 36.729, 'train_steps_per_second': 1.155, 'total_flos': 201609940377600.0, 'train_loss': 5.445466281467014, 'epoch': 15.0})

In [17]:
import torch
torch.cuda.empty_cache

<function torch.cuda.memory.empty_cache() -> None>

In [18]:
import os

output_dir = os.path.expanduser('/content/drive/MyDrive/fine-tuned-albert5')

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

trainer.model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)
data_collator.tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/fine-tuned-albert5/tokenizer_config.json',
 '/content/drive/MyDrive/fine-tuned-albert5/special_tokens_map.json',
 '/content/drive/MyDrive/fine-tuned-albert5/vocab.json',
 '/content/drive/MyDrive/fine-tuned-albert5/merges.txt',
 '/content/drive/MyDrive/fine-tuned-albert5/added_tokens.json',
 '/content/drive/MyDrive/fine-tuned-albert5/tokenizer.json')

In [19]:
import torch
from transformers import AlbertForMaskedLM,RobertaTokenizerFast
from tabulate import tabulate

# Load the trained model and tokenizer
output_dir = os.path.expanduser('/content/drive/MyDrive/fine-tuned-albert5')
model = AlbertForMaskedLM.from_pretrained(output_dir)
tokenizer = RobertaTokenizerFast.from_pretrained(output_dir)

# Define a sample masked statement
masked_statement = "int <mask> = getCount();"

# Tokenize the masked statement
input_ids = tokenizer.encode(masked_statement, add_special_tokens=False, return_tensors='pt')

# Find the position of the masked token
masked_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0].item()

# Generate predictions for the masked token using the fine-tuned model
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs[0]

# Get the top 5 predictions and their probability scores from the fine-tuned model
probs_ft = torch.nn.functional.softmax(predictions[0, masked_token_index], dim=-1)
top_k_ft = torch.topk(probs_ft, k=5)

# Create a table with the top predictions and their probabilities from both models
table = [["Fine-Tuned Model", f"{tokenizer.mask_token}"] + [tokenizer.convert_ids_to_tokens([idx])[0].replace('Ġ', '').lower() for idx in top_k_ft.indices],
         ["Probability", ""] + [f"{probs_ft[idx].item():.4f}" for idx in top_k_ft.indices]]

# Print the table
print(tabulate(table, headers="firstrow", tablefmt="fancy_grid"))


╒════════════════════╤══════════╤═════════╤════════╤══════════╤════════╤═══════════╕
│ Fine-Tuned Model   │ <mask>   │   check │      m │   result │    res │   binding │
╞════════════════════╪══════════╪═════════╪════════╪══════════╪════════╪═══════════╡
│ Probability        │          │  0.6813 │ 0.0416 │   0.0332 │ 0.0125 │    0.0089 │
╘════════════════════╧══════════╧═════════╧════════╧══════════╧════════╧═══════════╛


In [20]:
import torch
from transformers import AlbertForMaskedLM,RobertaTokenizerFast

# Load the trained model and tokenizer
output_dir = os.path.expanduser('/content/drive/MyDrive/fine-tuned-albert5')
finetuned_model = AlbertForMaskedLM.from_pretrained(output_dir)
finetuned_tokenizer = RobertaTokenizerFast.from_pretrained(output_dir)

# Load the ALBERT model and tokenizer
base_model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
base_tokenizer = RobertaTokenizerFast.from_pretrained('huggingface/CodeBERTa-small-v1')

base_model.resize_token_embeddings(len(base_tokenizer))

# Define a list of sample masked statements
masked_statements = ["int <mask> = getCount();", "Student <mask>;", "Person <mask> = {name: 'John'};", "int[] <mask> = getNumbers();", "Person[] <mask> = getPersons();", "Person[] <mask> = getPersons(10, 'Sales');"]

# Loop through each masked statement and generate predictions for both models
for masked_statement in masked_statements:
    print(f"Masked statement: {masked_statement}\n")
    # Tokenize the masked statement for the fine-tuned model
    input_ids = finetuned_tokenizer.encode(masked_statement, add_special_tokens=False, return_tensors='pt')
    masked_token_index = torch.where(input_ids == finetuned_tokenizer.mask_token_id)[1][0].item()
    with torch.no_grad():
        outputs = finetuned_model(input_ids)
        predictions = outputs[0]
    probs = torch.nn.functional.softmax(predictions[0, masked_token_index], dim=-1)
    top_k = torch.topk(probs, k=5)
    # Print the top 5 predictions and their probability scores for the fine-tuned model
    print("Fine-tuned model predictions:")
    print("{:<20} {:<20}".format('Prediction', 'Probability'))
    for i, idx in enumerate(top_k.indices):
        token = finetuned_tokenizer.convert_ids_to_tokens([idx])[0].replace('Ġ', '').lower()
        prob = top_k.values[i].item()
        print("{:<20} {:.4f}".format(token, prob))
    print("\n")

    # Tokenize the masked statement for the base model
    input_ids = base_tokenizer.encode(masked_statement, add_special_tokens=False, return_tensors='pt')
    masked_token_index = torch.where(input_ids == base_tokenizer.mask_token_id)[1][0].item()
    with torch.no_grad():
        outputs = base_model(input_ids)
        predictions = outputs[0]
    probs = torch.nn.functional.softmax(predictions[0, masked_token_index], dim=-1)
    top_k = torch.topk(probs, k=5)
    # Print the top 5 predictions and their probability scores for the base model
    print("Base model predictions:")
    print("{:<20} {:<20}".format('Prediction', 'Probability'))
    for i, idx in enumerate(top_k.indices):
        token = base_tokenizer.convert_ids_to_tokens([idx])[0].replace('Ġ', '').lower()
        prob = top_k.values[i].item()
        print("{:<20} {:.4f}".format(token, prob))
    print("\n")


Masked statement: int <mask> = getCount();

Fine-tuned model predictions:
Prediction           Probability         
check                0.6813
m                    0.0416
result               0.0332
res                  0.0125
binding              0.0089


Base model predictions:
Prediction           Probability         
§                    0.0744
(?:[                 0.0553
=                    0.0517
leg                  0.0427
isassignablefrom     0.0412


Masked statement: Student <mask>;

Fine-tuned model predictions:
Prediction           Probability         
get                  0.0718
request              0.0358
i                    0.0301
single               0.0286
create               0.0275


Base model predictions:
Prediction           Probability         
§                    0.0723
(?:[                 0.0554
=                    0.0519
leg                  0.0445
isassignablefrom     0.0397


Masked statement: Person <mask> = {name: 'John'};

Fine-tuned model predictio