<a href="https://colab.research.google.com/github/VellummyilumVinoth/Train_And_Test_Using_ALBERT/blob/main/Copy_of_FinetunedAlbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
!pip install transformers accelerate transformers[onnx]



In [22]:
import torch

# Enable CUDA if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
from transformers import AlbertForMaskedLM,RobertaTokenizerFast
from sklearn.metrics import accuracy_score

# Load the Roberta tokenizer and ALBERT model
tokenizer = RobertaTokenizerFast.from_pretrained('huggingface/CodeBERTa-small-v1')

model = AlbertForMaskedLM.from_pretrained('albert-base-v2')

model.to(device)  # Move the model to the specified device


Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AlbertForMaskedLM(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=1024, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=

In [24]:
import csv

variable_names = []
statements = []

with open('/content/drive/MyDrive/output.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # Check if the row contains at least two columns
        if len(row) >= 2:
            # Append the variable name and source statement to their respective lists
            variable_names.append(row[0].lower())
            statements.append(row[1].replace(';', '').lower())  # Remove the ';' symbol

In [25]:
import re

def clean_text(line):
    line = re.sub(r'-+',' ',line)
    line = re.sub(r'[^a-zA-Z, ]+'," ",line)
    line = re.sub(r'[ ]+'," " ,line)
    line += ""
    return line

source_statements = []
len_lst = []
for line in statements:
    if len(line.split(" ")) >=0:
        line = clean_text(line)
        source_statements.append(line)
        len_lst.append(len(line.split(" ")))


In [26]:
import pandas as pd
import torch
import os
import re
from transformers import AdamW, Trainer, TrainingArguments, DataCollatorForLanguageModeling

import random

# Create the train and test datasets
class VariableNamesDataset(torch.utils.data.Dataset):
    def __init__(self, variable_names, source_statements, tokenizer, mask_probability):
        self.variable_names = variable_names
        self.source_statements = source_statements
        self.tokenizer = tokenizer
        self.mask_probability = mask_probability

    def __len__(self):
        return len(self.source_statements)

    def __getitem__(self, idx):
        variable_name = str(self.variable_names[idx])
        source_statement = str(self.source_statements[idx])

        # Tokenize the variable name and source statement
        variable_name_tokens = self.tokenizer.tokenize(variable_name)
        source_statement_tokens = self.tokenizer.tokenize(source_statement)

        variable_name_tokens = [token.replace('Ġ', '').lower() for token in variable_name_tokens]
        source_statement_tokens = [token.replace('Ġ', '').lower() for token in source_statement_tokens]

        # Select a variable name tokens to mask
        variable_name_indices = [i for i, token in enumerate(source_statement_tokens) if token in variable_name_tokens]

        num_to_mask = int(len(variable_name_indices) * self.mask_probability)
        indices_to_mask = random.sample(variable_name_indices, num_to_mask)

        # Replace the selected variable name tokens with the [MASK] token
        for i in indices_to_mask:
            source_statement_tokens[i] = '<mask>'

        masked_source_statement = ' '.join(source_statement_tokens)

        # Tokenize the masked source statement
        input_ids = self.tokenizer.encode(
            masked_source_statement,
            add_special_tokens=False,
            truncation=True,
            padding='max_length',
            max_length=32
        )

        # Return both input_ids and labels (same as input_ids for masked language modeling)
        return {
            'input_ids': torch.tensor(input_ids),
            'labels': torch.tensor([input_ids[i] if i not in indices_to_mask else -100 for i in range(len(input_ids))]),
        }

In [27]:
# Split the data into train and test sets
train_size = int(len(source_statements) * 0.9)
train_variable_names = variable_names[:train_size]
train_source_statements = source_statements[:train_size]
test_variable_names = variable_names[train_size:]
test_source_statements = source_statements[train_size:]

In [28]:
# Create the train and test datasets
train_dataset = VariableNamesDataset(train_variable_names, train_source_statements, tokenizer, mask_probability=1)

test_dataset = VariableNamesDataset(test_variable_names, test_source_statements, tokenizer, mask_probability = 0)

In [29]:
train_dataset[3]

{'input_ids': tensor([ 710, 1416,  741,  324,    4,  827, 5822, 7831, 3161, 5822,  973,  225,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1]),
 'labels': tensor([ 710, 1416,  741, -100,    4,  827, 5822, 7831, 3161, 5822,  973,  225,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1])}

In [30]:
test_dataset[25]

{'input_ids': tensor([ 492, 3161,  664, 3161, 1411, 3161, 2053, 3161, 1574, 3161,  664,  349,
          338,  821,  225,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1]),
 'labels': tensor([ 492, 3161,  664, 3161, 1411, 3161, 2053, 3161, 1574, 3161,  664,  349,
          338,  821,  225,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1])}

In [31]:
import torch
torch.cuda.empty_cache

<function torch.cuda.memory.empty_cache() -> None>

In [32]:
# Prepare the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.5
)

model.resize_token_embeddings(len(tokenizer))

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy = "steps",   # Evaluation and Save happens every X steps
    eval_steps = 50,                 # Evaluation happens every X steps
    save_steps = 1000,               # Save checkpoint every X steps
    num_train_epochs = 15,           # Total number of training epochs
    learning_rate = 5e-5,            # Learning rate
    per_device_train_batch_size=2,  # Batch size per device during training
    per_device_eval_batch_size=4,   # Batch size for evaluation
    warmup_steps=500,
    weight_decay = 0.01,             # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=1000,              # Log every X steps
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    fp16=True,                      # Enable mixed-precision training
)

# Instantiate the Trainer class and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# Clear GPU memory
import torch
torch.cuda.empty_cache()

# Train the model
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,No log,18.460411
100,No log,16.871851
150,No log,10.666631
200,No log,9.240643
250,No log,8.45941
300,No log,7.900672
350,No log,7.193113
400,No log,7.042408
450,No log,6.75307
500,No log,6.738224


TrainOutput(global_step=9945, training_loss=5.8932673159407996, metrics={'train_runtime': 8077.3174, 'train_samples_per_second': 9.859, 'train_steps_per_second': 1.231, 'total_flos': 197203295514624.0, 'train_loss': 5.8932673159407996, 'epoch': 14.98})

In [33]:
import torch

torch.cuda.empty_cache()  # Clear GPU memory

# Evaluate the model on the test dataset
predictions = trainer.predict(test_dataset)
predictions = predictions.predictions.argmax(-1)  # Get the predicted token indices
true_labels = predictions[0]  # Assuming 'predictions' is a tuple

# Calculate accuracy score
accuracy = accuracy_score(true_labels.flatten(), test_dataset[:]['input_ids'].flatten())

# Convert accuracy to percentage and round off to two decimal places
accuracy_percentage = round(accuracy * 100, 2)

print(f"Accuracy Score: {accuracy_percentage}%")

Token indices sequence length is longer than the specified maximum sequence length for this model (2100 > 512). Running this sequence through the model will result in indexing errors


Accuracy Score: 0.0%


In [34]:
import os

output_dir = os.path.expanduser('/content/drive/MyDrive/fine-tuned-albert5')

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

trainer.model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)
data_collator.tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/fine-tuned-albert5/tokenizer_config.json',
 '/content/drive/MyDrive/fine-tuned-albert5/special_tokens_map.json',
 '/content/drive/MyDrive/fine-tuned-albert5/vocab.json',
 '/content/drive/MyDrive/fine-tuned-albert5/merges.txt',
 '/content/drive/MyDrive/fine-tuned-albert5/added_tokens.json',
 '/content/drive/MyDrive/fine-tuned-albert5/tokenizer.json')

In [35]:
import torch
from transformers import AlbertForMaskedLM,RobertaTokenizerFast
from tabulate import tabulate

# Load the trained model and tokenizer
output_dir = os.path.expanduser('/content/drive/MyDrive/fine-tuned-albert5')
model = AlbertForMaskedLM.from_pretrained(output_dir)
tokenizer = RobertaTokenizerFast.from_pretrained(output_dir)

# Define a sample masked statement
masked_statement = "int <mask> = getCount();"

# Tokenize the masked statement
input_ids = tokenizer.encode(masked_statement, add_special_tokens=False, return_tensors='pt')

# Find the position of the masked token
masked_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0].item()

# Generate predictions for the masked token using the fine-tuned model
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs[0]

# Get the top 5 predictions and their probability scores from the fine-tuned model
probs_ft = torch.nn.functional.softmax(predictions[0, masked_token_index], dim=-1)
top_k_ft = torch.topk(probs_ft, k=5)

# Create a table with the top predictions and their probabilities from both models
table = [["Fine-Tuned Model", f"{tokenizer.mask_token}"] + [tokenizer.convert_ids_to_tokens([idx])[0].replace('Ġ', '').lower() for idx in top_k_ft.indices],
         ["Probability", ""] + [f"{probs_ft[idx].item():.4f}" for idx in top_k_ft.indices]]

# Print the table
print(tabulate(table, headers="firstrow", tablefmt="fancy_grid"))


╒════════════════════╤══════════╤════════╤════════╤══════════╤════════╤══════════╕
│ Fine-Tuned Model   │ <mask>   │        │      , │   string │    int │   string │
╞════════════════════╪══════════╪════════╪════════╪══════════╪════════╪══════════╡
│ Probability        │          │ 0.1188 │ 0.0662 │   0.0305 │ 0.0279 │    0.018 │
╘════════════════════╧══════════╧════════╧════════╧══════════╧════════╧══════════╛


In [36]:
import torch
from transformers import AlbertForMaskedLM,RobertaTokenizerFast

# Load the trained model and tokenizer
output_dir = os.path.expanduser('/content/drive/MyDrive/fine-tuned-albert5')
finetuned_model = AlbertForMaskedLM.from_pretrained(output_dir)
finetuned_tokenizer = RobertaTokenizerFast.from_pretrained(output_dir)

# Load the ALBERT model and tokenizer
base_model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
base_tokenizer = RobertaTokenizerFast.from_pretrained('huggingface/CodeBERTa-small-v1')

base_model.resize_token_embeddings(len(base_tokenizer))

# Define a list of sample masked statements
masked_statements = ["int <mask> = getCount();", "Student <mask>;", "Person <mask> = {name: 'John'};", "int[] <mask> = getNumbers();", "Person[] <mask> = getPersons();", "Person[] <mask> = getPersons(10, 'Sales');"]

# Loop through each masked statement and generate predictions for both models
for masked_statement in masked_statements:
    print(f"Masked statement: {masked_statement}\n")
    # Tokenize the masked statement for the fine-tuned model
    input_ids = finetuned_tokenizer.encode(masked_statement, add_special_tokens=False, return_tensors='pt')
    masked_token_index = torch.where(input_ids == finetuned_tokenizer.mask_token_id)[1][0].item()
    with torch.no_grad():
        outputs = finetuned_model(input_ids)
        predictions = outputs[0]
    probs = torch.nn.functional.softmax(predictions[0, masked_token_index], dim=-1)
    top_k = torch.topk(probs, k=5)
    # Print the top 5 predictions and their probability scores for the fine-tuned model
    print("Fine-tuned model predictions:")
    print("{:<20} {:<20}".format('Prediction', 'Probability'))
    for i, idx in enumerate(top_k.indices):
        token = finetuned_tokenizer.convert_ids_to_tokens([idx])[0].replace('Ġ', '').lower()
        prob = top_k.values[i].item()
        print("{:<20} {:.4f}".format(token, prob))
    print("\n")

    # Tokenize the masked statement for the base model
    input_ids = base_tokenizer.encode(masked_statement, add_special_tokens=False, return_tensors='pt')
    masked_token_index = torch.where(input_ids == base_tokenizer.mask_token_id)[1][0].item()
    with torch.no_grad():
        outputs = base_model(input_ids)
        predictions = outputs[0]
    probs = torch.nn.functional.softmax(predictions[0, masked_token_index], dim=-1)
    top_k = torch.topk(probs, k=5)
    # Print the top 5 predictions and their probability scores for the base model
    print("Base model predictions:")
    print("{:<20} {:<20}".format('Prediction', 'Probability'))
    for i, idx in enumerate(top_k.indices):
        token = base_tokenizer.convert_ids_to_tokens([idx])[0].replace('Ġ', '').lower()
        prob = top_k.values[i].item()
        print("{:<20} {:.4f}".format(token, prob))
    print("\n")


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Masked statement: int <mask> = getCount();

Fine-tuned model predictions:
Prediction           Probability         
                     0.1188
,                    0.0662
string               0.0305
int                  0.0279
string               0.0180


Base model predictions:
Prediction           Probability         
§                    0.0753
(?:[                 0.0576
=                    0.0543
leg                  0.0459
isassignablefrom     0.0423


Masked statement: Student <mask>;

Fine-tuned model predictions:
Prediction           Probability         
                     0.1188
,                    0.0662
string               0.0305
int                  0.0279
string               0.0180


Base model predictions:
Prediction           Probability         
§                    0.0751
(?:[                 0.0576
=                    0.0539
leg                  0.0463
isassignablefrom     0.0412


Masked statement: Person <mask> = {name: 'John'};

Fine-tuned model predictio