<a href="https://colab.research.google.com/github/VellummyilumVinoth/Train_And_Test_Using_ALBERT/blob/main/FinetunedAlbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m113.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, htt

In [2]:
import pandas as pd
import torch
import os
from transformers import AlbertTokenizer, AlbertForMaskedLM, AdamW, Trainer, TrainingArguments, DataCollatorForLanguageModeling

class VariableNamesDataset(torch.utils.data.Dataset):
    def __init__(self, variable_names, source_statements, tokenizer):
        self.variable_names = variable_names
        self.source_statements = source_statements
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.source_statements)

    def __getitem__(self, idx):
        variable_name = str(self.variable_names[idx])
        source_statement = str(self.source_statements[idx])
        
        # Combine variable name and source statement
        input_text = variable_name + ' ' + source_statement
        
        # Tokenize the input text
        input_ids = self.tokenizer.encode(
            input_text,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=128
        )

        return torch.tensor(input_ids)

# Load the dataset
df = pd.read_csv('output.csv',names=['labels','input_ids'])

# Load the ALBERT tokenizer and model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMaskedLM.from_pretrained('albert-base-v2')

# Extract variable names and source statements
variable_names = df['labels'].tolist()
source_statements = df['input_ids'].tolist()

# Tokenize source statements
encoded_inputs = tokenizer(source_statements, padding=True, truncation=True, max_length=128)

# Prepare the data for training
dataset = VariableNamesDataset(variable_names, source_statements, tokenizer)


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

In [4]:
# Prepare the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.5
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy = "steps",   # Evaluation and Save happens every X steps
    eval_steps = 200,                # Evaluation happens every X steps
    save_steps = 500,               # Save checkpoint every X steps
    num_train_epochs = 3,            # Total number of training epochs
    learning_rate = 5e-5,            # Learning rate
    per_device_train_batch_size=32,  # Batch size per device during training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps = 500,              # Number of warmup steps
    weight_decay = 0.01,             # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=1000,              # Log every X steps
)

# Instantiate the Trainer class and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)
trainer.train()

# Save the trained model and tokenizer
output_dir = './finetuned_albert'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


***** Running training *****
  Num examples = 63
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6
  Number of trainable parameters = 11221680


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in ./finetuned_albert/config.json
Model weights saved in ./finetuned_albert/pytorch_model.bin
tokenizer config file saved in ./finetuned_albert/tokenizer_config.json
Special tokens file saved in ./finetuned_albert/special_tokens_map.json


('./finetuned_albert/tokenizer_config.json',
 './finetuned_albert/special_tokens_map.json',
 './finetuned_albert/spiece.model',
 './finetuned_albert/added_tokens.json')

In [5]:
import torch

# Load the trained model and tokenizer
output_dir = './finetuned_albert'

model = AlbertForMaskedLM.from_pretrained(output_dir)
tokenizer = AlbertTokenizer.from_pretrained(output_dir)

# Define a sample masked statement
masked_statement = 'string[] [MASK] = getCountries();'

# Tokenize the masked statement
input_ids = tokenizer.encode(masked_statement, add_special_tokens=True, return_tensors='pt')

# Find the position of the masked token
masked_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0].item()

# Generate predictions for the masked token
with torch.no_grad():
    outputs = model(input_ids)
    predictions = outputs[0]

# Get the top 5 predictions and their probability scores
probs = torch.nn.functional.softmax(predictions[0, masked_token_index], dim=-1)
top_k = torch.topk(probs, k=5)

# Print the top 5 predictions and their probability scores
for i, idx in enumerate(top_k.indices):
    token = tokenizer.convert_ids_to_tokens([idx])[0]
    print(f"Prediction {i+1}: {token}, Probability: {top_k.values[i].item()}")


loading configuration file ./finetuned_albert/config.json
Model config AlbertConfig {
  "_name_or_path": "albert-base-v2",
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "vocab_size": 30000
}

loading weights file ./finetuned_albert/pytorch_model.bin
All model checkpoint weights we

Prediction 1: ];, Probability: 0.8316342234611511
Prediction 2: [, Probability: 0.06505463272333145
Prediction 3: =, Probability: 0.014633656479418278
Prediction 4: ], Probability: 0.014532271772623062
Prediction 5: ▁[, Probability: 0.009015104733407497
