In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [1]:
%pip uninstall -y pyarrow requests

%pip install transformers datasets torch evaluate


Found existing installation: pyarrow 16.1.0
Uninstalling pyarrow-16.1.0:
  Successfully uninstalled pyarrow-16.1.0
Found existing installation: requests 2.32.3
Uninstalling requests-2.32.3:
  Successfully uninstalled requests-2.32.3
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: requests, pyarrow
Successfully installed pyarrow-16.1.0 requests-2.32.3


In [2]:
%pip install accelerate -U
import accelerate
print(accelerate.__version__)

0.32.1


In [1]:
import torch
torch.cuda.is_available()

True

In [1]:
from datasets import load_dataset, DatasetDict
import pandas as pd

# Load your XGLUE JavaScript dataset
# THE DATASET FROM CODE_X_GLUE IS ALREADY PREPROCESSED
dataset = load_dataset('code_x_glue_ct_code_to_text', 'javascript')

# Convert the dataset to Pandas DataFrames

train_df = pd.DataFrame(dataset['train'])
val_df = pd.DataFrame(dataset['validation'])
test_df = pd.DataFrame(dataset['test'])


In [2]:
import os

# Set the environment variable for CUDA memory allocation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch

# Your PyTorch code goes here


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
import torch
from transformers import EarlyStoppingCallback

# Model name
#load the model directly from Salesforce/codeT5 when you are training for the first time
#If fine tuning the models again then write the model_name as the previously saved model
model_name = "/data/T5/codeT53_54k"

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Initialize the model
model = T5ForConditionalGeneration.from_pretrained(model_name)

print("Tokenizer and model loaded successfully.") #to verify if the model is loaded succefully or not

# Filter the datasets based on code_tokens_length

#it filters the data that has the length of code tokens as more than 512 in train data. you can either take code colm(which contain the actual code snippet) and then encode it
#  using tokenizer. OR even you can use codetokens colum to directly filter out the thing without encoding. Similarly it is done for test and validation data.
train_df['code_tokens_length'] = train_df['code'].apply(lambda x: len(tokenizer.encode(x)))#tokenize the code snippet in train data
sampled_train_df = train_df[train_df['code_tokens_length'] <= 512]# filter out the data by removing the data that have more than 512 tokens and store it in sampled_train_df

val_df['code_tokens_length'] = val_df['code'].apply(lambda x: len(tokenizer.encode(x)))#same explanation as in train data
sampled_val_df = val_df[val_df['code_tokens_length'] <= 512]

test_df['code_tokens_length'] = test_df['code'].apply(lambda x: len(tokenizer.encode(x)))
sampled_test_df = test_df[test_df['code_tokens_length'] <= 512]

# Check the number of available rows after filtering out
print(f"Available training samples: {len(sampled_train_df)}")
print(f"Available validation samples: {len(sampled_val_df)}")
print(f"Available test samples: {len(sampled_test_df)}")

# Sample based on the available data
#here min function is used. if suppose you want to train your model for 10K data then you need to write in 1st parameter as 10k. the 2nd parameter gigve the len of entire
#data sample available. the minimum of two will be considered. if you want to load full sample then give any number that is higher than the available sample.
n_train_samples = min(60000, len(sampled_train_df))
n_val_samples = min(5000, len(sampled_val_df))
n_test_samples = min(5000, len(sampled_test_df))

#the so sampled data is then given to sample function that helps in giving random data samples for training and not in order. if you are training the model on same data sample 
#again again ,it may lead to overfitting. thats the reason randonstate is used. when you just change the number of random_state ,the new 10k samples will be taken(though if you dont change the saple size)
sampled_train_df = sampled_train_df.sample(n=n_train_samples, random_state=123)
sampled_val_df = sampled_val_df.sample(n=n_val_samples, random_state=123)
sampled_test_df = sampled_test_df.sample(n=n_test_samples, random_state=123)

# Convert DataFrames back to Hugging Face Dataset for easy accesibility
train_data = Dataset.from_pandas(sampled_train_df)
val_data = Dataset.from_pandas(sampled_val_df)
test_data = Dataset.from_pandas(sampled_test_df)


#the final sampled data is stored in the form of dataset dictionary for easy oragnizability and accessibiblity
filtered_dataset = DatasetDict({
    'train': train_data,
    'validation': val_data,
    'test': test_data
})

#this line of code helps in utilizing of cuda(gpu server) if available,so that cpu and ram usage will be less
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the filtered dataset
def tokenize_function(examples):
    # Add a prompt before each code example
    prompt = "Summarize the following JavaScript code: "
    inputs = [prompt + code for code in examples['code']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['docstring'], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = filtered_dataset.map(tokenize_function, batched=True)




# Training arguments
training_args = TrainingArguments(
    output_dir='/data/T5/result4',
    # resume_from_checkpoint=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=12,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir='/data/T5/result4',
    logging_steps=10,
    fp16=True,
    warmup_steps=500,
    save_total_limit=3
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)
# Train the model
trainer.train()

# Save the pretrained model with our data
model.save_pretrained('/data/T5/codeT54_54k')
tokenizer.save_pretrained('/data/T5/codeT54_54k')

# Evaluate the model
results = trainer.evaluate()
print(results)


Token indices sequence length is longer than the specified maximum sequence length for this model (732 > 512). Running this sequence through the model will result in indexing errors


Tokenizer and model loaded successfully.
Available training samples: 54770
Available validation samples: 3669
Available test samples: 3110


Map:   0%|          | 0/54770 [00:00<?, ? examples/s]



Map:   0%|          | 0/3669 [00:00<?, ? examples/s]

Map:   0%|          | 0/3110 [00:00<?, ? examples/s]

Map:   0%|          | 0/54770 [00:00<?, ? examples/s]

Map:   0%|          | 0/3669 [00:00<?, ? examples/s]

Map:   0%|          | 0/3110 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.4969,0.621673
2,0.5051,0.621418
3,0.4977,0.624025
4,0.4676,0.632316
5,0.4634,0.635072
6,0.4268,0.642205
7,0.4288,0.648374


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 0.6214179396629333, 'eval_runtime': 34.6305, 'eval_samples_per_second': 105.947, 'eval_steps_per_second': 6.642, 'epoch': 7.0}


In [42]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, RobertaTokenizer

# Load the trained model and tokenizer
model_path = '/data/T5/codeT54_54k'
tokenizer =  RobertaTokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

def generate_summary(code_snippet):
    # Tokenize the input code snippet
    inputs = tokenizer(code_snippet, return_tensors='pt', max_length=512, truncation=True, padding="max_length")

    # Generate the summary
    summary_ids = model.generate(inputs['input_ids'], max_length=128, num_beams=4, early_stopping=True)

    # Decode the generated tokens into a readable string
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Example usage
code_snippet = """
function filter(arr) {
    return arr.filter(num => num % 2 == 0);
}

const numbers = [1, 2, 3, 4, 5, 6];
console.log(filter(numbers));  
"""
summary = generate_summary(code_snippet)
print(summary)


Filter out odd numbers
@param {Array} arr
@returns {Array}


In [16]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, RobertaTokenizer
import torch

# Load the trained model and tokenizer
model_path = '/data/T5/codeT52_54k'
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

def generate_summary(code_snippet):
    # Add a prompt to the input code snippet
    prompt = "Summarize the following JavaScript code: "
    input_text = prompt + code_snippet

    # Tokenize the input code snippet with the prompt
    inputs = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True, padding="max_length")

    # Generate the summary
    summary_ids = model.generate(inputs['input_ids'], max_length=128, num_beams=4, early_stopping=True)

    # Decode the generated tokens into a readable string
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT-2 model and tokenizer
gpt2_model_name = "gpt2"
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)

# Add padding token to the tokenizer
gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the GPT-2 model after adding the special token
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_model_name)

# Function to generate extended summary using GPT-2
def extend_summary_with_gpt2(summary):
    # Encode the input summary
    inputs = gpt2_tokenizer.encode(summary, return_tensors='pt', padding=True, truncation=True)
    
    # Create the attention mask
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)
    
    # Generate the extended summary
    extended_summary_ids = gpt2_model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=512,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=gpt2_tokenizer.pad_token_id  # Use the tokenizer's pad token ID
    )
    
    # Decode the generated tokens into a readable string
    extended_summary = gpt2_tokenizer.decode(extended_summary_ids[0], skip_special_tokens=True)
    
    return extended_summary

# Example usage with the summary generated by CodeT5
code_snippet = """
function myfun(a,b){
    let res= a+b;
    return res;
}
console.log(myfun(8,2));
}
"""
initial_summary = generate_summary(code_snippet)
print("Initial Summary:", initial_summary)

extended_summary = extend_summary_with_gpt2(initial_summary)
print("Extended Summary:", extended_summary)

Initial Summary: Add two numbers
@param {number} a
@param {number} b
@returns {number}
Extended Summary: Add two numbers
@param {number} a
@param {number} b
@returns {number} The number of elements in the array.

Returns: The array containing the elements of the given number, or null if no element is found. If the number is greater than or equal to 0, the element will be removed from the list. Otherwise, it will not be added to the Array.prototype.array.removeAll(element, true). Note that this method does not return an array, so it is not recommended to use it to remove elements from arrays. This method is only useful if you want to add an element to a list and then remove it from that list by calling the removeAll method on it. For more information, see Remove All Elements from a List. Returns: A list of all elements that have been removed. The elements are sorted by the order in which they were removed, with the first element being the last element removed and the second element the 

In [6]:
pip install rouge_score 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [17]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration,RobertaTokenizer
import pandas as pd
from datasets import Dataset, load_metric
import evaluate
import sacrebleu
# Load your fine-tuned model and tokenizer
model_path = '/data/T5/codeT51_54k'  # Path where your model is saved
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load your test dataset (replace this with actual loading code)
# test_df = pd.read_csv('path/to/your/test_data.csv')

# Sample 10 examples from the test dataset
sampled_test_df = test_df.sample(n=100, random_state=123)

# Convert to Hugging Face Dataset
sampled_test_data = Dataset.from_pandas(sampled_test_df)

# Generate predictions for the selected samples
def generate_predictions(model, tokenizer, dataset, device):
    model.eval()
    predictions = []
    references = []

    for example in dataset:
        input_text = example['code']
        reference_text = example['docstring']

        input_ids = tokenizer.encode(input_text, padding="max_length", truncation=True, return_tensors='pt').to(device)

        with torch.no_grad():
            outputs = model.generate(input_ids, max_length=512, num_beams=64, early_stopping=True, length_penalty=10)
        
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(predicted_text)
        references.append(reference_text)

    return predictions, references

# Generate predictions
predictions, references = generate_predictions(model, tokenizer, sampled_test_data, device)

# Load metrics
rouge_metric = load_metric("rouge")
meteor_metric = evaluate.load("meteor")

# Compute ROUGE scores
rouge_results = rouge_metric.compute(predictions=predictions, references=references)

# Compute METEOR scores
meteor_results = meteor_metric.compute(predictions=predictions, references=references)

bleu_results=sacrebleu.corpus_bleu(predictions,[references])
# Print results
print("BLEU:",bleu_results.score)
print("ROUGE-1:", rouge_results['rouge1'].mid.fmeasure)
print("ROUGE-2:", rouge_results['rouge2'].mid.fmeasure)
print("ROUGE-L:", rouge_results['rougeL'].mid.fmeasure)
print("METEOR:", meteor_results['meteor'])


[nltk_data] Downloading package wordnet to /home/nm788186/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/nm788186/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nm788186/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


BLEU: 11.391771768619734
ROUGE-1: 0.3331085483726109
ROUGE-2: 0.12496958196753155
ROUGE-L: 0.29456832641993935
METEOR: 0.29657723348155335


In [None]:
pip install -q gradio

In [None]:
import gradio as gr
output_text=gr.Textbox()
demo = gr.Interface(fn=generate_summary,
                   inputs="textbox",
                   outputs=output_text,
                   title="Automatic Code Summarizer for Javascript",
                   description="This app can summarize your javascript code snippets in natural language")
demo.launch()