# Inference and Evaluation of Fine-tuned CodeT5 for Java Code Summarization

**INSTALL LIBRARIES**
----------------------
----------------------
----------------------
----------------------

In [1]:

!pip install transformers datasets evaluate rouge_score bert_score --quiet


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m119.3 MB/s[0m eta [36

**MOUNT DRIVE**
----------------------
----------------------
----------------------
----------------------

In [3]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**LOAD HF DATASET**
----------------------
----------------------
----------------------
----------------------

In [None]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

dataset = load_dataset("code_x_glue_ct_code_to_text", "java")

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/26.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/4.25M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/9.38M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/164923 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5183 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10955 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

**INFERENCE**
----------------------
----------------------
----------------------
----------------------

In [2]:
# CodeSummaryGenerator
# This class handles preprocessing, batch inference, and single-example generation for code summarization
# using a fine-tuned CodeT5 model from Hugging Face.

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from torch.utils.data import DataLoader
from typing import List, Dict, Optional
from tqdm import tqdm
import os


class CodeSummaryGenerator:
    def __init__(self, model_path: str, decoding_config: Dict, device: Optional[str] = None):
        # Load tokenizer and model from Hugging Face
        self.model_path = model_path
        self.decoding_config = decoding_config
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(self.device)
        self.model.eval()

    def preprocess_dataset(self, split: str = "validation", max_input_length: int = 512):
        # Load and tokenize dataset split (validation or test)
        dataset = load_dataset("code_x_glue_ct_code_to_text", "java")[split]

        def tokenize_fn(example):
            return self.tokenizer(
                example["code"],
                truncation=True,
                padding="max_length",
                max_length=max_input_length,
            )

        tokenized = dataset.map(tokenize_fn, batched=True)
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
        return tokenized, dataset["docstring"]

    def generate_summaries(self,
                            tokenized_data,
                            references: List[str],
                            save_path: str,
                            batch_size: int = 16,
                            save_every: int = 50):
        # Perform batch inference on the dataset and periodically save output CSV

        val_loader = DataLoader(tokenized_data, batch_size=batch_size)
        generated_summaries = []
        start_batch = 0

        # Resume if already saved the file with partial results
        if os.path.exists(save_path):
            df_existing = pd.read_csv(save_path)
            generated_summaries = df_existing["predicted_summary"].astype(str).tolist()
            start_batch = len(generated_summaries) // batch_size
            print(f"⏩ Resuming from batch {start_batch} (already {len(generated_summaries)} predictions)")

        for i, batch in enumerate(tqdm(val_loader, desc="Generating Summaries")):
            if i < start_batch:
                continue

            input_ids = batch["input_ids"].to(self.device)
            attention_mask = batch["attention_mask"].to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    **self.decoding_config
                )

            decoded = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
            generated_summaries.extend(decoded)

            # Periodically save results to disk
            if (i + 1) % save_every == 0 or (i + 1) == len(val_loader):
                print(f"💾 Saving at batch {i + 1}")
                df = pd.DataFrame({
                    "gold_summary": references[:len(generated_summaries)],
                    "predicted_summary": generated_summaries
                })
                df.to_csv(save_path, index=False)

    def generate_single(self, code_snippet: str):
        # Generate a summary for a single code snippet
        inputs = self.tokenizer(
            code_snippet,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.device)

        with torch.no_grad():
            output = self.model.generate(**inputs, **self.decoding_config)

        return self.tokenizer.decode(output[0], skip_special_tokens=True)

**EVALUATION**
----------------------
----------------------
----------------------
----------------------

In [3]:
import evaluate
import numpy as np
from collections import Counter

class SummaryEvaluator:
    def __init__(self):
        self.rouge = evaluate.load("rouge")
        self.bleu = evaluate.load("bleu")
        self.bertscore = evaluate.load("bertscore")

    def avg_token_repetition(self, predictions):
        """
        Computes the average number of repeated tokens per prediction.
        A high repetition score indicates redundancy in the generated text.
        """
        rep_counts = []
        for text in predictions:
            tokens = text.strip().split()
            counts = Counter(tokens)
            repeated_tokens = sum(v for v in counts.values() if v > 1)
            rep_counts.append(repeated_tokens / max(1, len(tokens)))
        return np.mean(rep_counts)

    def evaluate_csvs(self, files: Dict[str, str]):
        """
        Evaluates multiple prediction files and returns a DataFrame of metrics.
        Each file must be a CSV with columns: 'predicted_summary' and 'gold_summary'.
        """
        all_results = []
        for name, path in files.items():
            if not os.path.exists(path):
                print(f"File not found: {path}")
                continue

            df = pd.read_csv(path)
            predictions = df["predicted_summary"].astype(str).tolist()
            references = df["gold_summary"].astype(str).tolist()

            rouge_scores = self.rouge.compute(predictions=predictions, references=references, use_stemmer=True)
            bleu_score = self.bleu.compute(predictions=predictions, references=references)
            bert_score = self.bertscore.compute(predictions=predictions, references=references, lang="en", device="cuda" if torch.cuda.is_available() else "cpu")
            repetition = self.avg_token_repetition(predictions)

            all_results.append({
                "Version": name,
                "ROUGE-1": round(rouge_scores["rouge1"], 4),
                "ROUGE-2": round(rouge_scores["rouge2"], 4),
                "ROUGE-L": round(rouge_scores["rougeL"], 4),
                "BLEU": round(bleu_score["bleu"], 4),
                "BERTScore": round(np.mean(bert_score["f1"]), 4),
                "Avg Token Repetition": round(repetition, 4)
            })

        return pd.DataFrame(all_results)


**INFERENCE & EVALUATION**
----------------------
----------------------
----------------------
----------------------

In [8]:
# Inference and Evaluation Pipeline for CodeT5
# This script runs multiple decoding strategies on the validation and test sets
# and generates a metrics comparison table with ROUGE, BLEU, BERTScore, Exact Match, and Repetition analysis.

from inference_pipeline import CodeSummaryGenerator
from eval_pipeline import SummaryEvaluator

if __name__ == "__main__":
    model_path = "pritammane105/CodeT5-Java-Summarisation"

    # Define decoding strategies
    decoding_configs = {
        "baseline_beam": {
            "max_new_tokens": 64,
            "num_beams": 4,
            "early_stopping": True
        },
        "beam_repetition": {
            "max_new_tokens": 64,
            "num_beams": 4,
            "early_stopping": True,
            "repetition_penalty": 1.3
        },
        "sampling_topk": {
            "max_new_tokens": 64,
            "do_sample": True,
            "top_k": 50,
            "temperature": 0.7,
            "repetition_penalty": 1.2
        }
    }

    # Output files (local/Colab environment)
    output_files = {
        "baseline_beam": "codet5_val_predictions.csv",
        "beam_repetition": "codet5_val_beam_repetition.csv",
        "sampling_temp": "codet5_val_sampling_temp.csv",
        "test_sampling_final": "codet5_test_sampling_output.csv"
    }

    # Run decoding strategies for validation set
    for name, config in decoding_configs.items():
        print(f"\n Running config: {name}")
        generator = CodeSummaryGenerator(model_path, config)
        val_tokenized, val_refs = generator.preprocess_dataset("validation")
        generator.generate_summaries(val_tokenized, val_refs, output_files[name])

    # Run final inference for test set using best config
    print("\n Generating final test set outputs with sampling_topk")
    generator = CodeSummaryGenerator(model_path, decoding_configs["sampling_topk"])
    test_tokenized, test_refs = generator.preprocess_dataset("test")
    generator.generate_summaries(test_tokenized, test_refs, output_files["test_sampling_final"])

    # Evaluate all outputs and compare metrics
    print("\n Evaluating generated summaries...")
    evaluator = SummaryEvaluator()
    results_df = evaluator.evaluate_csvs(output_files)
    display(results_df)


Generating Summaries:   7%|▋         | 50/685 [01:38<20:29,  1.94s/it]

💾 Saving at batch 50


Generating Summaries:  15%|█▍        | 100/685 [03:17<19:59,  2.05s/it]

💾 Saving at batch 100


Generating Summaries:  22%|██▏       | 150/685 [04:55<17:14,  1.93s/it]

💾 Saving at batch 150


Generating Summaries:  29%|██▉       | 200/685 [06:32<16:09,  2.00s/it]

💾 Saving at batch 200


Generating Summaries:  36%|███▋      | 250/685 [08:10<13:52,  1.91s/it]

💾 Saving at batch 250


Generating Summaries:  44%|████▍     | 300/685 [09:47<12:25,  1.94s/it]

💾 Saving at batch 300


Generating Summaries:  51%|█████     | 350/685 [11:24<10:47,  1.93s/it]

💾 Saving at batch 350


Generating Summaries:  58%|█████▊    | 400/685 [12:58<08:49,  1.86s/it]

💾 Saving at batch 400


Generating Summaries:  66%|██████▌   | 450/685 [14:35<07:58,  2.04s/it]

💾 Saving at batch 450


Generating Summaries:  73%|███████▎  | 500/685 [16:11<05:59,  1.94s/it]

💾 Saving at batch 500


Generating Summaries:  80%|████████  | 550/685 [17:49<04:36,  2.05s/it]

💾 Saving at batch 550


Generating Summaries:  88%|████████▊ | 600/685 [19:25<02:43,  1.92s/it]

💾 Saving at batch 600


Generating Summaries:  95%|█████████▍| 649/685 [20:56<01:08,  1.91s/it]

💾 Saving at batch 650


Generating Summaries: 100%|██████████| 685/685 [22:05<00:00,  1.94s/it]

💾 Saving at batch 685





Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,Version,ROUGE-1,ROUGE-2,ROUGE-L,BLEU,BERTScore,Avg Token Repetition
0,baseline_beam,0.4091,0.2243,0.3648,0.1415,0.8667,0.5815
1,beam_repetition,0.4675,0.2556,0.4203,0.1643,0.884,0.4734
2,sampling_temp,0.4059,0.1746,0.3347,0.1224,0.8723,0.2246
3,test_sampling_final,0.3932,0.1647,0.3275,0.1275,0.871,0.2154


In [4]:
if __name__ == "__main__":
    model_path = "pritammane105/Custom-Java-Summarisation"

    # Evaluation
    evaluator = SummaryEvaluator()
    files = {
        "custom_topk_val": "/content/drive/MyDrive/custom_val_topk_predictions.csv",
        "custom_topk_text": "/content/drive/MyDrive/custom_test_topk_predictions.csv"
    }
    results_df = evaluator.evaluate_csvs(files)
    display(results_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,Version,ROUGE-1,ROUGE-2,ROUGE-L,BLEU,BERTScore,Avg Token Repetition
0,custom_topk_val,0.0473,0.0011,0.0417,0.0013,0.7776,0.6539
1,custom_topk_text,0.0465,0.0011,0.0405,0.0014,0.7785,0.6535


**Miscellaneous Test Samples**
----------------------
----------------------
----------------------
----------------------

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load fine-tuned model from Google Drive
model_dir = "/content/drive/MyDrive/codet5_checkpoints/checkpoint-46386"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

def generate_summary(code_snippet):
    inputs = tokenizer(code_snippet, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    outputs = model.generate(**inputs, max_length=128, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

java_code = """
public int add(int a, int b) {
    return a + b;
}
"""
print(generate_summary(java_code))


Add two integers.

@param a first integer
@param b second integer
@return the sum of the two integers


In [None]:
java_code = """
public int perform(int a, int b) {
    return a - b;
}
"""
print(generate_summary(java_code))

Computes the difference between two integers.

@param a first integer
@param b second integer
@return the difference


In [None]:
java_code = """
public int add(int a, int b) {
    return a - b;
}
"""
print(generate_summary(java_code))

Add two integers.

@param a the first integer
@param b the second integer
@return the result


In [None]:
test_example = dataset["train"][0]
test_code = test_example["code"]
print("ORIGINAL DOCSTRING")
print(test_example["docstring"])
print("\nTESTING CODE:")
print(test_code)

generated_summary = generate_summary(test_code)
print("\nGENERATED SUMMARY:")
print(generated_summary)

ORIGINAL DOCSTRING
Compare the supplied plaintext password to a hashed password.

@param   passwd  Plaintext password.
@param   hashed  scrypt hashed password.

@return true if passwd matches hashed value.

TESTING CODE:
public static boolean check(String passwd, String hashed) {
        try {
            String[] parts = hashed.split("\\$");

            if (parts.length != 5 || !parts[1].equals("s0")) {
                throw new IllegalArgumentException("Invalid hashed value");
            }

            long params = Long.parseLong(parts[2], 16);
            byte[] salt = decode(parts[3].toCharArray());
            byte[] derived0 = decode(parts[4].toCharArray());

            int N = (int) Math.pow(2, params >> 16 & 0xffff);
            int r = (int) params >> 8 & 0xff;
            int p = (int) params      & 0xff;

            byte[] derived1 = SCrypt.scrypt(passwd.getBytes("UTF-8"), salt, N, r, p, 32);

            if (derived0.length != derived1.length) return false;

          

In [None]:
test_example = dataset["test"][0]
test_code = test_example["code"]
print("ORIGINAL DOCSTRING")
print(test_example["docstring"])
print("\nTESTING CODE:")
print(test_code)

generated_summary = generate_summary(test_code)
print("\nGENERATED SUMMARY:")
print(generated_summary)

ORIGINAL DOCSTRING
Makes sure the fast-path emits in order.
@param value the value to emit or queue up
@param delayError if true, errors are delayed until the source has terminated
@param disposable the resource to dispose if the drain terminates

TESTING CODE:
protected final void fastPathOrderedEmit(U value, boolean delayError, Disposable disposable) {
        final Observer<? super V> observer = downstream;
        final SimplePlainQueue<U> q = queue;

        if (wip.get() == 0 && wip.compareAndSet(0, 1)) {
            if (q.isEmpty()) {
                accept(observer, value);
                if (leave(-1) == 0) {
                    return;
                }
            } else {
                q.offer(value);
            }
        } else {
            q.offer(value);
            if (!enter()) {
                return;
            }
        }
        QueueDrainHelper.drainLoop(q, observer, delayError, disposable, this);
    }

GENERATED SUMMARY:
Fast path emit.

@param value the 

In [None]:
test_example = dataset["test"][10]
test_code = test_example["code"]
print("ORIGINAL DOCSTRING")
print(test_example["docstring"])
print("\nTESTING CODE:")
print(test_code)

generated_summary = generate_summary(test_code)
print("\nGENERATED SUMMARY:")
print(generated_summary)

ORIGINAL DOCSTRING
Wraps a SingleSource into a Maybe.

<dl>
<dt><b>Scheduler:</b></dt>
<dd>{@code fromSingle} does not operate by default on a particular {@link Scheduler}.</dd>
</dl>
@param <T> the target type
@param singleSource the SingleSource to convert from
@return the new Maybe instance
@throws NullPointerException if single is null

TESTING CODE:
@CheckReturnValue
    @NonNull
    @SchedulerSupport(SchedulerSupport.NONE)
    public static <T> Maybe<T> fromSingle(SingleSource<T> singleSource) {
        ObjectHelper.requireNonNull(singleSource, "singleSource is null");
        return RxJavaPlugins.onAssembly(new MaybeFromSingle<T>(singleSource));
    }

GENERATED SUMMARY:
Construct a Maybe from a SingleSource

<pre>
{@code

import static com.oath.cyclops.reactor.RxJavaPlugins.fromSingle(new SingleSource() {
public void run(Object a) {
if (a != null) {
System.out.println(a);
} else {
System.out.println(a);
}
}
}
}
</pre>


@param singleSource a SingleSource
@param <T> the type of 

In [None]:
# 🧪 Analysis Script
import pandas as pd
import evaluate
import numpy as np
from datasets import Dataset
from collections import Counter
import os
import torch

# File paths for each decoding strategy
files = {
    "baseline_beam": "/content/drive/MyDrive/codet5_val_predictions.csv",
    "beam_repetition": "/content/drive/MyDrive/codet5_val_beam_repetition.csv",
    "sampling_temp": "/content/drive/MyDrive/codet5_val_sampling_temp.csv"
}

# Load evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

# Function to compute average token repetition per summary
def avg_token_repetition(predictions):
    rep_counts = []
    for text in predictions:
        tokens = text.strip().split()
        counts = Counter(tokens)
        repeated_tokens = sum(v for v in counts.values() if v > 1)
        rep_counts.append(repeated_tokens / max(1, len(tokens)))
    return np.mean(rep_counts)

# Run metrics for each version
all_results = []

for name, path in files.items():
    if not os.path.exists(path):
        print(f" File not found: {path}")
        continue

    df = pd.read_csv(path)
    predictions = df["predicted_summary"].astype(str).tolist()
    references = df["gold_summary"].astype(str).tolist()

    # Compute metrics
    rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    bleu_score = bleu.compute(
        predictions=predictions,
        references=references
    )
    bert_score = bertscore.compute(predictions=predictions, references=references, lang="en", device="cuda")
    exact_match = sum(p.strip() == r.strip() for p, r in zip(predictions, references)) / len(references)
    repetition = avg_token_repetition(predictions)

    all_results.append({
        "Version": name,
        "ROUGE-1": round(rouge_scores["rouge1"], 4),
        "ROUGE-2": round(rouge_scores["rouge2"], 4),
        "ROUGE-L": round(rouge_scores["rougeL"], 4),
        "BLEU": round(bleu_score["bleu"], 4),
        "BERTScore": round(np.mean(bert_score["f1"]), 4),
        "Exact Match": round(exact_match, 4),
        "Avg Token Repetition": round(repetition, 4)
    })

# Display comparison table
results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values("ROUGE-L", ascending=False)
display(results_df)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,Version,ROUGE-1,ROUGE-2,ROUGE-L,BLEU,BERTScore,Exact Match,Avg Token Repetition
1,beam_repetition,0.4677,0.2558,0.4205,0.1643,0.884,0.0068,0.4734
0,baseline_beam,0.4094,0.2245,0.3648,0.1415,0.8667,0.0058,0.5815
2,sampling_temp,0.4059,0.1746,0.3348,0.1224,0.8723,0.0014,0.2246


In [None]:
# SCRIPT 2: Sampling with Temperature + Top-k
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import pandas as pd
import os

# Configs
model_dir = "/content/drive/MyDrive/codet5_checkpoints/checkpoint-46386"
save_path = "/content/drive/MyDrive/codet5_val_sampling_temp.csv"
save_every = 50

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir).to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

dataset = load_dataset("code_x_glue_ct_code_to_text", "java")
val_data = dataset["validation"]
gold_summaries = val_data["docstring"]

def tokenize_fn(example):
    return tokenizer(example["code"], truncation=True, padding="max_length", max_length=512)

val_tokenized = val_data.map(tokenize_fn, batched=True)
val_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
val_loader = DataLoader(val_tokenized, batch_size=16)

generated_summaries = []
start_batch = 0

if os.path.exists(save_path):
    df_existing = pd.read_csv(save_path)
    generated_summaries = df_existing["predicted_summary"].tolist()
    start_batch = len(generated_summaries) // 16
    print(f"Resuming from batch {start_batch} (already {len(generated_summaries)} predictions)")

for i, batch in enumerate(tqdm(val_loader, desc="Generating (Sampling + Temp)")):
    if i < start_batch:
        continue

    input_ids = batch["input_ids"].to(model.device)
    attention_mask = batch["attention_mask"].to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            top_k=50,
            temperature=0.7,
            max_new_tokens=64,
            repetition_penalty=1.2,
            early_stopping=True
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    generated_summaries.extend(decoded)

    if (i + 1) % save_every == 0 or (i + 1) == len(val_loader):
        print(f"Saving at batch {i + 1}")
        df = pd.DataFrame({
            "gold_summary": gold_summaries[:len(generated_summaries)],
            "predicted_summary": generated_summaries
        })
        df.to_csv(save_path, index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/26.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/4.25M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/9.38M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/164923 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5183 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10955 [00:00<?, ? examples/s]

Map:   0%|          | 0/5183 [00:00<?, ? examples/s]

Generating (Sampling + Temp):  15%|█▌        | 50/324 [01:31<08:20,  1.83s/it]

Saving at batch 50


Generating (Sampling + Temp):  31%|███       | 100/324 [03:04<07:17,  1.96s/it]

Saving at batch 100


Generating (Sampling + Temp):  46%|████▋     | 150/324 [04:38<05:31,  1.90s/it]

Saving at batch 150


Generating (Sampling + Temp):  62%|██████▏   | 200/324 [06:12<03:41,  1.79s/it]

Saving at batch 200


Generating (Sampling + Temp):  77%|███████▋  | 250/324 [07:44<02:21,  1.91s/it]

Saving at batch 250


Generating (Sampling + Temp):  93%|█████████▎| 300/324 [09:19<00:48,  2.01s/it]

Saving at batch 300


Generating (Sampling + Temp): 100%|██████████| 324/324 [10:03<00:00,  1.86s/it]

Saving at batch 324



