In [None]:
pip install jupyter ipywidgets transformers torch pandas

Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.3.6-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)

In [None]:
pip install tokenizers



In [None]:
import pandas as pd
import torch
import numpy as np
import os
from transformers import RobertaTokenizer, RobertaModel, LongformerTokenizer, LongformerModel, T5EncoderModel
from tqdm import tqdm

# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Input dataset; later cells read its 'code', 'vulnerability_exists' and
# 'vulnerability_list' columns.
df = pd.read_csv("combined_dataset_cleaned.csv")

# CodeBERT: tokenizer plus encoder, with the encoder moved to the selected device.
_codebert_checkpoint = 'microsoft/codebert-base'
tokenizer_codebert = RobertaTokenizer.from_pretrained(_codebert_checkpoint, do_lower_case=False)
model_codebert = RobertaModel.from_pretrained(_codebert_checkpoint)
model_codebert = model_codebert.to(device)

# Longformer: same pattern, using the 4096-position checkpoint.
_longformer_checkpoint = 'allenai/longformer-base-4096'
tokenizer_long = LongformerTokenizer.from_pretrained(_longformer_checkpoint)
model_long = LongformerModel.from_pretrained(_longformer_checkpoint)
model_long = model_long.to(device)

# CodeT5: only the encoder stack is loaded; its checkpoint is read with RobertaTokenizer.
_codet5_checkpoint = 'Salesforce/codet5-base'
tokenizer_T5 = RobertaTokenizer.from_pretrained(_codet5_checkpoint)
model_T5 = T5EncoderModel.from_pretrained(_codet5_checkpoint)
model_T5 = model_T5.to(device)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

# Token length statistics

In [None]:
def print_token_stats(tokenizer, name, codes=None, max_tokens=None):
    """Print token-count statistics for a collection of code snippets.

    Args:
        tokenizer: Object exposing ``tokenize(text)`` that returns a sequence
            of tokens.
        name: Label printed in the report header. When ``max_tokens`` is not
            given it also selects the threshold: 4094 for ``"Longformer"``,
            510 for any other name.
        codes: Iterable of snippets to measure; each item is converted with
            ``str()`` before tokenizing. Defaults to the notebook-level
            ``df['code']`` column.
        max_tokens: Optional explicit threshold for the "Number of codes >"
            line, overriding the name-based default.

    Returns:
        None. The statistics are written to stdout.
    """
    if codes is None:
        codes = df['code']
    if max_tokens is None:
        max_tokens = 4094 if name == "Longformer" else 510

    token_lengths = [len(tokenizer.tokenize(str(code))) for code in codes]

    print(f"\n{name} Tokenizer Stats:")
    if not token_lengths:
        # max() and the average are undefined for an empty collection.
        print("No code snippets to analyse.")
        return

    over_limit = sum(1 for length in token_lengths if length > max_tokens)
    print(f"Maximum token number: {max(token_lengths)}")
    print(f"Average token number: {sum(token_lengths) / len(token_lengths)}")
    print(f"Number of codes > {max_tokens}: {over_limit}")

# Report the same statistics for each tokenizer, in a fixed order.
for _tokenizer, _label in (
    (tokenizer_codebert, "CodeBERT"),
    (tokenizer_long, "Longformer"),
    (tokenizer_T5, "CodeT5"),
):
    print_token_stats(_tokenizer, _label)


CodeBERT Tokenizer Stats:
Maximum token number: 30019
Average token number: 2128.625558867362
Number of codes > 510: 4750

Longformer Tokenizer Stats:
Maximum token number: 30019
Average token number: 2128.625558867362
Number of codes > 4094: 574

CodeT5 Tokenizer Stats:
Maximum token number: 29100
Average token number: 2134.971125186289
Number of codes > 510: 4742


In [None]:
def extract_codebert_features(code_snippet, max_length=512):
    """Embed a single snippet with CodeBERT.

    The snippet is tokenized, truncated and padded to exactly ``max_length``
    positions, run through ``model_codebert`` without gradient tracking, and
    the hidden state at sequence position 0 is returned.

    Args:
        code_snippet: Source text to embed.
        max_length: Fixed sequence length used for truncation and padding.

    Returns:
        The position-0 hidden state as a squeezed NumPy array on the CPU.
    """
    model_codebert.eval()

    encoded = tokenizer_codebert(
        code_snippet,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        hidden_states = model_codebert(**encoded).last_hidden_state

    first_position = hidden_states[:, 0, :]
    return first_position.squeeze().cpu().numpy()

def extract_codebert_features_long_code(code_snippet, max_length=512, stride=256):
    """Embed an arbitrarily long snippet with CodeBERT using sliding windows.

    The snippet is tokenized once without special tokens and cut into windows
    of at most ``max_length - 2`` token ids. Each window is wrapped in the
    tokenizer's CLS/SEP ids so that sequence position 0 is the leading special
    token, matching what ``extract_codebert_features`` reads. The per-window
    position-0 hidden states are combined with an element-wise maximum.

    Args:
        code_snippet: Source text to embed.
        max_length: Maximum model input length per window, including the two
            special tokens.
        stride: Number of tokens the window start advances each step. Values
            larger than the window size are clamped so no tokens are skipped.

    Returns:
        A 1-D NumPy array of length ``model_codebert.config.hidden_size``
        (all zeros when the snippet produces no tokens). This matches the
        1-D shape returned by the other extractors in this notebook.

    Raises:
        ValueError: If ``stride`` is not positive or ``max_length`` leaves no
            room for content tokens.
    """
    window = max_length - 2  # reserve two positions for the CLS/SEP ids added below
    if window <= 0:
        raise ValueError("max_length must be greater than 2")
    if stride <= 0:
        raise ValueError("stride must be a positive integer")
    step = min(stride, window)

    token_ids = tokenizer_codebert(
        code_snippet,
        truncation=False,
        add_special_tokens=False,
    )['input_ids']

    if not token_ids:
        return np.zeros(model_codebert.config.hidden_size)

    cls_id = tokenizer_codebert.cls_token_id
    sep_id = tokenizer_codebert.sep_token_id
    total = len(token_ids)

    model_codebert.eval()
    features = []
    for start in range(0, total, step):
        chunk = token_ids[start:start + window]
        input_ids = torch.tensor([[cls_id, *chunk, sep_id]], device=device)
        attention_mask = torch.ones_like(input_ids)

        with torch.no_grad():
            outputs = model_codebert(input_ids=input_ids, attention_mask=attention_mask)

        features.append(outputs.last_hidden_state[0, 0, :].cpu().numpy())

        # Once a window reaches the last token, any further window would only
        # repeat tokens that are already covered.
        if start + window >= total:
            break

    return np.max(features, axis=0)

In [None]:
def extract_longformer_features(code_snippet, max_length=4096):
    """Embed a single snippet with Longformer.

    The snippet is tokenized, truncated and padded to exactly ``max_length``
    positions, run through ``model_long`` without gradient tracking, and the
    hidden state at sequence position 0 is returned.

    Args:
        code_snippet: Source text to embed.
        max_length: Fixed sequence length used for truncation and padding.

    Returns:
        The position-0 hidden state as a squeezed NumPy array on the CPU.
    """
    model_long.eval()

    tokenizer_kwargs = {
        'return_tensors': 'pt',
        'max_length': max_length,
        'truncation': True,
        'padding': 'max_length',
    }
    encoded = tokenizer_long(code_snippet, **tokenizer_kwargs).to(device)

    with torch.no_grad():
        hidden_states = model_long(**encoded).last_hidden_state

    first_position = hidden_states[:, 0, :]
    return first_position.squeeze().cpu().numpy()

def extract_longformer_features_long_code(code_snippet, max_length=4096, stride=1024):
    """Embed a snippet with Longformer, using sliding windows when it is too long.

    Snippets with at most ``max_length - 2`` tokens are embedded in one pass
    by ``extract_longformer_features``. Longer snippets are cut into
    overlapping token windows; each window is converted back to text,
    embedded, and the per-window vectors are combined with an element-wise
    maximum.

    Args:
        code_snippet: Source text to embed.
        max_length: Maximum model input length per window, including the two
            positions reserved for special tokens.
        stride: Number of tokens the window start advances each step. Values
            larger than the window size are clamped so no tokens are skipped.

    Returns:
        A NumPy feature vector: the single-pass embedding for short snippets,
        otherwise the element-wise maximum over the window embeddings.

    Raises:
        ValueError: If ``stride`` is not positive or ``max_length`` leaves no
            room for content tokens.
    """
    window = max_length - 2  # same two-position reserve the original budget used
    if window <= 0:
        raise ValueError("max_length must be greater than 2")
    if stride <= 0:
        raise ValueError("stride must be a positive integer")
    step = min(stride, window)

    tokens = tokenizer_long.tokenize(code_snippet)
    total = len(tokens)
    if total <= window:
        return extract_longformer_features(code_snippet, max_length)

    features = []
    for start in range(0, total, step):
        segment = tokens[start:start + window]
        segment_code = tokenizer_long.convert_tokens_to_string(segment)
        features.append(extract_longformer_features(segment_code, max_length))

        # Stop once a window reaches the final token: later windows would be
        # strict suffixes of this one and only add redundant forward passes.
        if start + window >= total:
            break

    return np.max(features, axis=0)

In [None]:
def extract_codet5_features(code_snippet, max_length=512):
    """Embed a single snippet with the CodeT5 encoder.

    The snippet is tokenized, truncated and padded to exactly ``max_length``
    positions, run through ``model_T5`` without gradient tracking, and the
    hidden state at sequence position 0 is returned.

    Args:
        code_snippet: Source text to embed.
        max_length: Fixed sequence length used for truncation and padding.

    Returns:
        The position-0 hidden state as a squeezed NumPy array on the CPU.
    """
    model_T5.eval()

    encoded = tokenizer_T5(
        code_snippet,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    ).to(device)

    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']

    with torch.no_grad():
        encoder_output = model_T5(input_ids=input_ids, attention_mask=attention_mask)

    first_position = encoder_output.last_hidden_state[:, 0, :]
    return first_position.squeeze().cpu().numpy()

def extract_codet5_features_long_code(code_snippet, max_length=512, stride=256):
    """Embed a snippet with the CodeT5 encoder, using sliding windows when too long.

    The per-window content budget is ``max_length`` minus the number of
    special tokens the tokenizer adds to a single sequence, so a window is
    never silently truncated when it is re-encoded. Snippets that fit the
    budget are embedded in one pass by ``extract_codet5_features``. Longer
    snippets are cut into overlapping token windows; each window is converted
    back to text, embedded, and the per-window vectors are combined with an
    element-wise maximum.

    Args:
        code_snippet: Source text to embed.
        max_length: Maximum model input length per window, including special
            tokens.
        stride: Number of tokens the window start advances each step. Values
            larger than the window size are clamped so no tokens are skipped.

    Returns:
        A NumPy feature vector: the single-pass embedding for short snippets,
        otherwise the element-wise maximum over the window embeddings.

    Raises:
        ValueError: If ``stride`` is not positive or ``max_length`` leaves no
            room for content tokens.
    """
    # Ask the tokenizer how many special tokens it adds instead of assuming one.
    window = max_length - tokenizer_T5.num_special_tokens_to_add(pair=False)
    if window <= 0:
        raise ValueError("max_length leaves no room for content tokens")
    if stride <= 0:
        raise ValueError("stride must be a positive integer")
    step = min(stride, window)

    tokens = tokenizer_T5.tokenize(code_snippet)
    total = len(tokens)
    if total <= window:
        return extract_codet5_features(code_snippet, max_length)

    features = []
    for start in range(0, total, step):
        segment = tokens[start:start + window]
        segment_code = tokenizer_T5.convert_tokens_to_string(segment)
        features.append(extract_codet5_features(segment_code, max_length))

        # Stop once a window reaches the final token: later windows would be
        # strict suffixes of this one and only add redundant forward passes.
        if start + window >= total:
            break

    return np.max(features, axis=0)

In [None]:
from torch.utils.data import Dataset, DataLoader

class CodeDataset(Dataset):
    """Map-style dataset that yields each code snippet as a ``str``.

    Args:
        codes: Collection of snippets, either a pandas Series or any sequence
            supporting ``len()`` and integer indexing.
    """

    def __init__(self, codes):
        self.codes = codes

    def __len__(self):
        return len(self.codes)

    def __getitem__(self, idx):
        """Return the snippet at position ``idx`` converted with ``str()``."""
        codes = self.codes
        # Indexing a pandas Series with [] is label-based, which raises or
        # returns the wrong row when the index is not 0..n-1 (for example
        # after filtering). .iloc keeps the lookup positional.
        item = codes.iloc[idx] if hasattr(codes, "iloc") else codes[idx]
        return str(item)

# Number of snippets handed to the extractors per loop iteration.
batch_size = 20

# Wrap the 'code' column so it can be iterated in fixed-size, in-order batches.
dataset = CodeDataset(codes=df['code'])
dataloader = DataLoader(
    dataset=dataset,
    shuffle=False,  # keep row order aligned with df
    batch_size=batch_size,
)

def batch_extract_features(batch_codes, model_type):
    """Extract one feature vector per snippet with the selected model.

    Args:
        batch_codes: Iterable of code snippets.
        model_type: One of ``"codebert"``, ``"longformer"`` or ``"codet5"``.

    Returns:
        A list with one feature array per snippet, in input order.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
            Without this check the function would return ``None`` and the
            caller would fail later with an unrelated error.
    """
    if model_type == "codebert":
        return [extract_codebert_features_long_code(code) for code in batch_codes]
    if model_type == "longformer":
        return [extract_longformer_features_long_code(code) for code in batch_codes]
    if model_type == "codet5":
        return [extract_codet5_features_long_code(code) for code in batch_codes]
    raise ValueError(
        f"Unknown model_type {model_type!r}; expected 'codebert', 'longformer' or 'codet5'"
    )

# One accumulator per model; each ends up with one feature array per dataset row.
features_list_codebert, features_list_long, features_list_T5 = [], [], []

# Walk the dataset batch by batch and run all three extractors on every batch.
for code_batch in tqdm(dataloader, desc="Processing batches"):
    features_list_codebert += batch_extract_features(code_batch, "codebert")
    features_list_long += batch_extract_features(code_batch, "longformer")
    features_list_T5 += batch_extract_features(code_batch, "codet5")

Processing batches:   0%|          | 0/269 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1131 > 512). Running this sequence through the model will result in indexing errors
Processing batches:  96%|█████████▋| 259/269 [1:41:10<04:52, 29.28s/it]

In [None]:
# Pair the label columns from the source frame with the extracted feature lists.
_feature_columns = {
    'vulnerability_exists': df['vulnerability_exists'],
    'vulnerability_list': df['vulnerability_list'],
    'features_list_codebert': features_list_codebert,
    'features_list_long': features_list_long,
    'features_list_T5': features_list_T5,
}
new_df = pd.DataFrame(_feature_columns)

# NOTE(review): the feature cells hold NumPy arrays, which to_csv writes as their
# string representation; confirm downstream readers parse that format as intended.
_output_path = 'dataset_with_features_array.csv'
new_df.to_csv(_output_path, index=False)
print("Feature extraction completed, saved to 'dataset_with_features_array.csv'")