In [1]:
pip install jupyter ipywidgets transformers torch pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tokenizers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import RobertaTokenizer, RobertaModel, LongformerTokenizer, LongformerModel, T5EncoderModel
from tqdm import tqdm

# --- Runtime device ---------------------------------------------------------
# Use the "mps" backend when torch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# --- Dataset ----------------------------------------------------------------
# Later cells read the 'code', 'vulnerability_exists' and 'vulnerability_list'
# columns of this frame.
df = pd.read_csv("combined_dataset_cleaned.csv")

# --- Pretrained tokenizers and encoders -------------------------------------
# Every model is moved to `device` right after loading.

# CodeBERT
tokenizer_codebert = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
model_codebert = RobertaModel.from_pretrained('microsoft/codebert-base')
model_codebert = model_codebert.to(device)

# Longformer
tokenizer_long = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model_long = LongformerModel.from_pretrained('allenai/longformer-base-4096')
model_long = model_long.to(device)

# CodeT5: the checkpoint is paired with RobertaTokenizer, and only the encoder
# half is loaded (T5EncoderModel).
tokenizer_T5 = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model_T5 = T5EncoderModel.from_pretrained('Salesforce/codet5-base')
model_T5 = model_T5.to(device)

Using device: mps


In [None]:
def print_token_stats(tokenizer, name, codes=None, limit=None):
    """Print token-count statistics for a collection of code snippets.

    Each snippet is converted with ``str()`` and tokenized with
    ``tokenizer.tokenize``; the maximum and mean token counts are printed,
    followed by how many snippets exceed ``limit`` tokens.

    Args:
        tokenizer: Any object exposing ``tokenize(text)`` that returns a
            sized sequence of tokens.
        name: Label used in the printed header. When ``limit`` is not given
            it also selects the threshold: 4094 for ``"Longformer"``, 510
            for every other name.
        codes: Iterable of snippets to measure. Defaults to the notebook's
            global ``df['code']`` column.
        limit: Token-count threshold for the "Number of codes > ..." line.

    Returns:
        None. All results are printed.
    """
    if codes is None:
        # Fall back to the notebook-level DataFrame.
        codes = df['code']
    if limit is None:
        limit = 4094 if name == "Longformer" else 510

    token_lengths = [len(tokenizer.tokenize(str(code))) for code in codes]

    print(f"\n{name} Tokenizer Stats:")
    if not token_lengths:
        # max() and the mean are undefined for an empty collection.
        print("No code snippets to analyze.")
        return
    print(f"Maximum token number: {max(token_lengths)}")
    print(f"Average token number: {sum(token_lengths) / len(token_lengths)}")
    print(f"Number of codes > {limit}: {sum(1 for x in token_lengths if x > limit)}")

# Report token-length statistics once per tokenizer.
for stats_tokenizer, stats_label in (
    (tokenizer_codebert, "CodeBERT"),
    (tokenizer_long, "Longformer"),
    (tokenizer_T5, "CodeT5"),
):
    print_token_stats(stats_tokenizer, stats_label)


CodeBERT Tokenizer Stats:
Maximum token number: 30019
Average token number: 2128.625558867362
Number of codes > 510: 4750

Longformer Tokenizer Stats:
Maximum token number: 30019
Average token number: 2128.625558867362
Number of codes > 4094: 574

CodeT5 Tokenizer Stats:
Maximum token number: 29100
Average token number: 2134.971125186289
Number of codes > 510: 4742


In [None]:
def extract_codebert_features(code_snippet, max_length=512):
    """Embed one code snippet with CodeBERT.

    The snippet is tokenized with truncation and padding to ``max_length``,
    run through ``model_codebert`` in eval mode without gradient tracking,
    and the hidden state at token position 0 is returned.

    Args:
        code_snippet: Source code text to embed.
        max_length: Length the tokenized input is padded/truncated to.

    Returns:
        NumPy array on the CPU holding the squeezed first-position hidden
        state (one vector when a single snippet is passed).
    """
    encoded = tokenizer_codebert(
        code_snippet,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    encoded = encoded.to(device)

    model_codebert.eval()
    with torch.no_grad():
        hidden_states = model_codebert(**encoded).last_hidden_state

    # Position 0 along the sequence axis is used as the snippet representation.
    first_token_state = hidden_states[:, 0, :]
    return first_token_state.squeeze().cpu().numpy()

def extract_codebert_features_long_code(code_snippet, max_length=512, stride=1024):
    """Embed a snippet of any length with CodeBERT, pooling over token windows.

    Snippets with at most ``max_length - 2`` tokens are delegated to
    ``extract_codebert_features`` directly (the ``- 2`` presumably leaves
    room for the special tokens the tokenizer adds when re-encoding).
    Longer snippets are cut into windows of that many tokens; each window is
    turned back into text, embedded on its own, and the window vectors are
    combined with an element-wise maximum.

    Args:
        code_snippet: Source code text to embed.
        max_length: Model input length passed to ``extract_codebert_features``.
        stride: Requested distance, in tokens, between window starts. A
            stride larger than the window is clamped to the window size,
            because otherwise the tokens between the end of one window and
            the start of the next would never be embedded (with the defaults
            that was tokens 510..1023 of every 1024-token span).

    Returns:
        NumPy feature vector; ``np.zeros(768)`` if no window was produced.
    """
    window = max_length - 2
    tokens = tokenizer_codebert.tokenize(code_snippet)
    if len(tokens) <= window:
        return extract_codebert_features(code_snippet, max_length)

    # Never step further than one window, so consecutive windows leave no gap.
    step = window if 0 < window < stride else stride

    features = []
    for start in range(0, len(tokens), step):
        segment = tokens[start:start + window]
        if segment:
            segment_code = tokenizer_codebert.convert_tokens_to_string(segment)
            features.append(extract_codebert_features(segment_code, max_length))
        if start + window >= len(tokens):
            # This window already reached the last token; any further window
            # would only repeat a suffix of it.
            break
    return np.max(features, axis=0) if features else np.zeros(768)

def extract_longformer_features(code_snippet, max_length=4096):
    """Embed one code snippet with Longformer.

    The snippet is tokenized with truncation and padding to ``max_length``,
    run through ``model_long`` in eval mode without gradient tracking, and
    the hidden state at token position 0 is returned.

    Args:
        code_snippet: Source code text to embed.
        max_length: Length the tokenized input is padded/truncated to.

    Returns:
        NumPy array on the CPU holding the squeezed first-position hidden
        state (one vector when a single snippet is passed).
    """
    encoded = tokenizer_long(
        code_snippet,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    encoded = encoded.to(device)

    model_long.eval()
    with torch.no_grad():
        hidden_states = model_long(**encoded).last_hidden_state

    # Position 0 along the sequence axis is used as the snippet representation.
    first_token_state = hidden_states[:, 0, :]
    return first_token_state.squeeze().cpu().numpy()

def extract_longformer_features_long_code(code_snippet, max_length=4096, stride=1024):
    """Embed a snippet of any length with Longformer, pooling over token windows.

    Snippets with at most ``max_length - 2`` tokens are delegated to
    ``extract_longformer_features`` directly (the ``- 2`` presumably leaves
    room for the special tokens the tokenizer adds when re-encoding).
    Longer snippets are cut into overlapping windows of that many tokens,
    starting every ``stride`` tokens; each window is turned back into text,
    embedded on its own, and the window vectors are combined with an
    element-wise maximum.

    Windowing stops at the first window that reaches the last token. Later
    start offsets would only yield suffixes of that window, costing extra
    full-length forward passes and over-weighting the tail in the max-pool.

    Args:
        code_snippet: Source code text to embed.
        max_length: Model input length passed to ``extract_longformer_features``.
        stride: Distance, in tokens, between consecutive window starts.

    Returns:
        NumPy feature vector; ``np.zeros(768)`` if no window was produced.
    """
    window = max_length - 2
    tokens = tokenizer_long.tokenize(code_snippet)
    if len(tokens) <= window:
        return extract_longformer_features(code_snippet, max_length)

    features = []
    for start in range(0, len(tokens), stride):
        segment = tokens[start:start + window]
        if segment:
            segment_code = tokenizer_long.convert_tokens_to_string(segment)
            features.append(extract_longformer_features(segment_code, max_length))
        if start + window >= len(tokens):
            # The tail is covered; remaining offsets would be redundant.
            break
    return np.max(features, axis=0) if features else np.zeros(768)

def extract_codet5_features(code_snippet, max_length=512):
    """Embed one code snippet with the CodeT5 encoder.

    The snippet is tokenized with truncation and padding to ``max_length``;
    only ``input_ids`` and ``attention_mask`` are handed to ``model_T5``,
    which runs in eval mode without gradient tracking. The hidden state at
    token position 0 is returned.

    Args:
        code_snippet: Source code text to embed.
        max_length: Length the tokenized input is padded/truncated to.

    Returns:
        NumPy array on the CPU holding the squeezed first-position hidden
        state (one vector when a single snippet is passed).
    """
    encoded = tokenizer_T5(
        code_snippet,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    encoded = encoded.to(device)

    model_T5.eval()
    with torch.no_grad():
        encoder_output = model_T5(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        )

    # Position 0 along the sequence axis is used as the snippet representation.
    first_token_state = encoder_output.last_hidden_state[:, 0, :]
    return first_token_state.squeeze().cpu().numpy()

def extract_codet5_features_long_code(code_snippet, max_length=512, stride=256):
    """Embed a snippet of any length with CodeT5, pooling over token windows.

    The window size is ``max_length`` minus the number of special tokens the
    tokenizer adds to a single sequence, so a window re-encoded by
    ``extract_codet5_features`` fits in ``max_length`` without losing its
    last token to truncation. (A fixed ``max_length - 1`` is one slot too
    many for a tokenizer that adds both a start and an end token.)

    Snippets that fit in one window are delegated to
    ``extract_codet5_features`` directly. Longer snippets are cut into
    overlapping windows starting every ``stride`` tokens; each window is
    turned back into text, embedded on its own, and the window vectors are
    combined with an element-wise maximum. Windowing stops at the first
    window that reaches the last token, since later offsets would only
    repeat a suffix of it.

    Args:
        code_snippet: Source code text to embed.
        max_length: Model input length passed to ``extract_codet5_features``.
        stride: Distance, in tokens, between consecutive window starts.

    Returns:
        NumPy feature vector; ``np.zeros(768)`` if no window was produced.
    """
    # Ask the tokenizer how many slots it reserves instead of hard-coding it.
    window = max_length - tokenizer_T5.num_special_tokens_to_add()
    tokens = tokenizer_T5.tokenize(code_snippet)
    if len(tokens) <= window:
        return extract_codet5_features(code_snippet, max_length)

    features = []
    for start in range(0, len(tokens), stride):
        segment = tokens[start:start + window]
        if segment:
            segment_code = tokenizer_T5.convert_tokens_to_string(segment)
            features.append(extract_codet5_features(segment_code, max_length))
        if start + window >= len(tokens):
            # The tail is covered; remaining offsets would be redundant.
            break
    return np.max(features, axis=0) if features else np.zeros(768)

In [None]:
def extract_features(code, model_type):
    """Embed one snippet with the requested model, best-effort.

    Args:
        code: The snippet; it is converted with ``str()`` before embedding.
        model_type: One of ``"codebert"``, ``"longformer"`` or ``"codet5"``.

    Returns:
        The feature vector produced by the matching ``*_long_code``
        extractor. If that extractor raises, the error is printed and
        ``np.zeros(768)`` is returned so a long batch run keeps going.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
            This is checked outside the best-effort ``try`` so a typo fails
            loudly instead of silently yielding ``None`` for every row.
    """
    supported = ("codebert", "longformer", "codet5")
    if model_type not in supported:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported}"
        )

    try:
        if model_type == "codebert":
            return extract_codebert_features_long_code(str(code))
        if model_type == "longformer":
            return extract_longformer_features_long_code(str(code))
        return extract_codet5_features_long_code(str(code))
    except Exception as e:
        # Deliberate best-effort: report the failure and substitute zeros.
        print(f"Error processing {model_type} for code snippet: {e}")
        return np.zeros(768)

# One output list per model; entry i of each list belongs to row i of df['code'].
features_list_codebert, features_list_long, features_list_T5 = [], [], []

# (model key understood by extract_features, list that collects its vectors)
feature_sinks = (
    ("codebert", features_list_codebert),
    ("longformer", features_list_long),
    ("codet5", features_list_T5),
)

for code in tqdm(df['code'], desc="Extracting features"):
    # Each snippet is embedded by the three models in turn.
    for model_key, sink in feature_sinks:
        sink.append(extract_features(code, model_key))

Extracting features: 100%|████████████████| 5368/5368 [2:39:21<00:00,  1.78s/it]


In [10]:
# Labels are copied from the source frame; each feature column holds the
# per-row vectors collected by the extraction loop.
# NOTE(review): those cells are NumPy arrays and CSV can only store them as
# text - confirm that whatever reads this file parses them back as intended.
output_columns = {}
output_columns['vulnerability_exists'] = df['vulnerability_exists']
output_columns['vulnerability_list'] = df['vulnerability_list']
output_columns['features_list_codebert'] = features_list_codebert
output_columns['features_list_long'] = features_list_long
output_columns['features_list_T5'] = features_list_T5

new_df = pd.DataFrame(output_columns)

# Write without the index column.
new_df.to_csv('dataset_with_features_array.csv', index=False)
print("Feature extraction completed, saved to 'dataset_with_features_array.csv'")

Feature extraction completed, saved to 'dataset_with_features_array.csv'
