In [6]:
from transformers import BertTokenizer, BertModel
import os
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from transformers import get_linear_schedule_with_warmup
import gc
import numpy as np
import pandas as pd
import time


  from .autonotebook import tqdm as notebook_tqdm
2025-11-08 12:20:39.434883: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:

data_path = './data/VulnExtractData/ffmpeg_test'
train_path = data_path + '/Train'
test_path = data_path + '/Test'
train_path,test_path


('./data/VulnExtractData/ffmpeg_test/Train',
 './data/VulnExtractData/ffmpeg_test/Test')

In [4]:
task_list = ['AF','BF','CL']

In [5]:



def train_model_svm(X_train,Y_train,X_test,Y_test):
    param_grid = {
        'svc__C': [0.1, 1, 10, 100],
        'svc__kernel': ['linear', 'rbf', 'poly'],
        'svc__degree': [2, 3, 4],
        'svc__gamma': ['scale', 'auto']
    }
    

    pipeline = make_pipeline(TfidfVectorizer(ngram_range=(1,1)), SVC(probability=True))
    grid_search = GridSearchCV(pipeline,param_grid,verbose = 2,n_jobs = -1)

    grid_search.fit(X_train,Y_train)

    print(f"Best parameters: {grid_search.best_params_}")

    Y_pred = grid_search.best_estimator_.predict(X_test)

    print(classification_report(Y_test, Y_pred, digits=4))

def train_model_svm_with_features(X_train,Y_train,X_test,Y_test):
    param_grid = {
        'svc__C': [0.1, 1, 10, 100],
        'svc__kernel': ['linear', 'rbf', 'poly'],
        'svc__degree': [2, 3, 4],
        'svc__gamma': ['scale', 'auto']
    }

    preprocessor = ColumnTransformer([
        ('tfidf', TfidfVectorizer(), 'Description'),  # only your text column name here
        ('scaler', StandardScaler(), X_train.columns[1:])
    ], remainder='drop')

    pipeline = make_pipeline(preprocessor, SVC(probability=True))

    grid_search = GridSearchCV(pipeline,param_grid,verbose = 2,n_jobs = -1)

    grid_search.fit(X_train,Y_train)

    print(f"Best parameters: {grid_search.best_params_}")

    Y_pred = grid_search.best_estimator_.predict(X_test)

    print(classification_report(Y_test, Y_pred, digits=4))

 

In [None]:


def prepare_data(task_id,with_features = False):    
    folder_train_path = train_path + '/' + task_id

    all_train_files = []
    for root, dirs, files in os.walk(folder_train_path):
        for file in files:
            full_path = os.path.join(root, file)
            all_train_files.append(full_path)

    dfs = [pd.read_csv(file) for file in all_train_files]

    folder_test_path = test_path + '/' + task_id

    all_test_files = []
    for root, dirs, files in os.walk(folder_test_path):
        for file in files:
            full_path = os.path.join(root, file)
            all_test_files.append(full_path)

    dfs_train = [pd.read_csv(file) for file in all_train_files]

    dfs_test = [pd.read_csv(file) for file in all_test_files]

    df_train = pd.concat(dfs_train, ignore_index=True)

    df_test  = pd.concat(dfs_test, ignore_index=True)

    #use only Description for training
    X_train_descriptions = df_train['Description']
    X_test_descriptions = df_test['Description']

    if with_features == True:
        X_train_features = df_train.drop(columns=['Description', 'label', 'CVE_ID'])
        X_test_features = df_test.drop(columns=['Description', 'label', 'CVE_ID'])

        X_train = pd.concat([X_train_descriptions,X_train_features],axis=1)
        X_test = pd.concat([X_test_descriptions,X_test_features],axis=1)
    else:
        X_train = X_train_descriptions
        X_test = X_test_descriptions

    Y_train = df_train['label'].apply(lambda x:0 if x==4 else 1).values
    Y_test = df_test['label'].apply(lambda x:0 if x==4 else 1).values
   
    return X_train,Y_train,X_test,Y_test



for task_id in task_list:
    X_train,Y_train,X_test,Y_test = prepare_data(task_id, with_features=False)
    #train_model_svm(X_train,Y_train,X_test,Y_test)
    X_train,Y_train,X_test,Y_test = prepare_data(task_id, with_features=True)
    train_model_svm_with_features(X_train,Y_train,X_test,Y_test)



NameError: name 'train_model_svm_with_features' is not defined

In [None]:
class TextDataset(Dataset):
    def __init__(self,df,tokenizer,max_len = 512):
        self.tokenizer = tokenizer
        self.df = df
        self.max_len = max_len
        self.texts = df['Description']
        self.labels =  df['label'].apply(lambda x:0 if x==4 else 1).values
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self,index):
        text = self.texts[index]
        labels = self.labels[index]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
            padding=False 
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),  # Remove the batch dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels, dtype=torch.long)
        }
    

class TextFeatureDataset(Dataset):
    def __init__(self,df,tokenizer,max_len = 512):
        
        self.tokenizer = tokenizer
        self.df = df
        feature_cols = [col for col in df.columns if col not in ['Description', 'CVE_ID', 'label']]
        self.max_len = max_len
        self.features = df[feature_cols].astype(float)
        self.texts = df['Description']
        self.labels =  df['label'].apply(lambda x:0 if x==4 else 1).values

            
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,index):
        text = self.texts[index]
        labels = self.labels[index]
        features = self.features.iloc[index].to_numpy(dtype=np.float32, copy=True)
        features = torch.from_numpy(features).contiguous() 
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
            padding=False 
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),  # Remove the batch dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'features': features,
            'labels': torch.tensor(labels, dtype=torch.long)
        }    

class BertClassifierWithFeatures(nn.Module):
    def __init__(self,bert_model_name='bert-base-uncased',num_additional_features=16, num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name,num_labels=num_classes)
        self.classifier = nn.Linear(self.bert.config.hidden_size + num_additional_features, num_classes)
        
    def forward(self,input_ids,attention_mask,features):
        outputs = self.bert(input_ids,attention_mask = attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        combined = torch.cat((cls_embedding, features), dim=1)
        outputs = self.classifier(combined)
        return outputs

def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]
    features = [item['features'] for item in batch]
    
  
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    features = torch.stack(features)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'features': features,
        'labels': labels
    }

def train_model(model,model_class ,train_loader,val_loader, device, optimizer, scheduler, epochs=3):
    model.to(device)
    losses=[],accuracies= []
    for epoch in range(epochs):
        model.train()
        n_correct = 0
        n_samples = 0
        total_loss = 0
        for i,batch in enumerate(train_loader):

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            features = batch['features'].to(device)
        
            model.zero_grad()
            if model_class  ==  BertClassifierWithFeatures:
                outputs = model(input_ids = input_ids,attention_mask = attention_mask,features=features)
                loss = nn.CrossEntropyLoss()(outputs, labels)
                logits = torch.softmax(outputs,dim=1)
                preds = torch.argmax(logits, dim=1)
            else:
                outputs = model(input_ids,attention_mask = attention_mask,labels=labels)
                loss = outputs.loss
                preds = torch.argmax(outputs.logits)

            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

            n_samples += labels.shape[0]
            n_correct += (preds == labels).sum().item()
            
            if i % 20 == 0 :
                print(f'Epoch:{epoch} Step:{i+1}/{len(train_loader)} Loss:{loss.item()}')

        avg_loss = total_loss/len(train_loader)
        accuracy = n_correct/n_samples

        losses.append(avg_loss)
        accuracies.append(accuracy)
        losses.append(total_loss/len(train_loader))
            

        
def run_training(model_name,model_class,tokenizer,df_train,df_val,epochs = 3,learning_rate = 5e-5):
    gc.collect() 
    torch.cuda.empty_cache()      # releases cached GPU memory back to the driver

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_dataset = TextFeatureDataset(df_train,tokenizer)
    val_dataset = TextFeatureDataset(df_val,tokenizer)


    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn,num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=8, collate_fn=collate_fn,num_workers=0)

    if model_class == BertClassifierWithFeatures:
        num_features = train_dataset.features.shape[1]  # <- infer!
    #   print(num_features)
        model = model_class(model_name,num_additional_features = num_features, num_classes=2)
        
    else: 
        model = model_class.from_pretrained(model_name, num_labels=2)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    loss_list,accuracy_list = train_model(model,
                                          model_class=model_class,
                                          train_loader=train_loader, 
                                          device=device,
                                          optimizer=optimizer, 
                                          scheduler=scheduler, 
                                          epochs=epochs
                                        )
    

    torch.cuda.synchronize()
    gc.collect()
    torch.cuda.empty_cache()
    print("Training finished cleanly.")
    return model
    

def evaluate_model(model,model_class,df_train,tokenizer):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    test_dataset = TextFeatureDataset(df_train,tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn,num_workers=0)

    n_correct = 0
    n_samples = 0
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            features = batch['features'].to(device)
            
       
            if model_class  ==  BertClassifierWithFeatures:
                outputs = model(input_ids = input_ids,attention_mask = attention_mask,features=features)
                logits = torch.softmax(outputs,dim=1)
                loss = nn.CrossEntropyLoss()(outputs, labels)
                preds = torch.argmax(logits,dim = 1)
                
            else:
                outputs = model(input_ids,attention_mask = attention_mask)
                preds = torch.argmax(outputs.logits,dim = 1)
                loss = outputs.loss
                
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

            n_samples += labels.shape[0]
            n_correct += (preds == labels).sum().item()
     
    print(100 * n_correct/ n_samples)
    print(classification_report(true_labels, predictions, target_names=['Negative', 'Positive'], digits=4))
    return 100 * n_correct/ n_samples
    

    


In [None]:

model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)


def prepare_data(task_id):    
    folder_train_path = train_path + '/' + task_id

    all_train_files = []
    for root, dirs, files in os.walk(folder_train_path):
        for file in files:
            full_path = os.path.join(root, file)
            all_train_files.append(full_path)

    dfs = [pd.read_csv(file) for file in all_train_files]

    folder_test_path = test_path + '/' + task_id

    all_test_files = []
    for root, dirs, files in os.walk(folder_test_path):
        for file in files:
            full_path = os.path.join(root, file)
            all_test_files.append(full_path)

    dfs_train = [pd.read_csv(file) for file in all_train_files]

    dfs_test = [pd.read_csv(file) for file in all_test_files]

    df_train = pd.concat(dfs_train, ignore_index=True)

    df_test  = pd.concat(dfs_test, ignore_index=True)

    return df_train,df_test


models = {}
for task_id in task_list:

    df_train ,df_test = prepare_data(task_id)

    model_class = AutoModelForSequenceClassification
    model = run_training(model_name,model_class,tokenizer,df_train,df_test)

    evaluate_model(model=model,model_class=model_class,df_train=df_test,tokenizer=tokenizer)
    key = f'text_only_{task_id}'
    models[key] = model
    time.sleep(5)

    model_class = BertClassifierWithFeatures    

    model = run_training(model_name,model_class,tokenizer,df_train,df_test)
    evaluate_model(model=model,model_class=model_class,df_train=df_test,tokenizer=tokenizer)
    key = f'text_and_features_{task_id}'
    models[key] = model
    time.sleep(5)



    

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:0 Step:1/238 Loss:0.4110304117202759
Epoch:0 Step:21/238 Loss:0.269157350063324
Epoch:0 Step:41/238 Loss:0.25802895426750183
Epoch:0 Step:61/238 Loss:0.4497813880443573
Epoch:0 Step:81/238 Loss:0.1657324880361557
Epoch:0 Step:101/238 Loss:0.5277336239814758
Epoch:0 Step:121/238 Loss:0.6443719267845154
Epoch:0 Step:141/238 Loss:0.22751185297966003
Epoch:0 Step:161/238 Loss:0.09248163551092148
Epoch:0 Step:181/238 Loss:0.16462604701519012
Epoch:0 Step:201/238 Loss:0.2235037088394165
Epoch:0 Step:221/238 Loss:0.3200359642505646
Epoch:1 Step:1/238 Loss:0.04552358761429787
Epoch:1 Step:21/238 Loss:0.4247938096523285
Epoch:1 Step:41/238 Loss:0.9360978603363037
Epoch:1 Step:61/238 Loss:0.13442298769950867
Epoch:1 Step:81/238 Loss:0.012406101450324059
Epoch:1 Step:101/238 Loss:0.00820823572576046
Epoch:1 Step:121/238 Loss:0.03346443921327591
Epoch:1 Step:141/238 Loss:0.05825212597846985
Epoch:1 Step:161/238 Loss:0.3345009684562683
Epoch:1 Step:181/238 Loss:0.15711615979671478
Epoch:1 Ste

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:0 Step:1/238 Loss:0.5223184823989868
Epoch:0 Step:21/238 Loss:0.09298431873321533
Epoch:0 Step:41/238 Loss:0.035009901970624924
Epoch:0 Step:61/238 Loss:0.03562426567077637
Epoch:0 Step:81/238 Loss:0.42511019110679626
Epoch:0 Step:101/238 Loss:0.0465020090341568
Epoch:0 Step:121/238 Loss:0.49784302711486816
Epoch:0 Step:141/238 Loss:0.012174705974757671
Epoch:0 Step:161/238 Loss:0.010272837243974209
Epoch:0 Step:181/238 Loss:0.4228875935077667
Epoch:0 Step:201/238 Loss:0.49552032351493835
Epoch:0 Step:221/238 Loss:0.03526245057582855
Epoch:1 Step:1/238 Loss:0.4553293287754059
Epoch:1 Step:21/238 Loss:0.043228186666965485
Epoch:1 Step:41/238 Loss:0.5003255605697632
Epoch:1 Step:61/238 Loss:0.4311849772930145
Epoch:1 Step:81/238 Loss:0.014116243459284306
Epoch:1 Step:101/238 Loss:0.05658484250307083
Epoch:1 Step:121/238 Loss:0.024988247081637383
Epoch:1 Step:141/238 Loss:0.03547721728682518
Epoch:1 Step:161/238 Loss:0.009592103771865368
Epoch:1 Step:181/238 Loss:0.16934093832969666

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch:0 Step:1/238 Loss:0.5478124618530273
Epoch:0 Step:21/238 Loss:0.08670975267887115
Epoch:0 Step:41/238 Loss:0.06301571428775787
Epoch:0 Step:61/238 Loss:0.04263300448656082
Epoch:0 Step:81/238 Loss:0.513891339302063
Epoch:0 Step:101/238 Loss:0.02086064964532852
Epoch:0 Step:121/238 Loss:0.04829645901918411
Epoch:0 Step:141/238 Loss:0.029289541766047478
Epoch:0 Step:161/238 Loss:0.054860636591911316
Epoch:0 Step:181/238 Loss:0.04901476949453354
Epoch:0 Step:201/238 Loss:0.028043227270245552
Epoch:0 Step:221/238 Loss:0.6917788982391357
Epoch:1 Step:1/238 Loss:0.022634487599134445
Epoch:1 Step:21/238 Loss:0.0033662302885204554
Epoch:1 Step:41/238 Loss:0.046737924218177795
Epoch:1 Step:61/238 Loss:0.00989456009119749
Epoch:1 Step:81/238 Loss:0.006972828414291143
Epoch:1 Step:101/238 Loss:0.056075289845466614
Epoch:1 Step:121/238 Loss:0.005457428749650717
Epoch:1 Step:141/238 Loss:0.045359399169683456
Epoch:1 Step:161/238 Loss:0.06723704189062119
Epoch:1 Step:181/238 Loss:0.04320675134

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:0 Step:1/238 Loss:0.9107115864753723
Epoch:0 Step:21/238 Loss:0.7162814140319824
Epoch:0 Step:41/238 Loss:0.1513647735118866
Epoch:0 Step:61/238 Loss:0.4388476610183716
Epoch:0 Step:81/238 Loss:0.360135555267334
Epoch:0 Step:101/238 Loss:0.07532905787229538
Epoch:0 Step:121/238 Loss:0.02814149111509323
Epoch:0 Step:141/238 Loss:0.20087702572345734
Epoch:0 Step:161/238 Loss:0.4415345788002014
Epoch:0 Step:181/238 Loss:0.06737054139375687
Epoch:0 Step:201/238 Loss:0.09842225164175034
Epoch:0 Step:221/238 Loss:0.23344916105270386
Epoch:1 Step:1/238 Loss:0.14144939184188843
Epoch:1 Step:21/238 Loss:0.042725369334220886
Epoch:1 Step:41/238 Loss:0.1011171042919159
Epoch:1 Step:61/238 Loss:0.027470458298921585
Epoch:1 Step:81/238 Loss:0.030651791021227837
Epoch:1 Step:101/238 Loss:0.18801680207252502
Epoch:1 Step:121/238 Loss:0.029772497713565826
Epoch:1 Step:141/238 Loss:0.03129936754703522
Epoch:1 Step:161/238 Loss:0.011643197387456894
Epoch:1 Step:181/238 Loss:0.031049925833940506
Ep

AttributeError: 'BertForSequenceClassification' object has no attribute 'save'

In [24]:
for name,model in models.items():
    path = f"./saved_models/{name}"
    os.makedirs(path, exist_ok=True)

    if hasattr(model, "save_pretrained"):  # Hugging Face model
        model.save_pretrained(path)
    else:  # Custom torch.nn model
        torch.save(model.state_dict(), os.path.join(path, "model.pt"))