In [17]:
# Dataset root; the Train and Test splits are sub-folders directly beneath it.
data_path = './data/VulnExtractData/ffmpeg_test'
train_path = f'{data_path}/Train'
test_path = f'{data_path}/Test'
(train_path, test_path)


('./data/VulnExtractData/ffmpeg_test/Train',
 './data/VulnExtractData/ffmpeg_test/Test')

In [18]:
# Task identifiers; each one is used as a sub-folder name under the Train and Test directories.
task_list = ['AF', 'BF', 'CL']

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


def train_model_svm(X_train,Y_train,X_test,Y_test):
    """Fit a TF-IDF + SVM text classifier and report how it does on the test split.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw text documents; they are vectorized with unigram TF-IDF features.
    Y_train, Y_test : array-like
        Binary class labels, 0 (negative) or 1 (positive).

    Returns
    -------
    dict
        ``accuracy`` (as returned by ``pipeline.score``) plus the
        confusion-matrix counts ``TP``, ``FP``, ``TN`` and ``FN``.

    The test accuracy is also printed, one value per call.
    """
    # NOTE(review): probability=True is documented as making fit() slower, and
    # nothing in this function calls predict_proba - confirm it is still needed.
    pipeline = make_pipeline(TfidfVectorizer(ngram_range=(1, 1)), SVC(probability=True))
    pipeline.fit(X_train, Y_train)

    Y_pred = pipeline.predict(X_test)
    accuracy = pipeline.score(X_test, Y_test)
    print(accuracy)

    # Pin the label order so the matrix is always 2x2, even when only one class
    # shows up in Y_test / Y_pred; without `labels` the matrix would collapse to
    # 1x1 and indexing the positive row/column would raise IndexError.
    # Rows are true labels and columns are predictions, so ravel() yields
    # TN, FP, FN, TP in that order.
    TN, FP, FN, TP = confusion_matrix(Y_test, Y_pred, labels=[0, 1]).ravel()

    return {
        'accuracy': accuracy,
        'TP': int(TP),
        'FP': int(FP),
        'TN': int(TN),
        'FN': int(FN),
    }

In [None]:
import os
import pandas as pd

def _load_csv_folder(folder):
    """Read every file found (recursively) under ``folder`` into one DataFrame."""
    file_paths = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(folder)
        for name in names
    ]
    if not file_paths:
        # pd.concat would raise an opaque "No objects to concatenate" here;
        # keep the same exception type but say which folder was empty/missing.
        raise ValueError(f'No data files found under {folder!r}')
    return pd.concat([pd.read_csv(path) for path in file_paths], ignore_index=True)


def prepare_data(task_id, train_dir=None, test_dir=None):
    """Load one task's train/test CSV files and build binary-classification inputs.

    Parameters
    ----------
    task_id : str
        Name of the task sub-folder to read under both split directories.
    train_dir, test_dir : str, optional
        Split root directories. When omitted, the notebook-level ``train_path``
        and ``test_path`` variables are used.

    Returns
    -------
    X_train, Y_train, X_test, Y_test
        ``X_*`` are the ``Description`` columns (pandas Series); ``Y_*`` are
        NumPy arrays in which label 4 is mapped to 0 and every other label to 1.

    Raises
    ------
    ValueError
        If no files are found under one of the task folders.
    """
    if train_dir is None:
        train_dir = train_path
    if test_dir is None:
        test_dir = test_path

    df_train = _load_csv_folder(os.path.join(train_dir, task_id))
    df_test = _load_csv_folder(os.path.join(test_dir, task_id))

    # Only the free-text description is used as a feature.
    X_train = df_train['Description']
    X_test = df_test['Description']

    # Binary target: original label 4 -> class 0, anything else -> class 1.
    Y_train = (df_train['label'] != 4).astype(int).values
    Y_test = (df_test['label'] != 4).astype(int).values

    return X_train, Y_train, X_test, Y_test

# Train and score one TF-IDF + SVM classifier per task.
for task_id in task_list:
    X_train, Y_train, X_test, Y_test = prepare_data(task_id)
    train_model_svm(X_train, Y_train, X_test, Y_test)





0.8842504743833017
0.9715370018975332
0.9089184060721063


SVM n-gram Model Binary classification

In [57]:
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [107]:
from torch.optim import AdamW
from torch.nn.utils.rnn import pad_sequence
from transformers import get_linear_schedule_with_warmup


class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one ``Description`` row at a time for binary classification.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Description`` column (text) and a ``label`` column.
        Label 4 is mapped to class 0, every other label to class 1.
    tokenizer : callable
        Called as ``tokenizer(text, ...)`` and expected to return a mapping with
        ``input_ids`` and ``attention_mask`` tensors that carry a leading batch
        dimension of size 1.
    max_len : int
        Maximum sequence length; longer texts are truncated.
    """

    def __init__(self,df,tokenizer,max_len = 512):
        self.tokenizer = tokenizer
        self.df = df
        self.max_len = max_len
        self.texts = df['Description']
        self.labels =  df['label'].apply(lambda x:0 if x==4 else 1).values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self,index):
        """Return the tokenized sample at position ``index`` (0-based)."""
        # Positional lookup: `self.labels` is a plain array indexed by position,
        # so the text must be too. A label-based `self.texts[index]` breaks, or
        # silently pairs the wrong text and label, when the frame does not
        # carry a default 0..n-1 index (e.g. after filtering or shuffling).
        text = self.texts.iloc[index]
        labels = self.labels[index]
        # No padding here: sequences keep their own length and are padded per
        # batch by the collate function.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
            padding=False
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),  # Remove the batch dimension
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

def collate_fn(batch):
    """Merge per-sample dicts into one batch, right-padding sequences with zeros.

    ``batch`` is a list of dicts holding ``input_ids``, ``attention_mask`` and
    ``labels`` tensors. The two sequence fields are padded to the longest
    sample in the batch (batch-first layout); the labels are stacked.
    """
    def _padded(key):
        sequences = [sample[key] for sample in batch]
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    return {
        'input_ids': _padded('input_ids'),
        'attention_mask': _padded('attention_mask'),
        'labels': torch.stack([sample['labels'] for sample in batch]),
    }

def train_model(model, train_loader, device, optimizer, scheduler, epochs=3):
    """Run the optimisation loop over ``train_loader`` for ``epochs`` passes.

    The model is moved to ``device`` and called with ``input_ids``,
    ``attention_mask`` and ``labels`` taken from each batch; the loss is read
    from the ``loss`` attribute of its output. Optimizer and scheduler are both
    stepped once per batch. The loss is printed every 20th step.
    """
    model.to(device)

    for epoch in range(epochs):
        model.train()
        for step, batch in enumerate(train_loader):
            ids, mask, targets = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )

            model.zero_grad()
            loss = model(ids, attention_mask=mask, labels=targets).loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            if not step % 20:
                print(f'Epoch:{epoch} Step:{step}/{len(train_loader)} Loss:{loss.item()}')
            



def _evaluate_accuracy(model, data_loader, device):
    """Return the fraction of samples in ``data_loader`` that ``model`` classifies correctly."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():  # inference only - no gradients needed
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions = logits.argmax(dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    # An empty loader has no defined accuracy; report 0.0 instead of dividing by zero.
    return correct / total if total else 0.0


def run_training(model_name,model_class,tokenizer,train_df,val_df,epochs = 3,learning_rate = 5e-5,batch_size = 8):
    """Fine-tune a two-label sequence classifier and return its validation accuracy.

    Parameters
    ----------
    model_name : str
        Checkpoint name or path handed to ``model_class.from_pretrained``.
    model_class : type
        Class exposing ``from_pretrained(model_name, num_labels=...)``.
    tokenizer : callable
        Tokenizer used by ``TextClassificationDataset``.
    train_df, val_df : pandas.DataFrame
        Frames with ``Description`` and ``label`` columns.
    epochs : int
        Number of passes over the training data.
    learning_rate : float
        AdamW learning rate.
    batch_size : int
        Batch size for both loaders.

    Returns
    -------
    float
        Accuracy of the trained model on ``val_df``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the datasets from the frames passed in, not from notebook globals.
    train_dataset = TextClassificationDataset(train_df,tokenizer)
    val_dataset = TextClassificationDataset(val_df,tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

    model = model_class.from_pretrained(model_name, num_labels=2)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Linear decay to zero over the whole run, with no warm-up phase.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    train_model(model, train_loader, device, optimizer, scheduler, epochs)
    accuracy = _evaluate_accuracy(model, val_loader, device)
    return accuracy

    


In [None]:
# Checkpoint, model class and matching tokenizer for the transformer run.
model_name = 'bert-base-uncased'
model_class = AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): df_train / df_test are not assigned at notebook scope in the
# cells shown here (prepare_data keeps them local) - confirm an earlier cell
# defines them before running this on a fresh kernel.
run_training(model_name, model_class, tokenizer, train_df=df_train, val_df=df_test)