# Base Model

A feedforward multilayer ANN for classification task of CVEs - in their vectorization format - into their corresponding CWE.

To do:
- Transform CVE data into vector format
- Train the model!

### Import the necessary libraries

Install libraries

In [15]:
# !pip install Pinecone
# !pip install torch
# !pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m:01[0m:01[0m
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (435 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m435.0/435.0 KB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.23.2
  Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 KB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [38;2;114;156;31

Import libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from pinecone import Pinecone

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split

from transformers import AutoTokenizer, AutoModelForMaskedLM

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

### Import data for supervised learning

Import data in the csv folders

In [5]:
number_of_cwes = 1427

### Train data
# Step 1: Load the CSV file
file_path_train = "train.csv"
data_train = pd.read_csv(file_path_train)

# Step 2: Separate X and Y
X_train = [s[:-1] if s.endswith('.') else s for s in data_train.iloc[:, 1].values]  # Keep only the CVE description
Y_train = data_train.iloc[:, 2:].values  # Latter two columns for targets

# Extract numeric part from 'CWE-' using list comprehension and regex
numeric_cwe_train = np.array([int(re.search(r'CWE-(\d{1,4})', row[0]).group(1)) for row in Y_train])

# Create one-hot encoded matrix
one_hot_encoded_fixed_train = np.zeros((len(numeric_cwe_train), number_of_cwes))  # Initialize with zeros

# Set the position corresponding to the CWE ID to 1
for i, cwe_id in enumerate(numeric_cwe_train):
    one_hot_encoded_fixed_train[i, cwe_id-1] = 1  # Set the position `cwe_id - 1` to 1

### Test data
file_path_test = "test.csv"
data_test = pd.read_csv(file_path_test)

# Step 2: Separate X and Y
X_test = [s[:-1] if s.endswith('.') else s for s in data_test.iloc[:, 1].values] # Keep only the CVE description
Y_test = data_test.iloc[:, 2:].values  # Latter two columns for targets

# Extract numeric part from 'CWE-' using list comprehension and regex
numeric_cwe_test = np.array([int(re.search(r'CWE-(\d{1,4})', row[0]).group(1)) for row in Y_test])

# Create one-hot encoded matrix
one_hot_encoded_fixed_test = np.zeros((len(numeric_cwe_test), number_of_cwes))  # Initialize with zeros

# Set the position corresponding to the CWE ID to 1
for i, cwe_id in enumerate(numeric_cwe_test):
    one_hot_encoded_fixed_test[i, cwe_id-1] = 1  # Set the position `cwe_id - 1` to 1

Loading model for embedding the CVEs description

In [6]:
# Set a version number for the model to manage multiple versions effectively
MODEL_VERSION = 1

# Load the fine-tuned model
model_path = f"/home/guilherme/Documents/MC959/Projeto/models/model_{MODEL_VERSION}/fine_tuned_lora_mlm"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model_embedding = AutoModelForMaskedLM.from_pretrained(model_path)

ValueError: Unrecognized model in /home/guilherme/Documents/MC959/Projeto/models/model_1/fine_tuned_lora_mlm. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, graphormer, grounding-dino, groupvit, hiera, hubert, ibert, idefics, idefics2, idefics3, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, time_series_transformer, timesformer, timm_backbone, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zoedepth

Transform train and test data into their embedding format

In [2]:
def encoder(inputs):
    embeddings = []
    for input in inputs:
        tokens =  tokenizer(input, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = model_embedding.base_model(**tokens)  # Camada base
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy() 
            embeddings.append(embedding)
    return embeddings

# Generate embeddings for a list of CVE descriptions
embeddings_train = encoder(X_train)
embeddings_test = encoder(X_test)

### ANN Definition

Hyperparameters

In [8]:
input_size = 768          # Input vector size (V_CVE_size)
num_classes = number_of_cwes              # Number of output classes (N_CWE)
hidden_sizes = [256, 128, 64]    # Sizes of hidden layers
activation_function = nn.ReLU    # Activation function to be used
batch_size = 32                  # Batch size
learning_rate = 1e-3             # Learning rate
num_epochs = 100                  # Number of training epochs
dropout_prob = 0.5             # Dropout probability for regularization (not used!)

Neural network model

In [9]:
# Build the neural network model dynamically
layers = []

# Input layer
layers.append(nn.Linear(input_size, hidden_sizes[0]))
layers.append(activation_function())

# Hidden layers
for i in range(len(hidden_sizes) - 1):
    layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
    layers.append(activation_function())
    layers.append(nn.Dropout(dropout_prob))

# Output layer
layers.append(nn.Linear(hidden_sizes[-1], num_classes))
layers.append(nn.Softmax(dim=1))

# Create the sequential model
model = nn.Sequential(*layers)

In [10]:
print("Neural Network Model:")
print(model)

Neural Network Model:
Sequential(
  (0): Linear(in_features=768, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=128, bias=True)
  (3): ReLU()
  (4): Dropout(p=0.3, inplace=False)
  (5): Linear(in_features=128, out_features=64, bias=True)
  (6): ReLU()
  (7): Dropout(p=0.3, inplace=False)
  (8): Linear(in_features=64, out_features=1365, bias=True)
  (9): Softmax(dim=1)
)


Loss function

In [31]:
# Loss function definiton

loss_fn = nn.CrossEntropyLoss()

Optimization method

In [33]:
# Defining the optimization method
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### ANN training

Prepare data for training

In [None]:
num_samples = 10

X_train = torch.tensor(embeddings_train, dtype=torch.float32)
y_train = torch.tensor(one_hot_encoded_fixed_train, dtype=torch.long)  # CrossEntropyLoss expects labels of type Long

# Create a TensorDataset from X_train and y_train
train_dataset = TensorDataset(X_train, y_train)

X_test = torch.tensor(embeddings_test, dtype=torch.float32)
y_test = torch.tensor(one_hot_encoded_fixed_test, dtype=torch.long)  # CrossEntropyLoss expects labels of type Long

# Create a TensorDataset from X_train and y_train
test_dataset = TensorDataset(X_test, y_test)

# # Ensure no missing values
# if np.isnan(X_array).any():
#     print("Data contains NaN values. Removing rows with NaN values...")
#     # Create a mask for rows without NaNs
#     mask = ~np.isnan(X_array).any(axis=1)
    
#     # Filter X_array and y_array using the mask
#     X_array = X_array[mask]
#     y_array = y_array[mask]

# # Convert data to PyTorch tensors
# X_tensor = torch.tensor(X_array, dtype=torch.float32)
# y_tensor = torch.tensor(y_array, dtype=torch.long)  # CrossEntropyLoss expects labels of type Long

# # Define split proportions
# train_size = int(0.7 * len(full_dataset))
# test_size = int(0.3 * len(full_dataset))

# # Split the dataset
# train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Store data for training and the accuracy of the model

In [None]:
# Lists to store loss and accuracy for plotting
train_losses = []
train_accuracies = []
val_accuracies = []

Choose the GPU, if avaliable

In [14]:
# Check if CUDA is available and use the GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the GPU
model = model.to(device)

Training the model

In [None]:
for epoch in range(num_epochs):
    # Training phase
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch_X, batch_y in train_loader:
        # # Move the batch to the GPU
        # batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        # Forward pass
        outputs = model(batch_X)
        loss = loss_fn(outputs, batch_y)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = 100 * correct / total
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # # Validation phase (removed because of time issues)
    # model.eval()
    # val_loss = 0
    # val_correct = 0
    # val_total = 0
    # with torch.no_grad():
    #     for val_X, val_y in val_loader:
    #         outputs = model(val_X)
    #         loss = loss_fn(outputs, val_y)
    #         val_loss += loss.item()
    #         _, predicted = torch.max(outputs.data, 1)
    #         val_total += val_y.size(0)
    #         val_correct += (predicted == val_y).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * val_correct / val_total
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)

    # Save the model after each epoch
    torch.save(model.state_dict(), "base_model.pth")

    print(f'Epoch [{epoch+1}/{num_epochs}], '
          f'Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, '
          f'Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%')

### ANN analysis

Data from training process

In [None]:
# Visualization of Loss and Accuracy

# Plot Loss over epochs
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.title('Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Plot Accuracy over epochs
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Train Accuracy')
plt.title('Accuracy over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.legend()

plt.show()

Model final evaluation

In [None]:
# Evaluate on Test Data and Compute Performance Metrics

model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for test_X, test_y in test_loader:
        outputs = model(test_X)
        _, predicted = torch.max(outputs.data, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(test_y.cpu().numpy())

# Compute metrics
test_accuracy = accuracy_score(all_labels, all_preds) * 100
precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

print(f'Test Accuracy: {test_accuracy:.2f}%')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Save the base model

In [34]:
# Save the trained model
torch.save(model.state_dict(), "base_model.pth")
print("Model saved as 'base_model.pth'")

Model saved as 'base_model.pth'
