In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, PeftModel
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import pandas as pd
import os




### 1. Data ETL 

In [2]:
raw_data_base_dir = "./raw_data"

# Load dataset for self-supervised learning 
cve_dataset = pd.read_csv(f"{raw_data_base_dir}/cve_dataset.csv")
cwe_dataset = pd.read_csv(f"{raw_data_base_dir}/cwe_dataset.csv")

# Exlude NA lines and drop cwe column from cve dataset
cve_dataset.dropna(subset=['description'], inplace=True)
cve_dataset.drop(columns=['cwe'], inplace=True)

# Exlude NA lines from cwe dataset
cwe_dataset.dropna(subset=['title','description'], inplace=True)
cwe_dataset['metadescription'] = cwe_dataset['cwe_id'] + " - " + cwe_dataset['title'] + " : " + cwe_dataset['description']

# Load dataset for supervised learning
cwe2cve_dataset = pd.read_csv(f"{raw_data_base_dir}/cwe2cve_dataset.csv")

### 2. Split Dataset for Fine-tuning

In [3]:
dataset_base_dir = "./datasets"

# Split CVE dataset into train and test
cve_train_data, cve_test_data = train_test_split(
    cve_dataset['description'], 
    test_size=0.2, 
    random_state=42
)

# Split CWE dataset into train and test
cwe_train_data, cwe_test_data = train_test_split(
    cwe_dataset['metadescription'], 
    test_size=0.2, 
    random_state=42
)

# Combine CVE and CWE training datasets into a single train dataset
train_dataset = pd.concat([
    pd.DataFrame({'text': cve_train_data}),
    pd.DataFrame({'text': cwe_train_data})
], ignore_index=True)

# Combine CVE and CWE test datasets into a single test dataset
test_dataset = pd.concat([
    pd.DataFrame({'text': cve_test_data}),
    pd.DataFrame({'text': cwe_test_data})
], ignore_index=True)

# Remove rows with missing values (NA) from train and test datasets
train_dataset = train_dataset.dropna()
test_dataset = test_dataset.dropna()

# Shuffle both datasets to ensure randomness
train_dataset = train_dataset.sample(frac=1, random_state=42).reset_index(drop=True)
test_dataset = test_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the train and test datasets as .csv files
train_dataset.to_csv(f"./{dataset_base_dir}/train.csv", index=False)
test_dataset.to_csv(f"./{dataset_base_dir}/test.csv", index=False)

# Save only the 'text' column as .txt files for model consumption
train_dataset['text'].to_csv(f"./{dataset_base_dir}/train.txt", index=False, header=False)
test_dataset['text'].to_csv(f"./{dataset_base_dir}/test.txt", index=False, header=False)


In [4]:
# Load the RoBERTa tokenizer and base model for Masked Language Modeling (MLM)
model_name = "roberta-base" 
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Tokenizer for tokenizing input text
base_model = AutoModelForMaskedLM.from_pretrained(model_name)  # RoBERTa model for MLM

print([name for name, _ in base_model.named_modules()])

['', 'roberta', 'roberta.embeddings', 'roberta.embeddings.word_embeddings', 'roberta.embeddings.position_embeddings', 'roberta.embeddings.token_type_embeddings', 'roberta.embeddings.LayerNorm', 'roberta.embeddings.dropout', 'roberta.encoder', 'roberta.encoder.layer', 'roberta.encoder.layer.0', 'roberta.encoder.layer.0.attention', 'roberta.encoder.layer.0.attention.self', 'roberta.encoder.layer.0.attention.self.query', 'roberta.encoder.layer.0.attention.self.key', 'roberta.encoder.layer.0.attention.self.value', 'roberta.encoder.layer.0.attention.self.dropout', 'roberta.encoder.layer.0.attention.output', 'roberta.encoder.layer.0.attention.output.dense', 'roberta.encoder.layer.0.attention.output.LayerNorm', 'roberta.encoder.layer.0.attention.output.dropout', 'roberta.encoder.layer.0.intermediate', 'roberta.encoder.layer.0.intermediate.dense', 'roberta.encoder.layer.0.intermediate.intermediate_act_fn', 'roberta.encoder.layer.0.output', 'roberta.encoder.layer.0.output.dense', 'roberta.encod

### 3. LoRa Fine-Tunning 

In [5]:
# Set a version number for the model to manage multiple versions effectively
MODEL_VERSION = 2

# Create a directory for storing the model outputs specific to the current version
os.makedirs(f"./models/model_{MODEL_VERSION}", exist_ok=True)

# Load the RoBERTa tokenizer and base model for Masked Language Modeling (MLM)
model_name = "roberta-base" 
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Tokenizer for tokenizing input text
base_model = AutoModelForMaskedLM.from_pretrained(model_name)  # RoBERTa model for MLM

print([name for name, _ in base_model.named_modules()])

# Configure LoRA (Low-Rank Adaptation) for parameter-efficient fine-tuning
lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # Specifies that the task is Masked Language Modeling
    r=8,                    # Rank of the low-rank matrices used for adaptation
    lora_alpha=16,          # Scaling factor for the LoRA update
    lora_dropout=0.1,       # Dropout probability to improve generalization
    target_modules=["query", "value"], # Modules in transformer layers to be adapted
)

# Apply LoRA configuration to the base model, creating a parameter-efficient fine-tuned model
model = get_peft_model(base_model, lora_config)

# Load the dataset from text files containing training and testing data
dataset = load_dataset("text", data_files={"train": "./datasets/train.txt", "test": "./datasets/test.txt"})

# Define a preprocessing function to tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Tokenize the dataset using the preprocessing function
# Batched processing speeds up the tokenization, and the original "text" column is removed
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["text"])

# Data collator for Masked Language Modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,      # Tokenizer is used to dynamically mask tokens in batches
    mlm=True,                 # Indicates that MLM is enabled
    mlm_probability=0.15,     # Probability of masking tokens in input sequences
)

# Define training arguments for the Trainer API
training_args = TrainingArguments(
    output_dir=f"./models/model_{MODEL_VERSION}/results_lora",  # Directory to save results
    evaluation_strategy="epoch",          # Evaluate the model at the end of each epoch
    learning_rate=5e-4,                   # Initial learning rate for optimization
    per_device_train_batch_size=16,       # Batch size for training
    per_device_eval_batch_size=16,        # Batch size for evaluation
    num_train_epochs=3,                   # Number of epochs to train the model
    weight_decay=0.01,                    # Regularization parameter to prevent overfitting
    save_total_limit=2,                   # Keep only the latest two checkpoints to save disk space
    logging_steps=100,                    # Log metrics every 100 steps
    save_steps=500,                       # Save the model checkpoint every 500 steps
    report_to="none",                     # Disable external reporting (e.g., TensorBoard or WandB)
    fp16=True,                            # Use mixed precision training for faster computation on GPUs
)

# Create a Trainer instance to manage the training loop
trainer = Trainer(
    model=model,                          # The fine-tunable model with LoRA applied
    args=training_args,                   # Training arguments defined above
    train_dataset=tokenized_datasets["train"],  # Tokenized training dataset
    eval_dataset=tokenized_datasets["test"],    # Tokenized evaluation dataset
    tokenizer=tokenizer,                  # Tokenizer for handling text input
    data_collator=data_collator,          # Data collator for dynamically masking tokens
)

# Start the fine-tuning process
trainer.train()

# Save the fine-tuned model and tokenizer to the specified directory
model.save_pretrained(f"./models/model_{MODEL_VERSION}/fine_tuned_lora_mlm")
tokenizer.save_pretrained(f"./models/model_{MODEL_VERSION}/fine_tuned_lora_mlm")


['', 'roberta', 'roberta.embeddings', 'roberta.embeddings.word_embeddings', 'roberta.embeddings.position_embeddings', 'roberta.embeddings.token_type_embeddings', 'roberta.embeddings.LayerNorm', 'roberta.embeddings.dropout', 'roberta.encoder', 'roberta.encoder.layer', 'roberta.encoder.layer.0', 'roberta.encoder.layer.0.attention', 'roberta.encoder.layer.0.attention.self', 'roberta.encoder.layer.0.attention.self.query', 'roberta.encoder.layer.0.attention.self.key', 'roberta.encoder.layer.0.attention.self.value', 'roberta.encoder.layer.0.attention.self.dropout', 'roberta.encoder.layer.0.attention.output', 'roberta.encoder.layer.0.attention.output.dense', 'roberta.encoder.layer.0.attention.output.LayerNorm', 'roberta.encoder.layer.0.attention.output.dropout', 'roberta.encoder.layer.0.intermediate', 'roberta.encoder.layer.0.intermediate.dense', 'roberta.encoder.layer.0.intermediate.intermediate_act_fn', 'roberta.encoder.layer.0.output', 'roberta.encoder.layer.0.output.dense', 'roberta.encod

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/200200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50051 [00:00<?, ? examples/s]

  trainer = Trainer(


  0%|          | 0/37539 [00:00<?, ?it/s]

{'loss': 3.1368, 'grad_norm': 2.732752799987793, 'learning_rate': 0.0004986680518926982, 'epoch': 0.01}
{'loss': 3.0099, 'grad_norm': 3.0292582511901855, 'learning_rate': 0.0004973361037853965, 'epoch': 0.02}
{'loss': 2.913, 'grad_norm': 2.477691411972046, 'learning_rate': 0.0004960041556780948, 'epoch': 0.02}
{'loss': 2.8346, 'grad_norm': 2.91054105758667, 'learning_rate': 0.0004946722075707931, 'epoch': 0.03}
{'loss': 2.8479, 'grad_norm': 2.6438703536987305, 'learning_rate': 0.0004933535789445644, 'epoch': 0.04}
{'loss': 2.824, 'grad_norm': 3.6584205627441406, 'learning_rate': 0.0004920216308372626, 'epoch': 0.05}
{'loss': 2.7525, 'grad_norm': 3.443819999694824, 'learning_rate': 0.0004906896827299609, 'epoch': 0.06}
{'loss': 2.7015, 'grad_norm': 2.6627304553985596, 'learning_rate': 0.0004893577346226591, 'epoch': 0.06}
{'loss': 2.7652, 'grad_norm': 3.3510396480560303, 'learning_rate': 0.0004880257865153574, 'epoch': 0.07}
{'loss': 2.7167, 'grad_norm': 4.011176586151123, 'learning_rat

  0%|          | 0/3129 [00:00<?, ?it/s]

{'eval_loss': 2.0636942386627197, 'eval_runtime': 186.7836, 'eval_samples_per_second': 267.962, 'eval_steps_per_second': 16.752, 'epoch': 1.0}
{'loss': 2.2443, 'grad_norm': 4.779221534729004, 'learning_rate': 0.0003322544553664189, 'epoch': 1.01}
{'loss': 2.3147, 'grad_norm': 5.842565536499023, 'learning_rate': 0.0003309225072591172, 'epoch': 1.01}
{'loss': 2.2626, 'grad_norm': 5.922913074493408, 'learning_rate': 0.00032959055915181545, 'epoch': 1.02}
{'loss': 2.237, 'grad_norm': 4.585598945617676, 'learning_rate': 0.0003282586110445137, 'epoch': 1.03}
{'loss': 2.3169, 'grad_norm': 6.996572017669678, 'learning_rate': 0.00032692666293721197, 'epoch': 1.04}
{'loss': 2.2411, 'grad_norm': 5.235696315765381, 'learning_rate': 0.00032559471482991023, 'epoch': 1.05}
{'loss': 2.3266, 'grad_norm': 4.901758193969727, 'learning_rate': 0.0003242627667226085, 'epoch': 1.05}
{'loss': 2.2313, 'grad_norm': 5.304924964904785, 'learning_rate': 0.00032293081861530675, 'epoch': 1.06}
{'loss': 2.2944, 'grad

  0%|          | 0/3129 [00:00<?, ?it/s]

{'eval_loss': 1.9804930686950684, 'eval_runtime': 174.8271, 'eval_samples_per_second': 286.289, 'eval_steps_per_second': 17.898, 'epoch': 2.0}
{'loss': 2.1674, 'grad_norm': 4.986512660980225, 'learning_rate': 0.0001658275393590666, 'epoch': 2.01}
{'loss': 2.1813, 'grad_norm': 4.670063018798828, 'learning_rate': 0.00016449559125176482, 'epoch': 2.01}
{'loss': 2.1934, 'grad_norm': 6.169525146484375, 'learning_rate': 0.0001631636431444631, 'epoch': 2.02}
{'loss': 2.1633, 'grad_norm': 5.671971321105957, 'learning_rate': 0.00016183169503716135, 'epoch': 2.03}
{'loss': 2.154, 'grad_norm': 6.124148368835449, 'learning_rate': 0.0001604997469298596, 'epoch': 2.04}
{'loss': 2.1516, 'grad_norm': 6.246192455291748, 'learning_rate': 0.0001591677988225579, 'epoch': 2.05}
{'loss': 2.1772, 'grad_norm': 6.866201400756836, 'learning_rate': 0.00015783585071525613, 'epoch': 2.05}
{'loss': 2.1887, 'grad_norm': 4.121890544891357, 'learning_rate': 0.0001565039026079544, 'epoch': 2.06}
{'loss': 2.2179, 'grad_

  0%|          | 0/3129 [00:00<?, ?it/s]

{'eval_loss': 1.9573867321014404, 'eval_runtime': 183.9104, 'eval_samples_per_second': 272.149, 'eval_steps_per_second': 17.014, 'epoch': 3.0}
{'train_runtime': 5139.9484, 'train_samples_per_second': 116.849, 'train_steps_per_second': 7.303, 'train_loss': 2.278235405685899, 'epoch': 3.0}


('./models/model_2/fine_tuned_lora_mlm\\tokenizer_config.json',
 './models/model_2/fine_tuned_lora_mlm\\special_tokens_map.json',
 './models/model_2/fine_tuned_lora_mlm\\vocab.json',
 './models/model_2/fine_tuned_lora_mlm\\merges.txt',
 './models/model_2/fine_tuned_lora_mlm\\added_tokens.json',
 './models/model_2/fine_tuned_lora_mlm\\tokenizer.json')