In [1]:
!pip install -q bitsandbytes peft

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [2]:
import os
import shutil
from datasets import load_from_disk, Dataset

# Source data path (read-only)
source_path = "/kaggle/input/cyberdata/nist_cybersecurity_dataset"

# Destination path (writable)
working_dir = "/kaggle/working/nist_data"
os.makedirs(working_dir, exist_ok=True)

# Load the dataset
print("Loading dataset...")
dataset = load_from_disk(source_path)
print(f"Dataset loaded with {len(dataset)} documents")

# Examine the first document
print("\nDocument metadata:")
print(dataset[0]['metadata'])

print("\nSample text:")
print(dataset[0]['text'][:100])

Loading dataset...
Dataset loaded with 6 documents

Document metadata:
{'filename': 'NIST.SP.800-63-3.pdf', 'length': 147851, 'source': 'github', 'url': 'https://raw.githubusercontent.com/fractional-ciso/NIST-Cybersecurity-Documents/master/NIST.SP.800-63-3.pdf'}

Sample text:
NIST Special Publication 800-63-3  Digital Identity Guidelines  Paul A. Grassi Michael E. Garcia Jam


In [None]:
# Basic preprocessing function
def preprocess_text(text, max_length=100000):
    """Basic preprocessing of text data"""
    # Remove excessive whitespace
    text = ' '.join(text.split())

    # Truncate very long documents if needed
    if len(text) > max_length:
        text = text[:max_length]

    return text

In [None]:
# Create a new processed dataset in memory
print("Preprocessing documents...")
processed_texts = []
metadata_list = []

for i, example in enumerate(dataset):
    # Process text
    processed_text = preprocess_text(example['text'])
    processed_texts.append(processed_text)
    metadata_list.append(example['metadata'])

    # Print progress for every few documents
    if (i+1) % 2 == 0 or i == len(dataset)-1:
        print(f"Processed {i+1}/{len(dataset)} documents")

# Create a new dataset with processed data
processed_dataset = Dataset.from_dict({
    'processed_text': processed_texts,
    'metadata': metadata_list
})

# Save to writable directory
processed_dataset.save_to_disk(os.path.join(working_dir, "processed_dataset"))
print(f"Processed dataset saved to {os.path.join(working_dir, 'processed_dataset')}")

# Display sample
print("\nSample processed text:")
print(processed_dataset[0]['processed_text'][:500])

Preprocessing documents...
Processed 2/6 documents
Processed 4/6 documents
Processed 6/6 documents


Saving the dataset (0/1 shards):   0%|          | 0/6 [00:00<?, ? examples/s]

Processed dataset saved to /kaggle/working/nist_data/processed_dataset

Sample processed text:
NIST Special Publication 800-63-3 Digital Identity Guidelines Paul A. Grassi Michael E. Garcia James L. Fenton This publication is available free of charge from: https://doi.org/10.6028/NIST.SP.800-63-3 NIST Special Publication 800-63-3 Digital Identity Guidelines Paul A. Grassi Michael E. Garcia Applied Cybersecurity Division Information Technology Laboratory James L. Fenton Altmode Networks Los Altos, Calif. This publication is available free of charge from: https://doi.org/10.6028/NIST.SP.800


In [None]:
from transformers import (
    AutoTokenizer,
    pipeline,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:
# Define working directory
working_dir = "/kaggle/working/nist_data"
os.makedirs(working_dir, exist_ok=True)

# Load the processed dataset
processed_dataset_path = os.path.join(working_dir, "processed_dataset")
processed_dataset = load_from_disk(processed_dataset_path)
print(f"Loaded processed dataset with {len(processed_dataset)} documents")

Loaded processed dataset with 6 documents


In [7]:
model_id = "Qwen/Qwen2-1.5B"
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Padding on the right side

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
# Check model's context window
print(f"Model max context length: {tokenizer.model_max_length} tokens")

# Function to format data for instruction fine-tuning
def create_instruction_format(example):
    """Create instruction-response pairs from the document"""
    filename = example['metadata']['filename']
    text = example['processed_text']

    # Creating a simple instruction format
    instruction = f"Summarize the key points from this NIST cybersecurity document: {filename}"
    response = text[:2000]  # Using first part as a summary for this example

    # Format for instruction tuning
    formatted_text = f"<|user|>\n{instruction}\n<|assistant|>\n{response}"
    return {"formatted_text": formatted_text}

# Apply formatting
print("Creating instruction-response pairs...")
instruction_texts = []
for example in processed_dataset:
    formatted = create_instruction_format(example)
    instruction_texts.append(formatted["formatted_text"])

# Create a new dataset with formatted data
instruction_dataset = Dataset.from_dict({
    'formatted_text': instruction_texts
})


Model max context length: 32768 tokens
Creating instruction-response pairs...


In [9]:
# Display sample
print("\nSample instruction-response format:")
print(instruction_dataset[0]['formatted_text'])


Sample instruction-response format:
<|user|>
Summarize the key points from this NIST cybersecurity document: NIST.SP.800-63-3.pdf
<|assistant|>
NIST Special Publication 800-63-3 Digital Identity Guidelines Paul A. Grassi Michael E. Garcia James L. Fenton This publication is available free of charge from: https://doi.org/10.6028/NIST.SP.800-63-3 NIST Special Publication 800-63-3 Digital Identity Guidelines Paul A. Grassi Michael E. Garcia Applied Cybersecurity Division Information Technology Laboratory James L. Fenton Altmode Networks Los Altos, Calif. This publication is available free of charge from: https://doi.org/10.6028/NIST.SP.800-63-3 June 2017 INCLUDES UPDATES AS OF 12-01-2017; PAGE X U.S. Department of Commerce Wilbur L. Ross, Jr., Secretary National Institute of Standards and Technology Kent Rochford, Acting NIST Director and Under Secretary of Commerce for Standards and Technology Authority This publication has been developed by NIST in accordance with its statutory respons

In [10]:
# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True
)

# Prepare the model for training
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [11]:
# Set up LoRA (Low-Rank Adaptation) for efficient fine-tuning
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

In [None]:
# Tokenize function
def tokenize_batch(batch_texts):
    """Tokenize a batch of texts"""
    return tokenizer(
        batch_texts,
        truncation=True,
        padding="max_length",
        max_length=512  # Adjust based on your model and GPU capabilities
    )

# Tokenize in batches to avoid memory issues
print("Tokenizing the dataset...")
batch_size = 2  # Small batch size due to potentially large texts
all_input_ids = []
all_attention_masks = []

for i in range(0, len(instruction_dataset), batch_size):
    batch_texts = instruction_dataset[i:i+batch_size]['formatted_text']
    tokenized = tokenize_batch(batch_texts)

    all_input_ids.extend(tokenized['input_ids'])
    all_attention_masks.extend(tokenized['attention_mask'])

    if (i+batch_size) % 4 == 0 or i+batch_size >= len(instruction_dataset):
        print(f"Tokenized {min(i+batch_size, len(instruction_dataset))}/{len(instruction_dataset)} examples")

# Create tokenized dataset
tokenized_dataset = Dataset.from_dict({
    'input_ids': all_input_ids,
    'attention_mask': all_attention_masks
})

# Split into train and validation sets
tokenized_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
print(f"Training set: {len(tokenized_splits['train'])} examples")
print(f"Validation set: {len(tokenized_splits['test'])} examples")

# Save tokenized datasets
tokenized_splits.save_to_disk(os.path.join(working_dir, "tokenized_dataset"))
print(f"Tokenized dataset saved to {os.path.join(working_dir, 'tokenized_dataset')}")

Tokenizing the dataset...
Tokenized 4/6 examples
Tokenized 6/6 examples
Training set: 5 examples
Validation set: 1 examples


Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Tokenized dataset saved to /kaggle/working/nist_data/tokenized_dataset


In [14]:
import transformers

# Create output directory
output_dir = "./qwen-fact-checking"
os.makedirs(output_dir, exist_ok=True)

# Updated TrainingArguments with non-deprecated parameters
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=50,
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="none",
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
)

# Updated Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_splits["train"],
    eval_dataset=tokenized_splits["test"],
    data_collator=transformers.DataCollatorForLanguageModeling(  # Explicit data collator
        tokenizer=tokenizer,
        mlm=False  # For causal LM
    )
)

In [15]:
# Start training
print("Starting training...")
trainer.train()

# Save the fine-tuned model
peft_model_path = os.path.join(output_dir, "final_model")
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)
print(f"Model saved to {peft_model_path}")

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


Model saved to ./qwen-fact-checking/final_model


In [None]:
# Check if the model files exist
peft_model_path = os.path.join(output_dir, "final_model")
print("Model files exist:", os.path.exists(peft_model_path))
print("Contents:", os.listdir(peft_model_path))

Model files exist: True
Contents: ['vocab.json', 'adapter_config.json', 'adapter_model.safetensors', 'tokenizer.json', 'added_tokens.json', 'README.md', 'merges.txt', 'special_tokens_map.json', 'tokenizer_config.json']


In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load fine-tuned model - use the same path where you saved it
finetuned_model_path = "./qwen-fact-checking/final_model"  # or peft_model_path from earlier

# First load the model and tokenizer separately
model = AutoModelForCausalLM.from_pretrained(
    finetuned_model_path,
    device_map="auto",
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)

# Then create pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16
)

Device set to use cuda:0


In [None]:
# Test with a few cybersecurity questions
test_questions = [
    "What are the main components of the NIST Cybersecurity Framework?",
    "How should organizations respond to cybersecurity incidents according to NIST?",
    "What is the principle of least privilege in cybersecurity?"
]

for question in test_questions:
    prompt = f"<|user|>\n{question}\n<|assistant|>\n"
    print(f"\nQuestion: {question}")

    # Generate response
    response = pipe(
        prompt,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )[0]['generated_text']

    # Extract just the assistant's response
    assistant_response = response.split("<|assistant|>\n")[1]
    print(f"Response: {assistant_response}")


Question: What are the main components of the NIST Cybersecurity Framework?
Response: The NIST Cybersecurity Framework (NIST Cyber Framework) is a comprehensive framework that aims to provide a common language for cybersecurity professionals and organizations to follow when implementing cybersecurity measures. The framework consists of several main components, including:

1. The NIST Cybersecurity Framework (NIST CSF): This is the foundational component of the framework and provides a set of principles and guidelines that organizations can use to assess and improve their cybersecurity posture. The NIST CSF includes 84 principles that cover a wide range of cybersecurity areas, such as protecting data, protecting people, and protecting networks.
2. NIST Cybersecurity Framework Implementation Guide: This guide provides detailed guidance on how to implement the principles of the NIST CSF. It includes step-by-step instructions on how to assess an organization's cybersecurity posture, ident

## Improvement

In [None]:
def extract_assistant_response(response):
    """Extract only the assistant's response, stopping at any new user message"""
    # Split to get everything after assistant tag
    parts = response.split("<|assistant|>\n")
    if len(parts) < 2:
        return response  # Fallback if format is unexpected

    assistant_text = parts[1]

    # Stop at the next user tag if it exists
    if "<|user|>" in assistant_text:
        assistant_text = assistant_text.split("<|user|>")[0].strip()

    return assistant_text

In [None]:
# Test with a few cybersecurity questions with improved prompting
test_questions = [
    "What are the main components of the NIST Cybersecurity Framework?",
    "How should organizations respond to cybersecurity incidents according to NIST?",
    "What is the principle of least privilege in cybersecurity?"
]

for question in test_questions:
    # Improved prompt with instructions for formatting
    prompt = f"<|user|>\nAnswer the following cybersecurity question clearly and concisely. Use numbered lists where appropriate and be direct.\n\nQuestion: {question}\n<|assistant|>\n"
    print(f"\nQuestion: {question}")

    # Generate response with repetition penalty
    response = pipe(
        prompt,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.2  # Add repetition penalty to reduce repeating content
    )[0]['generated_text']

    # Extract just the assistant's response
    assistant_response = extract_assistant_response(response)
    print(f"Response: {assistant_response}")


Question: What are the main components of the NIST Cybersecurity Framework?
Response: The main components of the National Institute of Standards and Technology's (NIST) Cybersecurity Framework include:

1. Policies, which set out the overall direction for a security program.
2. Processes, which describe how to implement policies in practice.
3. Controls, which identify specific actions that should be taken at each level of risk management.
4. Information resources, such as information assets, vulnerabilities, and tools used during the assessment process.
5. Evaluation metrics, including cost-benefit analysis, effectiveness measures, and compliance reporting requirements.

Question: How should organizations respond to cybersecurity incidents according to NIST?
Response: 1. Incident response planning is critical for effectively responding to a cyber incident.
2. Organizations must establish an incident response plan that includes roles, responsibilities, communication protocols, contain

In [None]:
def clean_response(response):
    """Clean up the model response by:
    1. Removing any trailing incomplete sentences
    2. Removing duplicate content
    3. Trimming whitespace
    4. Removing any remaining special tokens or tags
    """
    # Remove special tokens if they exist
    for token in ["<|user|>", "<|assistant|>", "<|end|>"]:
        response = response.replace(token, "")

    # Remove duplicate sentences (simple approach)
    sentences = [s.strip() for s in response.split(".") if s.strip()]
    unique_sentences = []
    seen_sentences = set()

    for sentence in sentences:
        if sentence not in seen_sentences:
            unique_sentences.append(sentence)
            seen_sentences.add(sentence)

    cleaned = ". ".join(unique_sentences)
    if cleaned and not cleaned.endswith("."):
        cleaned += "."

    return cleaned

In [None]:
def generate_improved_response(pipe, question, max_length=512):
    """Generate a response with improved prompting and cleaning"""
    # Enhanced prompt with clear instructions
    prompt = f"""<|user|>
Answer the following cybersecurity question clearly and concisely.
Focus on providing accurate information from NIST guidelines where applicable.
Structure your response with clear paragraphs or bullet points when listing items.
Do not repeat information unnecessarily.

Question: {question}
<|assistant|>
"""

    # Generate response with better parameters
    response = pipe(
        prompt,
        max_new_tokens=max_length,
        temperature=0.7,  # Balanced between creativity and factuality
        top_p=0.9,
        top_k=50,
        do_sample=True,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3
    )[0]['generated_text']

    # Extract and clean the assistant's response
    assistant_response = extract_assistant_response(response)
    return clean_response(assistant_response)

# Test with improved generation
test_questions = [
    "What are the main components of the NIST Cybersecurity Framework?",
    "How should organizations respond to cybersecurity incidents according to NIST?",
    "What is the principle of least privilege in cybersecurity?",
    "Explain the concept of zero trust in cybersecurity",
    "What are NIST's recommendations for password policies?"
]

for question in test_questions:
    print(f"\nQuestion: {question}")
    response = generate_improved_response(pipe, question)
    print(f"Response: {response}\n")
    print("-" * 80)


Question: What are the main components of the NIST Cybersecurity Framework?
Response: The NIST (National Institute of Standards and Technology) Cybersecurity framework is a set of best practices that organizations can follow to improve their overall security posture against cyber threats. It consists of four key pillars:

1. Governance - This pillar focuses on setting up an effective governance structure within an organization, including defining roles and responsibilities for different stakeholders such as management, IT staff, and employees. 2. Risk Management - In this pillar, it covers how organizations identify risks associated with potential attacks by analyzing threat intelligence data, vulnerability assessments, risk-based prioritization processes, and incident handling procedures. 3. Operations - This component deals with ensuring secure operations through robust access controls, user training and awareness programs, continuous monitoring, patching and updating systems regula

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Response: The concept of Zero Trust is a framework that aims to provide an end-to-end security model for all network interactions, regardless of whether they are internal or external to the organization's networks. It recognizes that traditional security models based on authentication and access control do not adequately address modern threats such as insider attacks, phishing, social engineering, advanced persistent threats (APTs), and more sophisticated malware infections. In the Zero Trust approach, users' identities and devices must be verified before granting them any level of privilege within the network. This means implementing strict policies around user identity management, device authenticity checks, endpoint protection measures, multi-factor authentication mechanisms, intrusion detection systems (IDS) and anti-malware software installations, among others. Zero Trust also emphasizes continuous monitoring by using real-time analytics tools to detect anomalous activity patterns

### Downlaod the Saved Model

In [24]:
from IPython.display import FileLink

# Compress the model directory
!tar -czvf qwen-fact-checking.tar.gz ./qwen-fact-checking/final_model/

# Create download link
FileLink('qwen-fact-checking.tar.gz')

./qwen-fact-checking/final_model/
./qwen-fact-checking/final_model/vocab.json
./qwen-fact-checking/final_model/adapter_config.json
./qwen-fact-checking/final_model/adapter_model.safetensors
./qwen-fact-checking/final_model/tokenizer.json
./qwen-fact-checking/final_model/added_tokens.json
./qwen-fact-checking/final_model/README.md
./qwen-fact-checking/final_model/merges.txt
./qwen-fact-checking/final_model/special_tokens_map.json
./qwen-fact-checking/final_model/tokenizer_config.json
