In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
!pip install triton



In [3]:
!pip install xformers --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu121

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cu121
Collecting xformers
  Downloading xformers-0.0.29.dev941-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Downloading xformers-0.0.29.dev941-cp310-cp310-manylinux_2_28_x86_64.whl (16.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.7/16.7 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers
  Attempting uninstall: xformers
    Found existing installation: xformers 0.0.26.post1
    Uninstalling xformers-0.0.26.post1:
      Successfully uninstalled xformers-0.0.26.post1
Successfully installed xformers-0.0.29.dev941


In [4]:
import torch
print(torch.cuda.is_available())

True


In [5]:
pip uninstall xformers

Found existing installation: xformers 0.0.29.dev941
Uninstalling xformers-0.0.29.dev941:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/xformers-0.0.29.dev941.dist-info/*
    /usr/local/lib/python3.10/dist-packages/xformers/*
Proceed (Y/n)? y
  Successfully uninstalled xformers-0.0.29.dev941


In [6]:
pip install xformers --pre

Collecting xformers
  Using cached xformers-0.0.29.dev941-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Using cached xformers-0.0.29.dev941-cp310-cp310-manylinux_2_28_x86_64.whl (16.7 MB)
Installing collected packages: xformers
Successfully installed xformers-0.0.29.dev941


In [7]:
pip install xformers --pre



In [8]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.dev941. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0.1, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.11.10 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [12]:
from datasets import load_dataset
# Define the prompt template
alpaca_prompt = """You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided.
Return your answer as a single uppercase letter: A, B, C, or D.

### Question:
{}

### Options:
A. {}
B. {}
C. {}
D. {}

### Correct Answer:
{}"""

# Define the End of Sequence (EOS) token
EOS_TOKEN = "<|endoftext|>"  # Replace with the actual EOS token used in your model
eos_token_id = tokenizer.convert_tokens_to_ids(EOS_TOKEN)
# Example data preparation function for your QCM dataset
def formatting_prompts_func(examples):
    questions = examples["Question"]
    option_a = examples["Option A"]
    option_b = examples["Option B"]
    option_c = examples["Option C"]
    option_d = examples["Option D"]
    correct_answer = examples["GT"]

    texts = []
    for question, a, b, c, d, correct in zip(questions, option_a, option_b, option_c, option_d, correct_answer):
        # Format the text based on your QCM data
        text = alpaca_prompt.format(question, a, b, c, d, correct.strip()) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}

# Loading and processing the dataset
dataset = load_dataset('csv', data_files='/content/cti-mcq (1).tsv', delimiter='\t',encoding='ISO-8859-1') # Specify 'csv' format and provide the correct file path
dataset = dataset.map(formatting_prompts_func, batched=True)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [13]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # This can make training faster for short sequences
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

# Train the model
trainer_stats = trainer.train()

Map (num_proc=2):   0%|          | 0/2500 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,500 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,2.5403
2,2.5306
3,2.7044
4,2.5656
5,2.4637
6,2.5242
7,2.5931
8,2.5006
9,2.6542
10,2.623


In [14]:
# Charger le modèle pour l'inférence
FastLanguageModel.for_inference(model)  # Activer l'inférence rapide
# Préparer une question de votre dataset pour l'inférence
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Which of the following mitigations involves preventing applications from running that haven't been downloaded from legitimate repositories?",  # exemple de question
            "Audit",  # Option A
            "Execution Prevention",  # Option B
            "Operating System Configuration",    # Option C
            "User Account Control",  # Option D
            ""  # output vide pour génération
        )
    ],
    return_tensors="pt"
).to("cuda")

# Utiliser un streamer pour afficher le texte généré en temps réel
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)

# Générer la réponse avec un maximum de 128 nouveaux tokens
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=7
)
# Decode the generated output
generated_text = tokenizer.decode(_[0], skip_special_tokens=False)

# Post-process to extract the answer
answer = generated_text.split("### Correct Answer:")[-1].strip()

# Optionally, truncate excess characters if they exist
if len(answer) > 1:
    answer = answer[0]  # Keep only the first character (A, B, C, or D)

print(f"Generated Answer: {answer}")


<|begin_of_text|>You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided.
Return your answer as a single uppercase letter: A, B, C, or D.

### Question:
Which of the following mitigations involves preventing applications from running that haven't been downloaded from legitimate repositories?

### Options:
A. Audit
B. Execution Prevention
C. Operating System Configuration
D. User Account Control

### Correct Answer:
B<|end_of_text|>
Generated Answer: B


In [16]:
import pandas as pd

# Load the TSV file into a DataFrame
df = pd.read_csv('/content/cti-mcq (1).tsv', delimiter='\t',encoding='ISO-8859-1')

# Check the structure of the DataFrame
print(df.head())

                                          URL  \
0  https://attack.mitre.org/techniques/T1548/   
1  https://attack.mitre.org/techniques/T1548/   
2  https://attack.mitre.org/techniques/T1548/   
3  https://attack.mitre.org/techniques/T1548/   
4  https://attack.mitre.org/techniques/T1548/   

                                            Question  \
0  Which of the following mitigations involves pr...   
1  Which data source is recommended for monitorin...   
2  What does mitigation ID M1028 suggest to preve...   
3  Which process creation is an indicator of pote...   
4  In a Linux environment, what is recommended to...   

                                    Option A  \
0                                      Audit   
1                                    Command   
2      Limiting privileges of cloud accounts   
3           C:\Windows\System32\services.exe   
4  Monitor Windows Registry Key Modification   

                                        Option B  \
0                          

In [17]:
# Initialize an empty list to store the outputs
FastLanguageModel.for_inference(model)
generated_outputs = []

for _, row in df.iterrows():
    # Format the prompt with the question and options from the dataset
    prompt = alpaca_prompt.format(
        row['Question'],
        row['Option A'],
        row['Option B'],
        row['Option C'],
        row['Option D'],
        ""
    )

    # Tokenize the input prompt
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    from transformers import TextStreamer
    text_streamer = TextStreamer(tokenizer)
    # Generate the output
    output = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=7

    )

    # Decode the generated output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer (A, B, C, or D)
    answer = generated_text.split("### Correct Answer:")[-1].strip()

    # Append the generated answer to the list
    generated_outputs.append(answer)

# Add the generated outputs as a new column in the DataFrame
df['Generated Output'] = generated_outputs
output_file = '/content/cti-mcq-with-output.tsv'
df.to_csv(output_file, sep='\t', index=False)

print(f"Updated dataset saved to {output_file}")


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
A
Explanation:

In the context
<|begin_of_text|>You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided.
Return your answer as a single uppercase letter: A, B, C, or D.

### Question:
Which CWE does NOT relate directly to buffer overflow issues in the context of CAPEC-8?

### Options:
A. CWE-118
B. CWE-733
C. CWE-120
D. CWE-680

### Correct Answer:
D<|end_of_text|>
<|begin_of_text|>You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided.
Return your answer as a single uppercase letter: A, B, C, or D.

### Question:
What is one of the primary purposes for an adversary to manipulate registry information in the context of CAPEC-203?

### Options:
A. To elevat

In [20]:
import csv

# Initialize counters for correct predictions and total predictions
correct_predictions = 0
total_predictions = 0

# File path (update if needed)
file_path = "/content/cti-mcq-with-output.tsv"

# Read the TSV file and compare GT with Generated Output
with open(file_path, mode='r',encoding='ISO-8859-1') as file:
    reader = csv.DictReader(file, delimiter='\t')

    for row in reader:
        total_predictions += 1
        if row['GT'].strip() == row['Generated Output'].strip():  # Compare GT and Generated Output
            correct_predictions += 1

# Calculate accuracy
accuracy = (correct_predictions / total_predictions) * 100

# Output the result
print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 30.28%
