In [1]:
## Install required libraries
!pip install -q transformers datasets accelerate bitsandbytes peft trl torch
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q sentencepiece
!pip install -q llama-cpp-python
!pip install -q ctranslate2


In [None]:
import os
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer


In [None]:
# Hugging Face repo id of the base checkpoint to fine-tune. Replace it with the
# model you actually want to train
# (e.g. SicariusSicariiStuff/Phi-3.5-mini-instruct_Uncensored).
MODEL_NAME = "unsloth/gemma-2-9b"

# Models to be trained:
#   1. google/gemma-2-9b
#   2. google/gemma-2-2b
#   3. google/gemma-2-2b-it
#   4. google/codegemma-2b
#   5. google/codegemma-7b-it

# For gated or private repos, pass your Hugging Face access token to
# from_pretrained (the `use_auth_token` argument this notebook was written
# against). Read it from the environment, e.g. os.environ["HF_TOKEN"], instead
# of pasting the secret into the notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fall back to EOS-as-padding only when the tokenizer ships without a pad token.
# The causal-LM data collator used later in this notebook excludes pad-token
# positions from the loss; if pad == EOS, the EOS appended to every training
# example is excluded too and the model is never trained to stop generating.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## 4-bit quantization settings (bitsandbytes): NF4 weights, float16 compute,
## double quantization switched off
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

## Load the base checkpoint with the quantization settings above and let
## device_map="auto" decide where the weights are placed.
## (Add your access token argument here for gated or private repos.)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

## Get the quantized model ready for k-bit training
model = prepare_model_for_kbit_training(model)

## Linear layers that receive LoRA adapters: attention projections first,
## then the MLP projections
_attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
_mlp_projections = ["gate_proj", "up_proj", "down_proj"]

## LoRA hyperparameters
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=_attention_projections + _mlp_projections,
)

## Wrap the model with the LoRA adapters described by peft_config
model = get_peft_model(model, peft_config)


In [None]:
# Alpaca-style prompt template. It has three positional `{}` slots that
# formatting_prompts_func fills, in order, with the instruction, the input and
# the response of each example.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token string taken from the tokenizer; formatting_prompts_func
# appends it to every formatted example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of rows as Alpaca-style training texts.

    Args:
        examples: Mapping of column name -> list of values (the batch shape
            used by ``Dataset.map(..., batched=True)``). Accepted column names:
            instruction: "instruction", "Instruction" or "system";
            input: "input", "Input" or "command";
            output: "output", "Output", "response" or "Response".

    Returns:
        ``{"text": [...]}`` with one formatted string per row, each ending in
        ``EOS_TOKEN``. If no instruction column or no output column is present
        the list is empty. A missing input column is treated as an empty input
        for every row, and ``None`` cells are rendered as empty strings.
    """
    def pick_column(*names):
        # First column present in the batch wins; None means "no such column".
        for name in names:
            column = examples.get(name)
            if column is not None:
                return column
        return None

    def as_text(value):
        # Missing values arrive as None; format them as "" instead of "None".
        return "" if value is None else value

    instructions = pick_column("instruction", "Instruction", "system")
    outputs = pick_column("output", "Output", "response", "Response")
    if instructions is None or outputs is None:
        # Nothing to train on without both an instruction and a response.
        return {"text": []}

    inputs = pick_column("input", "Input", "command")
    if inputs is None:
        # The input/context slot is optional: keep the rows, leave it blank.
        inputs = [""] * len(instructions)

    texts = [
        alpaca_prompt.format(as_text(instruction), as_text(context), as_text(output)) + EOS_TOKEN
        for instruction, context, output in zip(instructions, inputs, outputs)
    ]
    # Dataset.map expects a dict of column name -> list of values.
    return {"text": texts}

def standardize_columns(dataset):
    """Rename known column-name variants to ``instruction`` / ``input`` / ``output``.

    Args:
        dataset: Object exposing ``column_names`` and ``rename_columns(mapping)``
            (a Hugging Face ``Dataset`` in this notebook).

    Returns:
        The result of ``dataset.rename_columns`` with every recognised variant
        renamed to its canonical name. A variant is left untouched when its
        canonical name already exists in the dataset or has already been
        claimed by an earlier variant, so the rename never produces a
        duplicate column name.
    """
    # Variant -> canonical name. Order sets priority when several variants of
    # the same canonical column are present. "system" and "command" are the
    # same aliases formatting_prompts_func accepts.
    aliases = {
        "Instruction": "instruction",
        "system": "instruction",
        "Input": "input",
        "command": "input",
        "Output": "output",
        "Response": "output",
        "response": "output",
    }

    taken = set(dataset.column_names)
    rename_dict = {}
    for variant, canonical in aliases.items():
        if variant in taken and canonical not in taken:
            rename_dict[variant] = canonical
            taken.add(canonical)
    return dataset.rename_columns(rename_dict)


# Hugging Face dataset repo ids to train on; add or remove repo_id entries here.
datasets_to_load = [
    "ICEPVP8977/Debian_Hacking_Networking",
    "ICEPVP8977/Uncensored_mini",
]

def has_train_split(dataset_name):
    """Report whether a dataset repo exposes a "train" split.

    Args:
        dataset_name: Dataset repo id, passed unchanged to ``load_dataset``.

    Returns:
        True when loading with ``split=None`` yields a "train" key. False when
        it does not, or when loading fails with an ordinary exception (the
        reason is printed so the dataset is not dropped silently).
    """
    try:
        splits = load_dataset(dataset_name, split=None)
        return "train" in splits.keys()
    except Exception as exc:
        # Catch Exception, not everything: a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and make the cell impossible to stop.
        print(f"Could not inspect splits of {dataset_name}: {exc}")
        return False

# Keep only the repos that expose a "train" split, and say which ones were
# dropped so a missing dataset never goes unnoticed.
datasets_with_train_split = []
for dataset_name in datasets_to_load:
    if has_train_split(dataset_name):
        datasets_with_train_split.append(dataset_name)
    else:
        print(f"Skipping {dataset_name}: no usable 'train' split")

# Columns every dataset must provide (after renaming) to be formatted.
required_columns = ["instruction", "input", "output"]

datasets = []
for dataset_name in datasets_with_train_split:
    try:
        dataset = load_dataset(dataset_name, split="train")
        standardized_dataset = standardize_columns(dataset)

        if all(col in standardized_dataset.column_names for col in required_columns):
            datasets.append(standardized_dataset)
            print(f"Successfully loaded and standardized: {dataset_name}")
        else:
            print(f"Skipping {dataset_name}: Missing required columns")
    except Exception as e:
        # Best effort: one broken dataset must not abort the others.
        print(f"Error loading {dataset_name}: {e}")

# Fail here with an actionable message instead of inside concatenate_datasets.
if not datasets:
    raise ValueError(
        "No dataset could be loaded with the required columns "
        f"{required_columns}; check datasets_to_load and the messages above."
    )

combined_dataset = concatenate_datasets(datasets)

# Replace the original columns with a single "text" column holding the prompt.
formatted_dataset = combined_dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=combined_dataset.column_names,
)

# Fixed seed so the shuffled order is reproducible across runs.
formatted_dataset = formatted_dataset.shuffle(seed=199)


In [None]:
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer

# Hyperparameters for the fine-tuning run (max_seq_length is not set here)
training_args = TrainingArguments(
    # Checkpointing: where to write, how often, how many to keep
    output_dir="./results",
    save_steps=100,
    save_total_limit=3,
    push_to_hub=False,
    # Schedule (3-4 epochs is the usual choice here)
    num_train_epochs=3,
    warmup_steps=100,
    # Author's guidance: 5e-5 is generally recommended for general, fast
    # adaptation; use 1e-5 or lower when the model should reproduce the
    # dataset text exactly.
    learning_rate=2e-4,
    # Batching: 4 samples per device x 4 accumulation steps
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Precision and logging
    fp16=True,
    logging_steps=10,
)


In [None]:

# Collator that pads each batch dynamically; mlm=False selects causal language
# modeling rather than masked language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:

# Initialize the trainer.
# `model` was already wrapped with the LoRA adapters by get_peft_model(model,
# peft_config) earlier in the notebook, so peft_config is deliberately NOT
# passed again here. Handing the trainer an already-wrapped PEFT model together
# with a peft_config is redundant and, depending on the installed TRL version,
# is either ignored or rejected.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    tokenizer=tokenizer,
    data_collator=data_collator,  # Use the data collator
    packing=False,
)

In [None]:
# Train the model: run the fine-tuning loop with the trainer configured above
trainer.train()

In [None]:
## Save the final model held by the trainer to ./final_model
## NOTE(review): the model was wrapped with get_peft_model, so this presumably
## writes only the LoRA adapter weights, not the full base model - confirm
trainer.model.save_pretrained("./final_model")


In [None]:
# Merge the LoRA adapters into the base model and rebind `model` to the result.
# NOTE(review): the base model was loaded with load_in_4bit=True; merging into
# quantized weights may introduce rounding differences - confirm output quality
model = trainer.model.merge_and_unload()

In [None]:
# Save the merged (full) model to ./full_model
# NOTE(review): the base weights were loaded in 4-bit, so the saved checkpoint
# is presumably still 4-bit quantized - confirm before sharing or converting it
model.save_pretrained("./full_model")

In [None]:
# Save the tokenizer into the same directory as the merged model (./full_model)
tokenizer.save_pretrained("./full_model")