In [None]:
# Install Pytorch & other libraries
!pip install "torch==2.1.2" tensorboard

# Install Hugging Face libraries
!pip install  --upgrade \
  "transformers==4.36.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  "trl==0.7.10" # \
  "peft==0.7.1" \

# # install peft & trl from github
# !pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
# !pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade

In [None]:
# import required libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

from transformers import AutoTokenizer
from trl import SFTTrainer , setup_chat_format
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.metrics import accuracy_score, classification_report

In [None]:
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
# install flash-attn
!pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When the variable is unset, `token` is
# None and `login` falls back to prompting for the token interactively.
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)


# Fetch the 20 Newsgroups corpus and pull out its train and test splits
dataset = load_dataset("SetFit/20_newsgroups")
train_data, test_data = dataset["train"], dataset["test"]

# Base checkpoint to fine-tune; its slow (non-fast) tokenizer is used and
# configured to pad on the right
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
tokenizer.padding_side = "right"


# Upper bound (inclusive) on the number of token ids a raw text may produce
max_token_length = 480


def filter_by_length(example):
    """Tell whether an example's text is short enough to keep.

    The text under ``example["text"]`` is run through the module-level
    ``tokenizer`` and the number of resulting ``input_ids`` is compared with
    ``max_token_length``.

    Args:
        example: Mapping with a ``"text"`` entry.

    Returns:
        True when the token count is at most ``max_token_length``, else False.
    """
    token_ids = tokenizer(example["text"])["input_ids"]
    return len(token_ids) <= max_token_length

# Drop every example whose tokenized text is longer than max_token_length
train_data = train_data.filter(filter_by_length)
test_data = test_data.filter(filter_by_length)

# Keep at most the first 3000 training and 1000 test examples.
# `select` raises on out-of-range indices, so the bounds are clamped to the number
# of rows that survived the filter.
train_data = train_data.select(range(min(3000, len(train_data))))
test_data = test_data.select(range(min(1000, len(test_data))))

def make_prompt(example):
    """Turn one dataset row into a three-turn chat conversation.

    The system turn holds the task instructions, the numbered list of the 20
    newsgroup categories and three worked examples. The user turn is the row's
    ``text`` and the assistant turn is its ``label_text``.

    Args:
        example: Mapping with at least the keys ``"text"`` and ``"label_text"``.

    Returns:
        A dict with a single ``"messages"`` key whose value is the list of
        ``{"role": ..., "content": ...}`` turns in system, user, assistant order.
    """
    category_block = """
1. alt.atheism - Discussions about atheism and religious skepticism
2. comp.graphics - Computer graphics, rendering, and visualization
3. comp.os.ms-windows.misc - Microsoft Windows operating system topics
4. comp.sys.ibm.pc.hardware - IBM PC compatible hardware discussions
5. comp.sys.mac.hardware - Apple Macintosh hardware discussions
6. comp.windows.x - X-Windows system for Unix discussions
7. misc.forsale - Items for sale or wanted
8. rec.autos - Automobile enthusiasts' discussions
9. rec.motorcycles - Motorcycle enthusiasts' discussions
10. rec.sport.baseball - Baseball discussions and news
11. rec.sport.hockey - Hockey discussions and news
12. sci.crypt - Cryptography and encryption discussions
13. sci.electronics - Electronics theory and practice
14. sci.med - Medicine and health-related discussions
15. sci.space - Space exploration and astronomy
16. soc.religion.christian - Christian faith and practice discussions
17. talk.politics.guns - Firearms legislation and rights
18. talk.politics.mideast - Middle East politics and current events
19. talk.politics.misc - General political discussions
20. talk.religion.misc - Discussions about various religions
"""

    few_shot_block = """
Example 1:
Text: The new Mars rover has successfully landed and begun its mission to search for signs of ancient microbial life.
Category: sci.space

Example 2:
Text: I'm looking to upgrade my PC's graphics card. Any recommendations for a good mid-range option?
Category: comp.sys.ibm.pc.hardware

Example 3:
Text: The First Amendment protects freedom of speech, but there are still ongoing debates about its limits in certain contexts.
Category: talk.politics.misc
"""

    task_header = "You are a text classification expert. Your task is to classify the given text into one of the 20 newsgroup categories. The categories are:"
    examples_header = "Here are some examples of how to classify texts:"

    # Each section is followed by a blank line; the blocks keep their own
    # leading and trailing newlines, and the prompt ends with one extra newline.
    system_prompt = (
        task_header + "\n\n"
        + category_block + "\n\n"
        + examples_header + "\n\n"
        + few_shot_block + "\n"
    )

    turns = (
        ("system", system_prompt),
        ("user", example["text"]),
        ("assistant", example["label_text"]),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}



# Replace the original columns of each split with the chat-style "messages" column
train_data, test_data = (
    split.map(make_prompt, remove_columns=split.column_names)
    for split in (train_data, test_data)
)

# Report how many examples each split contains after filtering and subsampling
print(f"Filtered train data size: {len(train_data)}")
print(f"Filtered test data size: {len(test_data)}")


In [None]:

# Write both prepared splits to disk as JSON records so they can be reloaded later
train_data.to_json("train.json", orient="records")
test_data.to_json("test.json", orient="records")

In [None]:
# 4-bit NF4 quantization with double quantization; computation runs in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model (the tokenizer was already loaded above).
# - torch_dtype=torch.bfloat16 keeps the non-quantized weights in the same dtype as
#   the 4-bit compute dtype; Flash Attention 2 only supports fp16/bf16.
# - trust_remote_code is deliberately not enabled: Llama-2 is implemented inside
#   transformers, so there is no need to allow code shipped with the checkpoint to run.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)


In [None]:
# Order matters in this cell: each call below replaces or mutates `model`.
# 1) Prepare the k-bit quantized model for training.
model = prepare_model_for_kbit_training(model)
# 2) Enable gradient checkpointing on the model.
model.gradient_checkpointing_enable()
# 3) Switch model and tokenizer to TRL's chat format; the call returns the updated pair.
model, tokenizer = setup_chat_format(model, tokenizer)

# Define LoRA Config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules="all-linear", # PEFT shorthand: every linear layer (attention and MLP), not only attention
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Get the PEFT model: wrap the prepared model with the LoRA adapters defined above
model = get_peft_model(model, lora_config)
# Print how many parameters are trainable
model.print_trainable_parameters()

In [None]:
from datasets import load_dataset

# Read the exported splits back from disk; a plain JSON file is exposed by
# `load_dataset` under the "train" split name, hence split="train" for both files
train_data, test_data = (
    load_dataset("json", data_files=json_path, split="train")
    for json_path in ("train.json", "test.json")
)

In [None]:
# Print the module hierarchy of `model` to inspect its layers
print(model)

In [None]:
# Sanity-check the chat formatting on a single test conversation. Printing the whole
# "messages" column would dump every test example, each repeating the long system prompt.
print(test_data[0]["messages"])

In [None]:
# !pip install torch==2.0.0+cu117
# !pip install pytorch-lightning==1.9.4
# !pip install accelerate==0.21.0
# !pip install tokenizers==0.13.3
# !pip install transformers==4.26.1
# import torch._dynamo
# torch._dynamo.config.suppress_errors = True

# ! pip install wandb
# Set up training arguments
# For a reasonable gpu
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=3,
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=4,
#     warmup_steps=100,
#     logging_dir="./logs",
#     logging_steps=10,
#     save_strategy="epoch",
#     learning_rate=2e-4,
#     fp16=True,
#     remove_unused_columns=False,
# )
# For this exp I am using the L4 GPU
from transformers import TrainingArguments

# Make sure autograd is enabled globally and gradient checkpointing is active
torch.set_grad_enabled(True)
model.gradient_checkpointing_enable()

# Single source of truth for where checkpoints and the final model are written
output_dir = "./Llama-7b-hf-20_newsgroups_full"

# Maximum number of tokens per packed training sequence
max_seq_length = 1024

# Set up training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # effective batch size per device: 8 * 4 = 32
    warmup_ratio=0.03,
    tf32=True,
    bf16=True,
    optim="adamw_torch",
    max_grad_norm=0.3,
    logging_steps=10,
    save_strategy="steps",
    save_steps=20,
    eval_steps=20,
    evaluation_strategy="steps",  # requires an eval_dataset on the trainer (see below)
    dataloader_num_workers=2,
    group_by_length=True,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",  # push to the Hub on every checkpoint save
    hub_model_id="Noodle-bg/Llama-7b-hf_20_newsgroups_full",
    gradient_checkpointing=True,
)

# Create SFT Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    # evaluation_strategy="steps" makes the Trainer evaluate every `eval_steps`;
    # that step fails without an evaluation set, so the held-out split is passed here.
    eval_dataset=test_data,
    peft_config=lora_config,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False,  # No need to add additional separator token
    },
)

# Restrict PyTorch scaled-dot-product attention to its flash kernel while training
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
    trainer.train()

# Save the fine-tuned model
trainer.save_model(output_dir)
