In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {}".format(device))

model_name = "/projects/llm-repo/models/Qwen/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c540970f9e29518b1d8f06ab8b24cba66ad77b6d"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code = True
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code = True)

print(model.device)

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

with torch.no_grad():
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=500,
    )
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

  from .autonotebook import tqdm as notebook_tqdm


Using cuda


2025-06-22 07:27:15.688416: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-22 07:27:15.699631: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750577235.713625   14952 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750577235.717641   14952 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750577235.728170   14952 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

[2025-06-22 07:27:17,489] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/envs/py311/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/envs/py311/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/opt/conda/envs/py311/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/opt/conda/envs/py311/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/opt/conda/envs/py311/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/opt/conda/envs/py311/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/opt/conda/envs/py311/compiler_compat/ld: /opt/conda/envs/py311/bin/../x86_64-conda-linux-gnu/sy

[2025-06-22 07:27:18,553] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


cuda:0


In [2]:
print(response)

Large Language Models (LLMs) are artificial intelligence systems that can generate human-like text from unstructured or semi-structured textual data such as natural language, web texts, emails, and more. These models are capable of understanding complex patterns and relationships in the data they process, which makes them useful for tasks like text summarization, question answering, language translation, sentiment analysis, and more.

LLMs have been around since the 1950s when the first machine learning algorithms were developed. However, it wasn't until the late 20th century with the development of deep neural networks that these models became widely used in various fields like computer science, medicine, finance, and more. Today, large language models are considered the gold standard for many AI applications due to their ability to generate high-quality text with rich semantic content.

In terms of performance, LLMs have significantly improved over time, making them capable of genera

In [3]:
from datasets import load_from_disk
from trl import GRPOConfig, GRPOTrainer

dataset = load_from_disk("/projects/llm-repo/datasets/trl-lib/tldr")

# Define the reward function, which rewards completions that are close to 20 characters
def reward_len(completions, **kwargs):
    rewards = [-abs(20 - len(completion)) for completion in completions]
    rewards = torch.tensor(rewards, dtype=torch.float16)  # Convert to tensor
    if torch.isnan(rewards).any() or torch.isinf(rewards).any():
        print("NaN or Inf detected in rewards!")
        # Optionally, print the completions that caused the issue
        print(completions)
    return rewards

INFO 06-22 07:27:24 [__init__.py:244] Automatically detected platform cuda.


2025-06-22 07:27:27,452	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [4]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [5]:
## Emptying the cuda cache
torch.cuda.empty_cache()

In [6]:
from peft import LoraConfig, get_peft_model

# Define a LoRA configuration. Adjust target_modules based on your model architecture.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # adjust these names for your model if needed
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Attach the adapters to the quantized model.
model = get_peft_model(model, lora_config)

In [7]:
# from transformers import GenerationConfig

per_device_train_batch_size = 2
gradient_accumulation_steps = 8
optim = "paged_adamw_32bit"
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 500
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_args = GRPOConfig(
    output_dir="Qwen2.5-0.5B-GRPO", 
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps= gradient_accumulation_steps,
    # optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    # learning_rate=learning_rate,
    bf16=True,
    # deepspeed="deepspeed_config.json",  # Enable DeepSpeed
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    # warmup_ratio=warmup_ratio,
    # group_by_length=True,
    # lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=False,
    num_generations=2  # Add this line
    )

## compiling the model
# model = torch.compile(model)

trainer = GRPOTrainer(
    # model="Qwen/Qwen2.5-0.5B",
    model = model,
    reward_funcs=reward_len,
    args=training_args,
    train_dataset=dataset
    # generation_config=GenerationConfig(do_sample=False, temperature=1.0, top_p=1.0) # Experiment with these
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
trainer.train()

Step,Training Loss
10,0.0158
20,0.0408
30,0.0021
40,0.0227
50,0.0337
60,0.034
70,0.0207
80,0.0297
90,0.0132
100,0.0334


TrainOutput(global_step=500, training_loss=0.027494922786951065, metrics={'train_runtime': 4549.8108, 'train_samples_per_second': 1.758, 'train_steps_per_second': 0.11, 'total_flos': 0.0, 'train_loss': 0.027494922786951065})

# Inferencing

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {}".format(device))

model_name = "/projects/llm-repo/models/Qwen/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c540970f9e29518b1d8f06ab8b24cba66ad77b6d"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code = True
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code = True)
tokenizer.pad_token = tokenizer.eos_token

# Load the fine-tuned weights (LoRA adapters)
checkpoint_dir = "/projects/neural-alchemists-ftf-hackathon/neural-alchemists-ftf-hackathon/Qwen2.5-0.5B-GRPO/checkpoint-500"
model = PeftModel.from_pretrained(model, checkpoint_dir)

# Move the model to the appropriate device
model = model.to(device)

# Test the fine-tuned model
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, a helpful assistant."},
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Generate output
with torch.no_grad():
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=500,
    )

generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
# Decode the generated output
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(response)

Using cuda
A Large Language Model is a type of artificial intelligence system that can generate human-like text based on the input sentences given by a user. These systems are capable of understanding and generating complex natural language sentences with the ability to adapt their responses to different contexts and inputs. This capability makes them particularly useful for tasks such as question answering, summarization, and generation of text from text data.
