In [1]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [2]:
# test see if GPU is ready
def check_gpu():
    if torch.cuda.is_available():
        print("CUDA is ready!")
        device = torch.cuda.get_device_name(0)
        print(f"{device} is ready!")
    else:
        print("CUDA is gone...")
        

In [3]:
check_gpu()

CUDA is ready!
NVIDIA GeForce RTX 4090 is ready!


In [3]:
torch.device("cpu")

device(type='cpu')

In [4]:
# import the papers data
import pandas as pd

data = pd.read_csv("ml_papers.csv")
# Combine title and summary for each paper
data['input_text'] = data['title'] + ": " + data['summary']

# Select the input text for fine-tuning
texts = data['input_text'].tolist()

Solution: Reformat your training data to include instruction-response pairs that mimic the conversational context you expect during inference. For example:

- User Prompt (Instruction): “Tell me about SGFormer (Single-Layer Graph) for Transformers.”
- Assistant Response: [Summary of the paper].

By structuring your data this way, you’re training the model to generate responses to specific user inputs.

In [None]:
# set the model info
base_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#dataset_name = "arxiv_papers"
new_model = "/project/models/NV-llama3.1-8b-Arxiv"
api_key = "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"

In [6]:
# import the model and configure it
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Setup the BitsAndBytesConfig for 8-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load model in 8-bit precision
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    token=api_key, 
    add_eos_token=True,
    add_bos_token=True, 
)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(base_model_id, 
                                             token=api_key, 
                                             quantization_config=bnb_config,
                                             cache_dir="/project/models",
                                             device_map="auto")

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
from peft import LoraConfig, get_peft_model

# Define LoRA config
lora_config = LoraConfig(
    r=8,  # Rank of the low-rank decomposition (adjustable, smaller uses less memory)
    lora_alpha=32,  # Scaling factor for LoRA
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to specific layers (e.g., query and value projections)
    lora_dropout=0.1,  # Dropout to prevent overfitting
    bias="none",  # Disable bias in LoRA
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

# Print the number of trainable parameters
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


In [8]:
model.hf_device_map

{'': 0}

In [9]:
# tokenize the data to token
tokenized_data = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

In [10]:
from torch.utils.data import Dataset, DataLoader

# Create the dataset with the tokenized data
class PapersDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.input_ids[idx].clone()
        }

dataset = PapersDataset(tokenized_data)

# DataLoader for batching
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [12]:
# Define the optimizer for the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [11]:
# arguments setting for 1 RTX 4090
from transformers import Trainer, TrainingArguments

training_arguments = TrainingArguments(
    output_dir="./results",             # Where to save results
    num_train_epochs=1,                 # Number of epochs
    per_device_train_batch_size=3,      # Start with 2, adjust based on memory
    gradient_accumulation_steps=5,      # Accumulate gradients to simulate larger batch size
    fp16=True,                         # Use FP16 for memory efficiency on RTX 4090
    gradient_checkpointing=True,        # Enable gradient checkpointing to save memory
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2e-5,                 # Adjust learning rate for fine-tuning
    max_grad_norm=0.3,                  # Gradient clipping
    weight_decay=0.001,                 # Regularization
    optim="adamw_torch",                      # Use standard AdamW optimizer
    max_steps=1500,                      # Train for 500 steps
    warmup_ratio=0.03,                  # Warmup learning rate
    group_by_length=True,               # Group sequences of similar lengths to save memory
    save_steps=100,                     # Save model checkpoint every 100 steps
    logging_steps=5,                    # Log training progress every 5 steps
    report_to="none"
    
)



In [13]:
from accelerate import Accelerator

# Initialize the Accelerator to manage devices
accelerator = Accelerator()

# Prepare the model, optimizer, and dataset
model = accelerator.prepare_model(
    model
)

In [12]:

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,              # Your prepared dataset
    tokenizer=tokenizer,
)



  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [13]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss
5,5.8973
10,6.1595
15,5.9957
20,5.6969
25,5.6916
30,5.3555
35,4.993
40,4.535
45,3.9814
50,3.3008



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]

Cannot access gated r

TrainOutput(global_step=1500, training_loss=0.3495752413223187, metrics={'train_runtime': 3626.8458, 'train_samples_per_second': 6.204, 'train_steps_per_second': 0.414, 'total_flos': 4.0449269002663526e+17, 'train_loss': 0.3495752413223187, 'epoch': 1071.4285714285713})

In [22]:
torch.cuda.current_device()

0

In [15]:
trainer.save_state()

In [16]:
# Save the LoRA adapter weights
model.save_pretrained("./arxiv_model")


Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.


In [17]:
# Load the base model from a local directory, not Hugging Face
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, 
                                             token=api_key, 
                                             quantization_config=bnb_config,
                                             cache_dir="/project/models",
                                             device_map="auto")

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    token=api_key, 
    add_eos_token=True,
    add_bos_token=True, 
)

# Load the LoRA adapter weights
peft_model = PeftModel.from_pretrained(base_model, "./arxiv_model").to("cuda")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Input text for inference
input_text = "I want to find a paper discuss about SGFormer(Single-Layer Graph) for Transformers."

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a chatbot who well know in finding papers for user's request",
    },
    {"role": "user", "content": input_text},
]




# Tokenize the input text
#inputs = tokenizer(input_text, return_tensors="pt").to("cuda")  # Move to GPU if available

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# Generate text with the fine-tuned LoRA model
with torch.no_grad():  # Disable gradient computation during inference
    output = peft_model.generate(
        input_ids,      # Tokenized input
        max_length=200,           # Max length of the output sequence
        num_return_sequences=1,   # Number of sequences to return
        temperature=0.1,          # Adjust temperature for randomness
        top_p=0.95,               # Top-p (nucleus sampling)
        repetition_penalty=1.2     # Penalize repetitive phrases
    )

# Decode the generated token IDs to text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a chatbot who well know in finding papers for user's requestuser

I want to find a paper discuss about SGFormer(Single-Layer Graph) for Transformers.assistant

I can help you find a paper related to SGFormer (Single-Lied Graph Former with Default Hyperparameters on an average computer for transforming networks to be represented as single-layer graphs, especially in machine learning contexts).

To find the paper, I conduct a search through the available databases.

Results:

1 :: Learned Graph Representations Versus Fixed-Sized Word Embeddings
This work is referenced by Paper: 150-word word embeddings represent word differences better than angular regularized label prediction problems:\ Retrieved paper: Learning Graph Transformers with Local Interaction: Exploring Single-Head GFT with Small-Medium sized gangs of ants on a single-graph: Held: SG
