In [22]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments
)
import pandas as pd
import re
from torch.utils.data import Dataset

In [23]:
# test see if GPU is ready
def check_gpu():
    if torch.cuda.is_available():
        print("CUDA is ready!")
        device = torch.cuda.get_device_name(0)
        print(f"{device} is ready!")
    else:
        print("CUDA is gone...")
      

In [24]:
check_gpu()

CUDA is ready!
NVIDIA GeForce RTX 4090 is ready!


In [25]:
# set the model info
base_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#dataset_name = "arxiv_papers"
new_model = "/project/models/NV-llama3.1-8b-Arxiv"
api_key = "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"

In [26]:
# Load your data
data = pd.read_csv("ml_papers.csv")
data = data.dropna(subset=['title', 'summary']).reset_index(drop=True)

In [27]:
data

Unnamed: 0,title,summary,pdf_url,arxiv_link
0,Gender Representation and Bias in Indian Civil...,This paper makes three key contributions. Firs...,http://arxiv.org/pdf/2409.12194v2,http://arxiv.org/abs/2409.12194v2
1,DynaMo: In-Domain Dynamics Pretraining for Vis...,Imitation learning has proven to be a powerful...,http://arxiv.org/pdf/2409.12192v1,http://arxiv.org/abs/2409.12192v1
2,Qwen2-VL: Enhancing Vision-Language Model's Pe...,"We present the Qwen2-VL Series, an advanced up...",http://arxiv.org/pdf/2409.12191v1,http://arxiv.org/abs/2409.12191v1
3,Massively Multi-Person 3D Human Motion Forecas...,Forecasting long-term 3D human motion is chall...,http://arxiv.org/pdf/2409.12189v1,http://arxiv.org/abs/2409.12189v1
4,SPECTER: An Instrument Concept for CMB Spectra...,Deviations of the cosmic microwave background ...,http://arxiv.org/pdf/2409.12188v1,http://arxiv.org/abs/2409.12188v1
...,...,...,...,...
75,SFDA-rPPG: Source-Free Domain Adaptive Remote ...,Remote Photoplethysmography (rPPG) is a non-co...,http://arxiv.org/pdf/2409.12040v1,http://arxiv.org/abs/2409.12040v1
76,Not-so-glass-like Caging and Fluctuations of a...,Simple active models of matter recapitulate co...,http://arxiv.org/pdf/2409.12037v1,http://arxiv.org/abs/2409.12037v1
77,Topological Deep Learning with State-Space Mod...,Graph Neural Networks based on the message-pas...,http://arxiv.org/pdf/2409.12033v1,http://arxiv.org/abs/2409.12033v1
78,PhysMamba: Efficient Remote Physiological Meas...,Facial-video based Remote photoplethysmography...,http://arxiv.org/pdf/2409.12031v1,http://arxiv.org/abs/2409.12031v1


In [28]:
# Function to extract topics from titles
def extract_topic(title):
    title = re.sub(r"\(.*?\)|\[.*?\]", "", title)
    title = re.sub(r'[^\w\s]', '', title)
    title = title.lower()
    return title.strip()

# Generate user queries
def generate_user_query(topic):
    return f"I'm looking for papers discussing {topic}."

# Create assistant responses
def create_assistant_response(row):
    title = row['title']
    summary = row['summary']
    response = f"One paper that discusses this topic is '{title}'. {summary}"
    return response

In [29]:
data['topic'] = data['title'].apply(extract_topic)
data['instruction'] = data['topic'].apply(generate_user_query)

# generate assistant responses
data['response'] = data.apply(create_assistant_response, axis=1)

In [32]:
data["response"][1]

"One paper that discusses this topic is 'DynaMo: In-Domain Dynamics Pretraining for Visuo-Motor Control'. Imitation learning has proven to be a powerful tool for training complex\nvisuomotor policies. However, current methods often require hundreds to\nthousands of expert demonstrations to handle high-dimensional visual\nobservations. A key reason for this poor data efficiency is that visual\nrepresentations are predominantly either pretrained on out-of-domain data or\ntrained directly through a behavior cloning objective. In this work, we present\nDynaMo, a new in-domain, self-supervised method for learning visual\nrepresentations. Given a set of expert demonstrations, we jointly learn a\nlatent inverse dynamics model and a forward dynamics model over a sequence of\nimage embeddings, predicting the next frame in latent space, without\naugmentations, contrastive sampling, or access to ground truth actions.\nImportantly, DynaMo does not require any out-of-domain data such as Internet\nd

In [10]:
# Define special tokens
bos_token = "<bos>"
eos_token = "<eos>"
user_start = "<user>"
user_end = "</user>"
assistant_start = "<assistant>"
assistant_end = "</assistant>"
pad_token = "<pad>"

In [11]:
# Format examples
def format_example(instruction, response):
    return f"{bos_token}\n{user_start}\n{instruction}\n{user_end}\n{assistant_start}\n{response}\n{assistant_end}\n{eos_token}"

In [12]:
data['text'] = data.apply(lambda row: format_example(row['instruction'], row['response']), axis=1)

In [7]:
# Setup the BitsAndBytesConfig for 8-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load model in 8-bit precision
    bnb_4bit_compute_dtype=torch.float16,
)

In [8]:
# import the model and configure it
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Setup the BitsAndBytesConfig for 8-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load model in 8-bit precision
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    token=api_key, 
)

special_tokens = {
    'bos_token': bos_token,
    'eos_token': eos_token,
    'pad_token': pad_token,
    'additional_special_tokens': [user_start, user_end, assistant_start, assistant_end]
}

tokenizer.add_special_tokens(special_tokens)
tokenizer.pad_token = pad_token  # Set pad_token to the unique pad_token

# Set pad_token as eos_token
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(base_model_id, 
                                             token=api_key, 
                                             quantization_config=bnb_config,
                                             cache_dir="/project/models",
                                             device_map="auto")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Update model embeddings
model.resize_token_embeddings(len(tokenizer))

Embedding(128263, 4096)

In [15]:
# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)

In [17]:
# Tokenize data
tokenized_data = tokenizer(
    data['text'].tolist(),
    padding='longest',
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

In [18]:
# Create dataset
class PapersDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        self.labels = tokenized_data['input_ids'].clone()

        assistant_start_id = tokenizer.convert_tokens_to_ids(assistant_start)
        assistant_end_id = tokenizer.convert_tokens_to_ids(assistant_end)

        for i in range(len(self.labels)):
            input_ids = self.input_ids[i]
            labels = self.labels[i]

            assistant_start_positions = (input_ids == assistant_start_id).nonzero(as_tuple=True)[0]
            assistant_end_positions = (input_ids == assistant_end_id).nonzero(as_tuple=True)[0]

            if len(assistant_start_positions) > 0 and len(assistant_end_positions) > 0:
                assistant_start_pos = assistant_start_positions[0]
                assistant_end_pos = assistant_end_positions[0]

                labels[:assistant_start_pos + 1] = -100
                labels[assistant_end_pos:] = -100
            else:
                labels[:] = -100

            self.labels[i] = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

In [19]:
dataset = PapersDataset(tokenized_data)

In [20]:
dataset

<__main__.PapersDataset at 0x7f0d946c3cd0>

In [20]:
# Training arguments
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    fp16=True,
    learning_rate=2e-5,
    save_total_limit=2,
    save_steps=500,
    logging_steps=50,
    max_steps = 100,
    eval_strategy="no",
    report_to="none"
)

In [21]:
# arguments setting for 1 RTX 4090
from transformers import Trainer, TrainingArguments

training_arguments = TrainingArguments(
    output_dir="/project/models/NV-arxiv-llama3.1",             # Where to save results
    num_train_epochs=3,                 # Number of epochs
    per_device_train_batch_size=2,      # Start with 2, adjust based on memory
    gradient_accumulation_steps=5,      # Accumulate gradients to simulate larger batch size
    fp16=True,                         # Use FP16 for memory efficiency on RTX 4090
    gradient_checkpointing=True,        # Enable gradient checkpointing to save memory
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2e-5,                 # Adjust learning rate for fine-tuning
    max_grad_norm=0.3,                  # Gradient clipping
    weight_decay=0.001,                 # Regularization
    optim="adamw_torch",                      # Use standard AdamW optimizer
    max_steps=1500,                      # Train for 500 steps
    warmup_ratio=0.03,                  # Warmup learning rate
    group_by_length=True,               # Group sequences of similar lengths to save memory
    save_steps=100,                     # Save model checkpoint every 100 steps
    logging_steps=5,                    # Log training progress every 5 steps
    report_to="none"
    
)

In [23]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    tokenizer=tokenizer
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [24]:
# Train the model
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss
5,2.227
10,2.273
15,2.2007
20,2.2083
25,2.1842
30,2.1733
35,2.1655
40,2.0888
45,2.0192
50,2.0371



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]

Cannot access gated r

TrainOutput(global_step=1500, training_loss=0.4531424058731645, metrics={'train_runtime': 4780.0427, 'train_samples_per_second': 3.138, 'train_steps_per_second': 0.314, 'total_flos': 3.4645638905856e+17, 'train_loss': 0.4531424058731645, 'epoch': 187.5})

In [10]:
from transformers import AutoTokenizer

# Define your special tokens
bos_token = "<bos>"
eos_token = "<eos>"
pad_token = "<pad>"
user_start = "<user>"
user_end = "</user>"
assistant_start = "<assistant>"
assistant_end = "</assistant>"

special_tokens = {
    'bos_token': bos_token,
    'eos_token': eos_token,
    'pad_token': pad_token,
    'additional_special_tokens': [user_start, user_end, assistant_start, assistant_end]
}

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    token=api_key
)

# Add special tokens to the tokenizer
tokenizer.add_special_tokens(special_tokens)
tokenizer.pad_token = pad_token

In [11]:
from transformers import AutoModelForCausalLM

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    token=api_key,
    quantization_config=bnb_config,
    cache_dir="/project/models",
    device_map="auto"
)

# Update model's embeddings to accommodate new tokens
base_model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Embedding(128263, 4096)

In [12]:
# Load the LoRA adapter weights
model = PeftModel.from_pretrained(
    base_model,
    "/project/models/arxiv_model",
    device_map="auto"
).to("cuda")

In [39]:
# Define the format_example function
def format_example(instruction, response=""):
    return f"{bos_token}\n{user_start}\n{instruction}\n{user_end}\n{assistant_start}\n{response}"

# Prepare the input
instruction = "I am looking for a paper discussing To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic reasoning"
input_text = format_example(instruction)

In [40]:
# Tokenize the input
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=512,
    padding=True
).to("cuda")

input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]


In [41]:
# Generate the response
with torch.no_grad():
    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.2,
        num_return_sequences=1,
        eos_token_id=tokenizer.convert_tokens_to_ids(eos_token),
        pad_token_id=tokenizer.convert_tokens_to_ids(pad_token)
    )

# Decode and extract the assistant's response
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=False)

print(generated_text)

<|begin_of_text|><bos>
<user>
I am looking for a paper discussing To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic reasoning
</user>
<assistant>
I am looking for a paper discussing 'To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic reasoning'
 YYS
One paper that discusses this topic is 'To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic reasoning'. Recent large language models have demonstrated impressive mathematical
and logical reasoning capabilities. However, the complexity of natural math
and logic problems often prevents models from finding an exact solution,
leading them to opt for heuristic search strategies over an exhaustive branch
exploration. In this work, we investigate the impact of the chain-of-thought
(cot) curriculum on large language model's math and logic performance.
Surprisingly, we find that the introduction of cot enhances model performance
across a wide range of math and logic challenges, despit

In [40]:
# List all parameters, including their requires_grad status
for name, param in model.named_parameters():
    print(f"Parameter: {name}, Requires Grad: {param.requires_grad}, Shape: {param.shape}")

Parameter: base_model.model.model.embed_tokens.weight, Requires Grad: False, Shape: torch.Size([128263, 4096])
Parameter: base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight, Requires Grad: False, Shape: torch.Size([8388608, 1])
Parameter: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight, Requires Grad: False, Shape: torch.Size([8, 4096])
Parameter: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight, Requires Grad: False, Shape: torch.Size([4096, 8])
Parameter: base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight, Requires Grad: False, Shape: torch.Size([2097152, 1])
Parameter: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight, Requires Grad: False, Shape: torch.Size([8, 4096])
Parameter: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight, Requires Grad: False, Shape: torch.Size([1024, 8])
Parameter: base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight, Requires

In [16]:
# Save the LoRA adapter weights
model.save_pretrained("/project/models/arxiv_model")


Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.


In [17]:
# Inference
# Load the base model and tokenizer with special tokens
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    token=api_key
)
tokenizer.add_special_tokens(special_tokens)
tokenizer.pad_token = pad_token

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    token=api_key,
    quantization_config=bnb_config,
    cache_dir="/project/models",
    device_map="auto"
)

# Update model's embeddings
base_model.resize_token_embeddings(len(tokenizer))

peft_model = PeftModel.from_pretrained(base_model, "/project/models/arxiv_model").to("cuda")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [18]:
# Prepare the input
instruction = "I'm looking for papers discussing You Only Read Once(YORO)."
input_text = format_example(instruction, "")  # Empty assistant response

In [19]:
# Tokenize the input
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=512,
    padding = True,
).to("cuda")

input_ids = inputs.input_ids
attention_mask = inputs.attention_mask

In [21]:
# Generate the response
with torch.no_grad():
    output = peft_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=512,
        num_beams=5,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.5,
        num_return_sequences=1,
        eos_token_id=tokenizer.convert_tokens_to_ids(eos_token),
        pad_token_id=tokenizer.convert_tokens_to_ids(pad_token)
    )

# Decode and extract the assistant's response
generated_text = tokenizer.decode(output[0], skip_special_tokens=False)

print(generated_text)

<|begin_of_text|><bos>
<user>
I'm looking for papers discussing You Only Read Once(YORO).
</user>
<assistant>
One paper that discusses this topic is 'You Only Read Once (YORO): A New Intuitive
Transformer Baseline for Long-Range Language Modeling'. The common practice in
long-range language modeling is to multiply the length of the input by the
number of tokens using a separator, which significantly hampers the
performance of transformer-based models. To address this issue, we propose
You Only Read Once (YORO), a simple transformation that fuses all tokens into
a single input word by adding a learnable vector to the token embeddings.
YORO does not require any changes to the underlying model architecture and can
be applied to both encoder-decoders and encoders-only models. We evaluate
YORO on a wide range of language modeling tasks, spanning both finite and
infinite vocabulary corpora. Our results show that YORO consistently
significantly improves the performance and efficiency of trans

In [29]:
# Get an example from your dataset
example = dataset[0]
input_ids = example['input_ids']
labels = example['labels']

# Convert token IDs back to tokens for readability
input_tokens = tokenizer.convert_ids_to_tokens(input_ids)
label_tokens = [
    tokenizer.convert_ids_to_tokens([id])[0] if id != -100 else '<mask>'
    for id in labels
]

# Print tokens side by side
for input_token, label_token in zip(input_tokens, label_tokens):
    print(f"Input Token: {input_token}, Label Token: {label_token}")

Input Token: <|begin_of_text|>, Label Token: <mask>
Input Token: <bos>, Label Token: <mask>
Input Token: Ċ, Label Token: <mask>
Input Token: <user>, Label Token: <mask>
Input Token: Ċ, Label Token: <mask>
Input Token: I, Label Token: <mask>
Input Token: 'm, Label Token: <mask>
Input Token: Ġlooking, Label Token: <mask>
Input Token: Ġfor, Label Token: <mask>
Input Token: Ġpapers, Label Token: <mask>
Input Token: Ġdiscussing, Label Token: <mask>
Input Token: Ġagents, Label Token: <mask>
Input Token: Ġin, Label Token: <mask>
Input Token: Ġsoftware, Label Token: <mask>
Input Token: Ġengineering, Label Token: <mask>
Input Token: Ġsurvey, Label Token: <mask>
Input Token: Ġlandscape, Label Token: <mask>
Input Token: Ġand, Label Token: <mask>
Input Token: Ġvision, Label Token: <mask>
Input Token: .Ċ, Label Token: <mask>
Input Token: </user>, Label Token: <mask>
Input Token: Ċ, Label Token: <mask>
Input Token: <assistant>, Label Token: <mask>
Input Token: Ċ, Label Token: Ċ
Input Token: One, Lab

In [18]:
# Check trainable parameters
for name, param in model.named_parameters():
    if param.requires_grad:
        print("Check")
        print(f"Trainable parameter: {name}, Shape: {param.shape}")

Check
Trainable parameter: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight, Shape: torch.Size([8, 4096])
Check
Trainable parameter: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight, Shape: torch.Size([4096, 8])
Check
Trainable parameter: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight, Shape: torch.Size([8, 4096])
Check
Trainable parameter: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight, Shape: torch.Size([1024, 8])
Check
Trainable parameter: base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight, Shape: torch.Size([8, 4096])
Check
Trainable parameter: base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight, Shape: torch.Size([1024, 8])
Check
Trainable parameter: base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight, Shape: torch.Size([8, 4096])
Check
Trainable parameter: base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight, Shape

In [22]:
# this is important correct one
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    token=api_key,
    quantization_config=bnb_config,
    cache_dir="/project/models",
    device_map="auto"
)

# Update model's embeddings
model.resize_token_embeddings(len(tokenizer))

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# Apply LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Ensure these modules exist in your model
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [38]:
for name, module in model.named_modules():
    print(name)


base_model
base_model.model
base_model.model.model
base_model.model.model.embed_tokens
base_model.model.model.layers
base_model.model.model.layers.0
base_model.model.model.layers.0.self_attn
base_model.model.model.layers.0.self_attn.q_proj
base_model.model.model.layers.0.self_attn.q_proj.base_layer
base_model.model.model.layers.0.self_attn.q_proj.lora_dropout
base_model.model.model.layers.0.self_attn.q_proj.lora_dropout.default
base_model.model.model.layers.0.self_attn.q_proj.lora_A
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default
base_model.model.model.layers.0.self_attn.q_proj.lora_B
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default
base_model.model.model.layers.0.self_attn.q_proj.lora_embedding_A
base_model.model.model.layers.0.self_attn.q_proj.lora_embedding_B
base_model.model.model.layers.0.self_attn.q_proj.lora_magnitude_vector
base_model.model.model.layers.0.self_attn.k_proj
base_model.model.model.layers.0.self_attn.v_proj
base_model.model.model.lay