In [35]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, PeftModel
import pandas as pd
import re
from torch.utils.data import Dataset

In [36]:
# Sanity check: is a CUDA device visible to PyTorch?
def check_gpu():
    """Print whether CUDA is available and, if so, the name of device 0."""
    if not torch.cuda.is_available():
        print("CUDA is gone...")
        return

    print("CUDA is ready!")
    gpu_name = torch.cuda.get_device_name(0)
    print(f"{gpu_name} is ready!")
      

In [37]:
# Run the GPU check before any model weights are loaded
check_gpu()

CUDA is ready!
NVIDIA GeForce RTX 4090 is ready!


In [38]:
# set the model info
base_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#dataset_name = "arxiv_papers"
new_model = "/project/models/NV-llama3.1-8b-Arxiv"
# Never commit access tokens to a notebook: read the Hugging Face token from
# the HF_TOKEN environment variable instead. When the variable is unset this
# is None, which is passed through unchanged as `token=` to `from_pretrained`.
# NOTE(review): the token that used to be hardcoded here has been exposed in
# the notebook history and should be revoked.
api_key = os.environ.get("HF_TOKEN")

In [39]:
# Load the paper metadata and keep only rows that have both a title and a summary
data = (
    pd.read_csv("ml_papers.csv")
    .dropna(subset=['title', 'summary'])
    .reset_index(drop=True)
)

In [40]:
# Display the loaded frame (rows missing a title or summary were dropped when loading)
data

Unnamed: 0,title,summary,pdf_url,arxiv_link
0,"Agents in Software Engineering: Survey, Landsc...","In recent years, Large Language Models (LLMs) ...",http://arxiv.org/pdf/2409.09030v1,http://arxiv.org/abs/2409.09030v1
1,Learning Theory Informed Priors for Bayesian I...,Cosmological models are often motivated and fo...,http://arxiv.org/pdf/2409.09029v1,http://arxiv.org/abs/2409.09029v1
2,Towards Leveraging Contrastively Pretrained Ne...,Music recommender systems frequently utilize n...,http://arxiv.org/pdf/2409.09026v1,http://arxiv.org/abs/2409.09026v1
3,INN-PAR: Invertible Neural Network for PPG to ...,Non-invasive and continuous blood pressure (BP...,http://arxiv.org/pdf/2409.09021v1,http://arxiv.org/abs/2409.09021v1
4,An Efficient and Streaming Audio Visual Active...,This paper delves into the challenging task of...,http://arxiv.org/pdf/2409.09018v1,http://arxiv.org/abs/2409.09018v1
5,Closed-Loop Visuomotor Control with Generative...,Despite significant progress in robotics and e...,http://arxiv.org/pdf/2409.09016v1,http://arxiv.org/abs/2409.09016v1
6,AI-LieDar: Examine the Trade-off Between Utili...,"To be safely and successfully deployed, LLMs m...",http://arxiv.org/pdf/2409.09013v1,http://arxiv.org/abs/2409.09013v1
7,VAE Explainer: Supplement Learning Variational...,Variational Autoencoders are widespread in Mac...,http://arxiv.org/pdf/2409.09011v1,http://arxiv.org/abs/2409.09011v1
8,Contri(e)ve: Context + Retrieve for Scholarly ...,Scholarly communication is a rapid growing fie...,http://arxiv.org/pdf/2409.09010v1,http://arxiv.org/abs/2409.09010v1
9,Optimizing Rare Word Accuracy in Direct Speech...,Direct speech translation (ST) models often st...,http://arxiv.org/pdf/2409.09009v1,http://arxiv.org/abs/2409.09009v1


In [41]:
# Function to extract topics from titles
def extract_topic(title):
    """Normalise a paper title into a lowercase, punctuation-free topic string.

    Parenthesised and bracketed asides are removed, remaining punctuation is
    stripped, and every run of whitespace (including line breaks inside the
    title) is collapsed to a single space.

    Args:
        title: Paper title as a string.

    Returns:
        The cleaned, lowercased title with no leading/trailing whitespace.
    """
    title = re.sub(r"\(.*?\)|\[.*?\]", "", title)
    title = re.sub(r'[^\w\s]', '', title)
    # Dropping asides and punctuation leaves gaps such as "a  b"; collapse them
    # so the generated query reads as a normal sentence.
    title = re.sub(r"\s+", " ", title)
    return title.lower().strip()

# Build the user-side request for a topic
def generate_user_query(topic):
    """Return the fixed user request sentence with `topic` inserted."""
    query_template = "I'm looking for papers discussing {}."
    return query_template.format(topic)

# Build the assistant-side answer for one paper
def create_assistant_response(row):
    """Return the assistant reply for a row: the quoted title followed by the summary.

    `row` is anything indexable by the keys 'title' and 'summary'.
    """
    return "One paper that discusses this topic is '{}'. {}".format(
        row['title'], row['summary']
    )

In [42]:
# Derive the normalised topic from each title, then the user request from each topic
data['topic'] = data['title'].map(extract_topic)
data['instruction'] = data['topic'].map(generate_user_query)

# Build the assistant reply row by row (needs both title and summary)
data['response'] = data.apply(create_assistant_response, axis="columns")

In [43]:
# Preview the first rows, now including the topic / instruction / response columns
data.head()

Unnamed: 0,title,summary,pdf_url,arxiv_link,topic,instruction,response
0,"Agents in Software Engineering: Survey, Landsc...","In recent years, Large Language Models (LLMs) ...",http://arxiv.org/pdf/2409.09030v1,http://arxiv.org/abs/2409.09030v1,agents in software engineering survey landscap...,I'm looking for papers discussing agents in so...,One paper that discusses this topic is 'Agents...
1,Learning Theory Informed Priors for Bayesian I...,Cosmological models are often motivated and fo...,http://arxiv.org/pdf/2409.09029v1,http://arxiv.org/abs/2409.09029v1,learning theory informed priors for bayesian i...,I'm looking for papers discussing learning the...,One paper that discusses this topic is 'Learni...
2,Towards Leveraging Contrastively Pretrained Ne...,Music recommender systems frequently utilize n...,http://arxiv.org/pdf/2409.09026v1,http://arxiv.org/abs/2409.09026v1,towards leveraging contrastively pretrained ne...,I'm looking for papers discussing towards leve...,One paper that discusses this topic is 'Toward...
3,INN-PAR: Invertible Neural Network for PPG to ...,Non-invasive and continuous blood pressure (BP...,http://arxiv.org/pdf/2409.09021v1,http://arxiv.org/abs/2409.09021v1,innpar invertible neural network for ppg to ab...,I'm looking for papers discussing innpar inver...,One paper that discusses this topic is 'INN-PA...
4,An Efficient and Streaming Audio Visual Active...,This paper delves into the challenging task of...,http://arxiv.org/pdf/2409.09018v1,http://arxiv.org/abs/2409.09018v1,an efficient and streaming audio visual active...,I'm looking for papers discussing an efficient...,One paper that discusses this topic is 'An Eff...


In [44]:
# Markers that delimit each training example: sequence boundaries first,
# then the opening/closing tags of the user and assistant turns.
# NOTE(review): these are custom strings; confirm they are meant to replace the
# base tokenizer's own sequence-boundary tokens.
bos_token, eos_token = "<bos>", "<eos>"
user_start, user_end = "<user>", "</user>"
assistant_start, assistant_end = "<assistant>", "</assistant>"

In [45]:
# Assemble one training example from an instruction/response pair
def format_example(instruction, response):
    """Return the full training text for one example.

    The pieces are laid out one per line: sequence start marker, user turn
    (opening tag, instruction, closing tag), assistant turn (opening tag,
    response, closing tag), sequence end marker.
    """
    segments = (
        bos_token,
        user_start,
        f"{instruction}",
        user_end,
        assistant_start,
        f"{response}",
        assistant_end,
        eos_token,
    )
    return "\n".join(segments)

In [46]:
# Combine each row's instruction and response into the final training text
data['text'] = data.apply(
    lambda paper: format_example(paper['instruction'], paper['response']),
    axis="columns",
)

In [47]:
# Load the tokenizer and the 4-bit quantized base model.
# (torch, AutoTokenizer, AutoModelForCausalLM and BitsAndBytesConfig are
# already imported in the notebook's first cell, so they are not re-imported.)

# BitsAndBytes configuration: load weights in 4-bit, compute in float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load model weights in 4-bit precision
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    token=api_key,
)

# Register the example markers as special tokens so each one is a single token.
# NOTE(review): this also overrides the tokenizer's bos/eos tokens with the
# custom markers - confirm that is intended for this base model.
special_tokens = {
    'bos_token': bos_token,
    'eos_token': eos_token,
    'additional_special_tokens': [user_start, user_end, assistant_start, assistant_end]
}

tokenizer.add_special_tokens(special_tokens)

# Set pad_token as eos_token
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    token=api_key,
    quantization_config=bnb_config,
    cache_dir="/project/models",
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [48]:
# Update model embeddings
# Special tokens were added to the tokenizer after it was loaded, so the
# embedding matrix is resized to the tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))

Embedding(128262, 4096)

In [49]:
# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    # Record the task so the adapter is wrapped/saved as a causal-LM adapter.
    task_type="CAUSAL_LM",
)
# Guard against re-running this cell: wrapping an already-wrapped model would
# stack a second set of adapters on top of the first.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)
# NOTE(review): only q_proj/v_proj adapters are trainable here; the embedding
# rows of the newly added special tokens presumably stay at their initial
# values - confirm whether the embeddings should be trained as well.

In [50]:
# Tokenize every training text in one batch: sequences are cut at 512 tokens,
# padded to the longest one in the batch, and returned as PyTorch tensors.
tokenized_data = tokenizer(
    data['text'].to_list(),
    max_length=512,
    truncation=True,
    padding='longest',
    return_tensors='pt',
)

In [51]:
# Create dataset
class PapersDataset(Dataset):
    """Torch dataset whose labels cover only the assistant response tokens.

    Labels start as a copy of `input_ids`; every position that should not
    contribute to the loss is set to -100:

    * everything up to and including the assistant start marker,
    * everything from the assistant end marker onwards,
    * the whole row when no assistant start marker is present.

    When the start marker is present but the end marker is missing (the
    example was cut off by truncation), the response tokens that remain are
    kept as labels instead of discarding the whole example; only positions
    with `attention_mask == 0` are additionally masked.

    Args:
        tokenized_data: Mapping with 2-D tensors 'input_ids' and
            'attention_mask' of identical shape.
        assistant_start_id: Token id of the assistant start marker. Defaults
            to looking up `assistant_start` in the notebook-level `tokenizer`.
        assistant_end_id: Token id of the assistant end marker. Defaults to
            looking up `assistant_end` in the notebook-level `tokenizer`.
    """

    IGNORE_INDEX = -100

    def __init__(self, tokenized_data, assistant_start_id=None, assistant_end_id=None):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        self.labels = tokenized_data['input_ids'].clone()

        if assistant_start_id is None:
            assistant_start_id = tokenizer.convert_tokens_to_ids(assistant_start)
        if assistant_end_id is None:
            assistant_end_id = tokenizer.convert_tokens_to_ids(assistant_end)

        for i in range(len(self.labels)):
            row_ids = self.input_ids[i]
            row_labels = self.labels[i]  # view: in-place writes update self.labels

            start_hits = (row_ids == assistant_start_id).nonzero(as_tuple=True)[0]
            if len(start_hits) == 0:
                # No assistant turn at all: nothing to learn from this row.
                row_labels[:] = self.IGNORE_INDEX
                continue

            # Mask the prompt, including the assistant start marker itself.
            start_pos = int(start_hits[0])
            row_labels[:start_pos + 1] = self.IGNORE_INDEX

            end_hits = (row_ids == assistant_end_id).nonzero(as_tuple=True)[0]
            if len(end_hits) > 0:
                # Mask the end marker and everything after it.
                # NOTE(review): the end marker itself is never a training
                # target - confirm that is intended.
                row_labels[int(end_hits[0]):] = self.IGNORE_INDEX
            else:
                # Truncated example: keep the partial response, never the padding.
                row_labels[self.attention_mask[i] == 0] = self.IGNORE_INDEX

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

In [52]:
# Wrap the tokenized tensors in the dataset; label masking happens in its constructor
dataset = PapersDataset(tokenized_data)

In [53]:
# Display the dataset object (the class defines no custom repr)
dataset

<__main__.PapersDataset at 0x7f87983044f0>

In [54]:
# Training configuration, grouped by concern
training_arguments = TrainingArguments(
    # where checkpoints go and how many are kept
    output_dir="./results",
    save_steps=500,
    save_total_limit=2,
    # optimisation schedule: 2 samples per device x 8 accumulation steps per update
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    fp16=True,
    # logging only; no evaluation and no external reporting
    logging_steps=50,
    eval_strategy="no",
    report_to="none",
)

In [55]:
# Trainer
# Ties together the LoRA-wrapped model, the training configuration and the
# label-masked dataset; the tokenizer is handed over as well.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    tokenizer=tokenizer
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
