In [1]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [3]:
# set the model info
base_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
dataset_name = "arxiv_papers"
new_model = "/project/models/NV-llama3.1-8b-Arxiv"
api_key = ""

In [4]:
# import the model and configure it
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Setup the BitsAndBytesConfig for 8-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load model in 8-bit precision
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    token=api_key, 
    add_eos_token=True,
    add_bos_token=True, 
)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(base_model_id, 
                                             token=api_key, 
                                             quantization_config=bnb_config,
                                             cache_dir="/project/models",
                                             device_map="auto")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Input text for inference
input_text = "I want to find a paper discuss about E2MoCase."

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a chatbot who well know in finding papers for user's request",
    },
    {"role": "user", "content": input_text},
]




# Tokenize the input text
#inputs = tokenizer(input_text, return_tensors="pt").to("cuda")  # Move to GPU if available

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# Generate text with the fine-tuned LoRA model
with torch.no_grad():  # Disable gradient computation during inference
    output = model.generate(
        input_ids,      # Tokenized input
        max_length=200,           # Max length of the output sequence
        num_return_sequences=1,   # Number of sequences to return
        temperature=0.1,          # Adjust temperature for randomness
        top_p=0.95,               # Top-p (nucleus sampling)
        repetition_penalty=1.2     # Penalize repetitive phrases
    )

# Decode the generated token IDs to text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a chatbot who well know in finding papers for user's requestuser

I want to find a paper discuss about E2MoCase.assistant

E2MCASE is an extension of the MCASE (Machine Classification and Application Sentences Extraction) algorithm, which was developed by Hunter et al.

However I couldn't find any information on "E2Mcase" as it seems like this might be a misspelling or variation of another term such as 'e2mc' or 'E2MC'. 

But if you're looking for research related to MCASE, here are some possible sources:

1. A paper titled "A Machine Learning Approach to Rule Induction from Molecular Structure" by John J. McGonigle, James R. Rogers, and others published in Journal of Chemical Information and Computer Sciences.
  
