In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv(r"C:\Users\ungdu\Downloads\Chat_Mini\mini_data.csv")

# Display sample data
print(df.head())

                             Câu hỏi  \
0      Các quả có mùi vị như thế nào   
1  Các quả có hình dáng như thế nào    

                                         Câu trả lời  
0  Quả cam ngon. Quả táo dở. Quả chanh chua. Quả ...  
1  Quả cam có hình tròn. Quả táo có hình tròn, hơ...  


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

local_model_path = r"C:\Users\ungdu\Downloads\LLM_Test\LLM"
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(local_model_path)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
import torch
from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        input_text = "Câu hỏi: " + row["Câu hỏi"] + " Câu trả lời: " + row["Câu trả lời"]
        encoding = self.tokenizer(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return input_ids, attention_mask

In [7]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(df, test_size=0.1)

In [8]:
from torch.utils.data import DataLoader

train_dataset = QADataset(train_data, tokenizer)
val_dataset = QADataset(val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

In [9]:
device = torch.device('cpu')
model.to(device)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-

In [11]:
# Define the number of epochs before calculating total_steps
num_epochs = 3  # You can change this value as needed

# Calculate total steps for the scheduler
total_steps = len(train_loader) * num_epochs

# Define the scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [12]:
from tqdm import tqdm

def train(model, dataloader, optimizer, scheduler):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids, attention_mask = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [13]:
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            total_loss += loss.item()

    return total_loss / len(dataloader)

In [1]:
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_loader, optimizer, scheduler)
    val_loss = evaluate(model, val_loader)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")

Epoch 1/3


NameError: name 'train' is not defined

In [None]:
model.save_pretrained('./vi_gemma_2b_rag_finetuned')
tokenizer.save_pretrained('./vi_gemma_2b_rag_finetuned')
print("Fine-tuned model saved!")

In [None]:
def generate_answer(question, model, tokenizer, max_length=100):
    model.eval()
    input_text = f"Câu hỏi: {question} Câu trả lời:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

    # Generate response
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    # Decode and return the output
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer

# Test with a sample question
question = "What is the best fruit for health?"
answer = generate_answer(question, model, tokenizer)
print("Generated Answer:", answer)