In [14]:
!pip install transformers peft datasets accelerate bitsandbytes



In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

In [16]:
from huggingface_hub import login
from dotenv import load_dotenv
import os


# Load environment variables from .env
load_dotenv()

# Get the Hugging Face Token
token = os.getenv("HUGGINGFACE_TOKEN")

# Use the token in your script
print(f"Hugging Face Token: {token[:4]}********")  # Mask for security
login(token=token)

Hugging Face Token: hf_B********


In [17]:
def load_and_prepare_dataset(dataset_path):
    """
    Load the dataset from a CSV file.
    """
    dataset = load_dataset("csv", data_files=dataset_path)
    return dataset

def tokenize_dataset(dataset, tokenizer):
    """
    Tokenize the dataset for causal language modeling.
    """
    def tokenize_function(examples):
        return tokenizer(examples["TEXT"], padding="max_length", truncation=True, max_length=512)
    
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset

In [18]:
def load_model_and_tokenizer(model_name, use_lora=True):
    """
    Load the Google Gemma model and tokenizer with optional LoRA fine-tuning.
    """
    # Quantization configuration for memory efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Load model in 4-bit precision
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, token="your_huggingface_token")

    # Load the model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        token="your_huggingface_token",
    )

    # Prepare the model for LoRA fine-tuning
    if use_lora:
        model = prepare_model_for_kbit_training(model)

        # Define LoRA configuration
        lora_config = LoraConfig(
            r=8,  # Rank of the low-rank matrices
            lora_alpha=32,  # Scaling factor
            target_modules=["q_proj", "v_proj"],  # Target attention layers
            lora_dropout=0.1,  # Dropout for LoRA layers
            bias="none",  # No bias for LoRA
            task_type="CAUSAL_LM",  # Task type
        )

        # Apply LoRA to the model
        model = get_peft_model(model, lora_config)

    return model, tokenizer

In [19]:
def get_training_args():
    """
    Define training arguments for fine-tuning.
    """
    return TrainingArguments(
        output_dir="./gemma-lora-finetuned",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=2,  # Adjust based on GPU memory
        per_device_eval_batch_size=2,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=2,
        save_steps=10_000,
        logging_dir="./logs",
        logging_steps=200,
        fp16=True,  # Mixed precision training
        gradient_accumulation_steps=4,  # Gradient accumulation
    )


In [20]:
 # Dataset path
dataset_path = "/home/cse/Desktop/VOIP_VISHING/REPOS/conversation-prediction/FINAL_DATASET2.csv"

    # Load and prepare the dataset
dataset = load_and_prepare_dataset(dataset_path)

    # Load the model and tokenizer
model_name = "google/gemma-2b"
model, tokenizer = load_model_and_tokenizer(model_name, use_lora=True)

    # Tokenize the dataset
tokenized_dataset = tokenize_dataset(dataset["train"], tokenizer)

    # Define training arguments
training_args = get_training_args()

    # Define the Trainer
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dataset,  # Use the same dataset for evaluation
    )

    # Fine-tune the model
trainer.train()

    # Save the fine-tuned model
model.save_pretrained("./gemma-lora-finetuned")
tokenizer.save_pretrained("./gemma-lora-finetuned")



OSError: There was a specific connection error when trying to load google/gemma-2b:
401 Client Error: Unauthorized for url: https://huggingface.co/google/gemma-2b/resolve/main/config.json (Request ID: Root=1-67d82640-6f8bd0d07e50534004fd4e2c;d266741e-32ad-4902-8e79-d25750cf6a5a)

Invalid credentials in Authorization header