In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q transformers datasets accelerate bitsandbytes peft trl huggingface_hub

In [None]:
import json
import os

import torch
from datasets import Dataset
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Hugging Face authentication is handled in the next cell; the identical
# copy that used to live here (in the imports cell) was redundant.

In [None]:
# Authenticate with the Hugging Face Hub using the 'HF_TOKEN' secret
# stored in Colab's user-data store.
from google.colab import userdata

HF_TOKEN = userdata.get("HF_TOKEN")

login(token=HF_TOKEN)

In [None]:
# Make Google Drive available under /content/drive so the training file can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The parsed objects in file order. Blank / whitespace-only lines
        (for example a trailing newline at the end of the file) are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    records = []
    # Explicit encoding: do not depend on the platform's default locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # json.loads('') raises, so blank lines must be skipped.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with location context added.
                raise json.JSONDecodeError(
                    f"{file_path}, line {line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return records

# Read the supervised fine-tuning examples from the mounted Drive folder
data = load_jsonl(
    file_path='/content/drive/MyDrive/sft_marcus.jsonl',
)

# Format data for training
def format_prompt(example):
    """Render one record as a single instruction-formatted training string.

    Args:
        example: Mapping with 'prompt' and 'response' entries.

    Returns:
        str: "<s>[INST] {prompt} [/INST] {response} </s>".
    """
    # NOTE(review): the tokenizer may add its own BOS token on top of the
    # literal "<s>" written here, giving a duplicate — confirm.
    template = "<s>[INST] {} [/INST] {} </s>"
    prompt_text = example['prompt']
    response_text = example['response']
    return template.format(prompt_text, response_text)

# Wrap each formatted conversation in a {"text": ...} record, then build the Dataset
formatted_data = list(map(lambda item: {"text": format_prompt(item)}, data))
dataset = Dataset.from_list(formatted_data)


In [None]:

model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# A pad token has to be assigned by hand. Reusing EOS for padding is harmful
# with DataCollatorForLanguageModeling: it replaces every label equal to the
# pad token id with -100, which would also mask the real end-of-sequence token
# and the model would never be trained to stop. Use the unknown token instead
# and only fall back to EOS if the tokenizer has no unknown token.
tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token

# NOTE(review): recent transformers releases make Trainer reject fine-tuning of
# a purely quantized model; trainable adapters (e.g. LoRA via peft, which the
# first cell installs) would then have to be attached before training — confirm
# against the installed transformers version.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    # 8-bit weights to save memory. Passing load_in_8bit directly to
    # from_pretrained is deprecated in favour of an explicit quantization config.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)


In [None]:
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of formatted training texts.

    Padding is intentionally NOT applied here: the data collator pads each
    training batch to its own longest sequence, so padding at map time would
    only inflate every example to the longest text of the whole map batch.

    Args:
        examples: Batch mapping with a "text" entry (list of strings), as
            passed by ``Dataset.map(..., batched=True)``.
        max_length: Maximum number of tokens kept per example; longer texts
            are truncated. Defaults to 512.

    Returns:
        The tokenizer's encoding for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
    )

# Tokenize in batches and drop the raw "text" column afterwards: string columns
# cannot be converted to tensors by the data collator, so they must not reach it.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

In [None]:
# Training configuration.
# - No evaluation strategy is passed: "no" is already the default, and the
#   `evaluation_strategy` keyword was renamed to `eval_strategy` in newer
#   transformers releases, so omitting it works on both old and new versions.
# - `remove_unused_columns` is left at its default (True) so that columns the
#   model's forward() does not accept (such as a raw "text" string column) are
#   stripped before batches reach the data collator.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # 1 x 4 = effective batch of 4 per device
    warmup_steps=10,
    logging_steps=1,
    save_strategy="epoch",
    learning_rate=2e-5,
    fp16=True,
    push_to_hub=True,
    hub_model_id="iwswordpress/marcus",
)


In [None]:

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Start training
print("Starting training...")
trainer.train()

# Save and push model
print("Saving model...")
trainer.save_model()
tokenizer.save_pretrained("./results")

# Push to hub
trainer.push_to_hub()

In [None]:


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


print("Training complete! Model saved to iwswordpress/marcus")


def test_model(test_prompt="What is your philosophy on leadership?",
               max_new_tokens=100, temperature=0.7):
    """Generate a sample reply from the in-memory model and print it.

    Uses the module-level ``model`` and ``tokenizer``; nothing is loaded from
    disk or from the Hub here.

    Args:
        test_prompt: User message to wrap in the [INST] ... [/INST] template.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``generate``.

    Returns:
        None. The decoded output sequence is printed.
    """
    # Put the model in evaluation mode for inference
    model.eval()

    # Same instruction template as the one used to build the training texts
    formatted_prompt = f"<s>[INST] {test_prompt} [/INST]"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    # No gradients are needed for generation
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Test Response:", response)

# Run a quick generation smoke test with the default prompt
test_model()