In [None]:
# Install the packages this notebook depends on (IPython shell escapes, one
# `pip install` per package). The next cell imports `transformers` and `torch`
# directly.
# NOTE(review): `accelerate` is presumably needed for the `device_map="auto"`
# loading used in the next cell -- confirm against the transformers docs.
# NOTE(review): `bitsandbytes` and `huggingface_hub` are not imported by the
# code visible in this notebook -- presumably kept for quantized checkpoints
# and model downloads; confirm before removing.
! pip install bitsandbytes
! pip install huggingface_hub
! pip install transformers
! pip install torch
! pip install accelerate

# OR, if a requirements.txt listing the same packages is available:
# ! pip install -r requirements.txt



In [None]:
# The model files loaded later in this notebook live under /content/drive,
# so Google Drive has to be mounted before anything else runs.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the tokenizer and the causal-LM weights from a checkpoint directory on
# the mounted Google Drive. The path names the 8B Instruct variant of
# Llama 3.1.
local_model_path = "/content/drive/My Drive/vllm_models/Meta-Llama-3.1-8B-Instruct"

print(f"Loading tokenizer from {local_model_path}...")
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

print(f"Loading model from {local_model_path} (This will take a bit, and requires a lot of RAM!)...")
# Pick the half-precision dtype at runtime: prefer bfloat16, and fall back to
# float16 only when a CUDA GPU is present and reports no bfloat16 support.
# Without a CUDA device the dtype stays bfloat16, as before.
if torch.cuda.is_available() and not torch.cuda.is_bf16_supported():
    model_dtype = torch.float16
else:
    model_dtype = torch.bfloat16

# No quantization option is passed here, so the weights are loaded in
# `model_dtype` as stored in the checkpoint directory.
# NOTE(review): if the saved checkpoint is actually a pre-quantized (bnb 4-bit)
# export, the dtype only affects the non-quantized modules -- confirm.
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    torch_dtype=model_dtype,
    device_map="auto",  # Let the loader place layers on the available devices
)

print("Model loaded! Now, let's try a query!")

# --- Querying the model ---
# Minimal single-turn example: one system instruction plus one user request.
# An Author/Critic LLM setup would build longer message lists in a loop.

prompt = "Write a short, whimsical story about a squirrel who invents a tiny, acorn-powered spaceship."

# Each chat turn is a {"role", "content"} mapping; order matters (system first).
system_turn = dict(role="system", content="You are a helpful and creative story-writing assistant.")
user_turn = dict(role="user", content=prompt)
messages = [system_turn, user_turn]

# Render the chat messages with the tokenizer's chat template and tokenize
# them. return_dict=True is requested explicitly so the result always carries
# the attention mask next to the token ids, instead of a bare id tensor.
model_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)  # Move inputs to the same device as the model

# Kept under its original name: the decoding step later in this cell uses
# input_ids.shape[1] to strip the prompt from the generated sequence.
input_ids = model_inputs["input_ids"]

print("\nGenerating response...")
output_ids = model.generate(
    **model_inputs,  # input_ids + attention_mask
    max_new_tokens=1000,  # Upper bound on the number of generated tokens
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,  # Helps prevent repetitive text
    # Set the padding id explicitly so generate() does not have to pick one.
    # NOTE(review): presumably this tokenizer defines no pad token -- confirm.
    pad_token_id=tokenizer.eos_token_id,
)

# output_ids[0] holds the prompt tokens followed by the newly generated ones;
# drop the first input_ids.shape[1] positions so only the reply is decoded.
prompt_length = input_ids.shape[1]
generated_ids = output_ids[0][prompt_length:]
response = tokenizer.decode(generated_ids, skip_special_tokens=True)

# Header, story and footer, each on its own line.
print("\n--- Generated Story ---", response, "-----------------------", sep="\n")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading tokenizer from /content/drive/My Drive/vllm_models/Meta-Llama-3.1-8B-Instruct...
Loading model from /content/drive/My Drive/vllm_models/Meta-Llama-3.1-8B-Instruct (This will take a bit, and requires a lot of RAM!)...
Model loaded! Now, let's try a query!

Generating response...

--- Generated Story ---
**The Acorn-Aerius**

In the heart of Oakwood Forest, where sunlight filtered through the leafy canopy above and ancient trees whispered secrets to one another, there lived an inventive squirrel named Squeaky. While his fellow forest dwellers were content with gathering nuts and lounging in the sun, Squeaky's imagination soared on the wings of creativity.

One day, as he scampered up a towering oak, Squeaky spotted a particularly plump acorn lying ripe on a branch. His eyes twinkled with inspiration. He carefully plucked the acorn, feeling its smooth su