In [1]:
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments, LlamaForSequenceClassification
import torch
from torch.utils.data import DataLoader
import os
import subprocess
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ✅ GPU Utilization Check
def check_gpu():
    """Report the compute device in use and, for CUDA, memory usage on device 0.

    When CUDA is available, prints the name of device 0 followed by the
    memory currently allocated and reserved on it, converted from bytes to
    MB. Otherwise prints a notice that execution falls back to the CPU.

    Returns:
        None. All information is written to stdout.
    """
    # Guard clause: without CUDA there is nothing further to report.
    if not torch.cuda.is_available():
        print("⚠️ GPU not available. Running on CPU.")
        return

    bytes_per_mb = 1024**2

    device_name = torch.cuda.get_device_name(0)
    print(f"✅ GPU Available: {device_name}")

    allocated_mb = torch.cuda.memory_allocated(0) / bytes_per_mb
    print(f"Memory Allocated: {allocated_mb:.2f} MB")

    reserved_mb = torch.cuda.memory_reserved(0) / bytes_per_mb
    print(f"Memory Reserved: {reserved_mb:.2f} MB")

# ✅ Check GPU before starting
check_gpu()

# ✅ Download Dataset from GitHub
repo_url = "https://github.com/ltgoslo/norec.git"
dataset_path = "./norec"

if not os.path.exists(dataset_path):
    # check=True raises CalledProcessError when git exits non-zero, so the
    # success message below is only printed after a clone that actually worked.
    subprocess.run(["git", "clone", repo_url, dataset_path], check=True)
    print("✅ Dataset cloned successfully!")
else:
    print("ℹ️ Dataset already exists. Skipping download.")

# ✅ Check GPU after cloning (to ensure no unexpected memory usage)
check_gpu()

✅ GPU Available: NVIDIA H100 80GB HBM3
Memory Allocated: 0.00 MB
Memory Reserved: 0.00 MB
ℹ️ Dataset already exists. Skipping download.
✅ GPU Available: NVIDIA H100 80GB HBM3
Memory Allocated: 0.00 MB
Memory Reserved: 0.00 MB


In [3]:
import os
from getpass import getpass

# SECURITY: an API key used to be hard-coded in this cell. Anything committed
# in plain text must be treated as compromised, so revoke/rotate that key.
# The key is now taken from the environment; if it is absent, prompt for it
# without echoing so it never lands in the notebook source or its output.
if not os.environ.get("NEBIUS_API_KEY"):
    os.environ["NEBIUS_API_KEY"] = getpass("Enter NEBIUS_API_KEY: ")

In [4]:
import os

# Confirm the key is configured WITHOUT printing it: cell output is saved with
# the notebook, so echoing the value would leak the secret to every reader.
print("NEBIUS_API_KEY is set" if os.environ.get("NEBIUS_API_KEY") else "NEBIUS_API_KEY is NOT set")


[REDACTED: API key removed from saved cell output. This key was exposed and should be revoked.]


In [5]:
import os
from openai import OpenAI, OpenAIError

# Pull the credential from the environment and refuse to continue without it.
api_key = os.environ.get("NEBIUS_API_KEY")
if not api_key:
    raise ValueError("NEBIUS_API_KEY not found! Set it as an environment variable.")

# OpenAI-compatible client pointed at the Nebius endpoint.
client = OpenAI(api_key=api_key, base_url="https://api.studio.nebius.ai/v1/")

# Smoke test: send a single greeting message and dump the raw JSON response.
# temperature=0 is passed through unchanged from the request parameters.
greeting_messages = [{"role": "system", "content": "Hello, Nebius AI!"}]

try:
    response = client.chat.completions.create(
        messages=greeting_messages,
        temperature=0,
        model="meta-llama/Llama-3.3-70B-Instruct",
    )
    print(response.to_json())
except OpenAIError as err:
    # API failures are reported to stdout rather than re-raised.
    print("🚨 API Error:", err)


{
  "id": "chatcmpl-d5bcb5a45a4c4cb4ab03d3d9c972fa6e",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?",
        "refusal": null,
        "role": "assistant",
        "audio": null,
        "function_call": null,
        "tool_calls": []
      },
      "stop_reason": null
    }
  ],
  "created": 1738428776,
  "model": "meta-llama/Llama-3.3-70B-Instruct",
  "object": "chat.completion",
  "service_tier": null,
  "system_fingerprint": null,
  "usage": {
    "completion_tokens": 25,
    "prompt_tokens": 36,
    "total_tokens": 61,
    "completion_tokens_details": null,
    "prompt_tokens_details": null
  },
  "prompt_logprobs": null
}


In [6]:
!pip install sentencepiece




In [7]:
import os

# Folders whose .txt files are concatenated into one training corpus.
data_folders = [os.path.join(dataset_path, "data/train"),
                os.path.join(dataset_path, "data/dev")]

output_file = os.path.join(dataset_path, "norec_combined.txt")  # Save inside dataset_path

merged_count = 0  # number of source files written into the combined corpus
with open(output_file, "w", encoding="utf-8") as outfile:
    for folder in data_folders:
        if not os.path.isdir(folder):
            # Make a missing split visible instead of silently producing a smaller corpus.
            print(f"⚠️ Skipping missing folder: {folder}")
            continue
        # os.listdir returns entries in arbitrary order; sorting makes the
        # merged file identical from run to run.
        for filename in sorted(os.listdir(folder)):
            if not filename.endswith(".txt"):
                continue
            file_path = os.path.join(folder, filename)
            with open(file_path, "r", encoding="utf-8") as infile:
                # Copy line by line so a large file is never held in memory whole.
                for line in infile:
                    outfile.write(line)
            outfile.write("\n")  # Add newline for separation
            merged_count += 1

if merged_count:
    print(f"✅ Merged dataset saved as {output_file}")
else:
    # An empty corpus would only fail later, during tokenizer training.
    print(f"⚠️ No .txt files found in {data_folders}; {output_file} is empty.")


✅ Merged dataset saved as ./norec/norec_combined.txt


In [8]:


import sentencepiece as spm

# Locate the merged corpus and stop early when it has not been created yet.
data_file = os.path.join(dataset_path, "norec_combined.txt")
if not os.path.exists(data_file):
    raise FileNotFoundError(f"Dataset file not found: {data_file}")

# Trainer settings gathered in one place: a BPE model with a 32000-piece
# vocabulary, trained on the merged corpus, with outputs named after
# model_prefix.
# NOTE(review): character_coverage=0.9995 is kept as-is; confirm it is the
# intended value for this corpus.
tokenizer_options = {
    "input": data_file,
    "model_prefix": "norec_tokenizer",
    "vocab_size": 32000,
    "model_type": "bpe",
    "character_coverage": 0.9995,
    "max_sentence_length": 4096,
}
spm.SentencePieceTrainer.train(**tokenizer_options)

print("Tokenizer training complete! Model saved as 'norec_tokenizer.model'")

✅ GPU Available: NVIDIA H100 80GB HBM3
Memory Allocated: 0.00 MB
Memory Reserved: 0.00 MB


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-hf.
401 Client Error. (Request ID: Root=1-679e5178-5c8a237d4d06ad496078574c;3fc745ca-6464-4a39-a71c-8f59e208340a)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-hf is restricted. You must have access to it and be authenticated to access it. Please log in.

✅ GPU Available: NVIDIA H100 80GB HBM3
Memory Allocated: 0.00 MB
Memory Reserved: 0.00 MB


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./norec/norec_combined.txt
  input_format: 
  model_prefix: norec_tokenizer
  model_type: BPE
  vocab_size: 32000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4096
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_

KeyboardInterrupt: 

In [None]:
# Fine-tune for causal language modeling
# NOTE(review): `model_name` and `tokenized_dataset` are not defined in the
# visible part of this notebook; they must be created in an earlier cell for
# this cell to run on a fresh kernel.

# Pick ONE consistent precision scheme. Loading the weights in float16 and also
# passing fp16=True makes the Trainer's gradient scaler fail on FP16 parameters
# ("Attempting to unscale FP16 gradients"). Use bf16 weights with bf16 autocast
# where the GPU supports it; otherwise keep fp32 weights and let fp16 mixed
# precision handle the casting (fp16 is only enabled when CUDA is present).
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.is_bf16_supported()
use_fp16 = use_cuda and not use_bf16

model_clm = LlamaForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16 if use_bf16 else torch.float32,
    device_map="auto",
)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args_clm = TrainingArguments(
    output_dir="./llama_norwegian_clm",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs_clm",
    logging_steps=100,
    fp16=use_fp16,
    bf16=use_bf16,
    report_to="none"  # no external experiment tracker
)

trainer_clm = Trainer(
    model=model_clm,
    args=training_args_clm,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

trainer_clm.train()

# Fine-tune for sequence classification
model_cls = LlamaForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 labels: positive, neutral, negative

# NOTE(review): these DataLoaders are not passed to the Trainer below, which is
# given the datasets directly; they are kept in case a later cell uses them.
train_dataloader = DataLoader(tokenized_dataset["train"], batch_size=8, shuffle=True)
eval_dataloader = DataLoader(tokenized_dataset["test"], batch_size=8)

training_args_cls = TrainingArguments(
    output_dir="./llama_norwegian_cls",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5
)

trainer_cls = Trainer(
    model=model_cls,
    args=training_args_cls,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

trainer_cls.train()

ModuleNotFoundError: No module named 'datasets'