## Log in to Hugging Face

In [None]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load variables from a local .env file (if any) into the environment, then
# authenticate against the Hugging Face Hub with the token found there.
# The token is read from HUGGINGFACE_TOKEN rather than hard-coded, so the
# secret never has to appear in the notebook itself.
load_dotenv()
token = os.environ.get("HUGGINGFACE_TOKEN")

# add_to_git_credential=True asks huggingface_hub to also store the token in
# the configured git credential helper.
login(token=token, add_to_git_credential=True)

## Imports

In [None]:
from tqdm import tqdm
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

# datasets
import numpy as np
import pandas as pd

## Device

In [None]:
# Choose the compute backend in priority order: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda:0"  # Nvidia GPU
elif torch.backends.mps.is_available():
    device = "mps"  # Apple Silicon GPU
else:
    device = "cpu"

print(f"Device = {device}")

## Hyperparameters

In [None]:
# Random seed; forwarded to TrainingArguments below.
seed = 42

# Tokenizer settings (defined here; not consumed by any visible cell yet).
max_length = 32  # upper bound on the length of text fed to the model
padding = "max_length"  # one of: "longest", "max_length", "do_not_pad"
truncation = True  # cut off text that exceeds max_length

# Data type used when loading the model weights (passed as torch_dtype).
dtype = torch.bfloat16

# Trainer configuration, grouped by purpose.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    # Reproducibility
    seed=seed,

    # Optimisation
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",

    # Log, evaluate and checkpoint once per epoch, keeping a single checkpoint
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,

    # Best-checkpoint selection: track "loss", where lower is better, and
    # reload that checkpoint when training finishes
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

# Validation split size. Presumably the fraction of the dataset held out for
# validation - not used in the visible cells, so confirm where it is consumed.
validation_size = 0.1

## Model

In [None]:
# Identifier of the checkpoint that the next cell passes to `from_pretrained`
# for both the tokenizer and the model.
model_id = "deepseek-ai/deepseek-math-7b-rl"

In [None]:
# Load the tokenizer and the causal-LM weights for `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Place the weights on the selected device, in the configured dtype.
model_kwargs = {"device_map": device, "torch_dtype": dtype}

# FlashAttention-2 kernels only run on CUDA GPUs. The device cell can fall
# back to "mps" or "cpu", where requesting it makes from_pretrained fail, so
# only ask for it on CUDA and let transformers pick its default otherwise.
if device.startswith("cuda"):
    model_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

In [None]:
# Bare expression: in a notebook, this shows the model's repr as the cell output.
model

## Dataset

## Training