In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

import mlflow

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from huggingface_hub import login

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

from datetime import datetime

from dotenv import load_dotenv

In [39]:
# Load secrets and endpoints from the local .env file into the process environment
load_dotenv()

# Hugging Face login; the token is read from the environment and must never be
# hard-coded in the notebook (outputs and source are saved with the file)
HF_TOKEN = os.getenv('HF_TOKEN')
login(token = HF_TOKEN)

# MLflow connection settings, also read from the environment
MLFLOW_TRACKING_URI = os.getenv('MLFLOW_TRACKING_URI')
MLFLOW_TRACKING_USERNAME = os.getenv('MLFLOW_TRACKING_USERNAME')
MLFLOW_TRACKING_PASSWORD = os.getenv('MLFLOW_TRACKING_PASSWORD')

# Only report whether the password is present: printing the secret itself would
# persist it in the notebook output
password_status = "<set>" if MLFLOW_TRACKING_PASSWORD else "<missing>"
print(f"{MLFLOW_TRACKING_URI}, {MLFLOW_TRACKING_USERNAME}, {password_status}")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
https://mlflow.dev.humandev.org/, Reynold, <redacted>


# Load Data

In [3]:
# Read the cleaned training data into a DataFrame and display it
train_df = pd.read_csv(filepath_or_buffer="../data/transformed/cleaned_train_data.csv")
train_df

Unnamed: 0,text,text_b,label
0,The method of any one of claims 7 to 9 wherein...,The radiator 220 may be injectionmolded such t...,1
1,A unit as claimed in claim 13 or 14 wherein sa...,On entering aseptic chamber 15 portion 2a of s...,1
2,A power management program that causes a compu...,It will also be appreciated that while a curre...,1
3,The system according to claim 10 wherein the e...,In 404 it is evaluated if a trend may be found...,1
4,The composition of claim 10 wherein the waterm...,The watersoluble solvent includes but is not l...,1
...,...,...,...
2936,The electrophotographic photosensitive member ...,The alkylmethacrylate preferably has an alkyl ...,0
2937,The coloring composition according to claims 1...,The dye represented by the formula 3 may be a ...,1
2938,The sensor of claim 1 wherein the cap 4 has a ...,Diaphragm void 322 forms a first pressure spac...,1
2939,The electronic device according to any one of ...,In addition to icons relating to actions that ...,1


In [4]:
# Read the cleaned test data into a DataFrame and display it
test_df = pd.read_csv(filepath_or_buffer="../data/transformed/cleaned_test_data.csv")
test_df

Unnamed: 0,text,text_b,label
0,The apparatus of claim 10 wherein the private ...,Upon receiving the request the cloud key manag...,1
1,A device according to claim 2 or 3 wherein the...,Drive control of the laser scanning portion 1 ...,0
2,The electronic system according to claim 13 or...,In Example 7 the multistage charge amplifier o...,0
3,The biological indicator analyzer of any of cl...,The system depicted in Figures 27 and 30 may b...,0
4,Process according to any one of the preceding ...,According to one aspect of the present inventi...,0
...,...,...,...
778,An indiciareading system comprising a dimensio...,In the illustrated embodiment machine vision p...,0
779,A method according to claim 1 wherein the oper...,WANs typically cover broad areas for example r...,1
780,The fastener driving tool of any preceding cla...,The retraction collar assembly 64 includes a r...,1
781,The coherent communication system according to...,Here c represents the speed of lightrepresents...,0


We need to define an instruction (prompt) for the model.

In [5]:
# Instruction template; {text_b} is the description chunk and {text} is the claim
prompt_template = (
    "You are a skillful patent examiner with over 20 years of experience. "
    "You are reviewing patent prior art. Examine if the following chunk of a patent description constitutes prior art for the given claim.\n"
    "DESCRIPTION: {text_b}\n"
    "CLAIM: {text}"
)

# Fill the template row by row and store the result as a new 'prompt' column
for frame in (train_df, test_df):
    frame['prompt'] = [
        prompt_template.format(text_b=description, text=claim)
        for claim, description in zip(frame['text'], frame['text_b'])
    ]

# Persist both frames, now including the prompt column
train_df.to_csv('../data/transformed/train_with_prompt.csv', index=False)
test_df.to_csv('../data/transformed/test_with_prompt.csv', index=False)

In [6]:
# Do not truncate long cell contents, so the whole prompt is visible
pd.set_option('display.max_colwidth', None)

# First row of each frame, used to check that the prompt column was built correctly
train_row, test_row = train_df.iloc[0], test_df.iloc[0]

for heading, row in (
    ("First row of the training data with prompt:", train_row),
    ("First row of the test data with prompt:", test_row),
):
    print(heading)
    print(row)

First row of the training data with prompt:
text                                                                                                                                                                                                                                                                                                                                                                          The method of any one of claims 7 to 9 wherein the forming of the radiation device comprises forming the radiation device on the base through an adhesive part allowing the base to adhere to the radiation device.
text_b                                                                                                                                                                                                                                                                                                                                                                                    

Let's split the training set to get a small validation set for use during training. Since we have little data, I will hold out only 10%. I will also make the split stratified, so that the validation set keeps the same label distribution as the training data.

In [7]:
# Hold out 10% of the training rows for validation, stratified on the label
train_split_df, val_split_df = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df['label'],
    random_state=42,
)

# Check the distribution of the target variable in both splits
for heading, split_frame in (
    ("\nDistribution of target variable in training split:", train_split_df),
    ("\nDistribution of target variable in validation split:", val_split_df),
):
    print(heading)
    print(split_frame['label'].value_counts(normalize=True))


Distribution of target variable in training split:
label
1    0.584656
0    0.415344
Name: proportion, dtype: float64

Distribution of target variable in validation split:
label
1    0.583051
0    0.416949
Name: proportion, dtype: float64


Now I want to convert the data to a Hugging Face Dataset and store it in my account.

In [8]:
# Replace the row labels left over from the split with a fresh 0..n-1 index
train_split_df = train_split_df.reset_index(drop=True)
val_split_df = val_split_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# One Hugging Face Dataset per split, built from the pandas frames
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_split_df, val_split_df, test_df)
)

# Bundle the three splits under their conventional names
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'text_b', 'label', 'prompt'],
        num_rows: 2646
    })
    validation: Dataset({
        features: ['text', 'text_b', 'label', 'prompt'],
        num_rows: 295
    })
    test: Dataset({
        features: ['text', 'text_b', 'label', 'prompt'],
        num_rows: 783
    })
})

In [9]:
# Save Dataset: upload the train/validation/test DatasetDict to the Hugging Face Hub
# repository "patentmatch_exp" (relies on the Hub login performed earlier)
dataset.push_to_hub("patentmatch_exp")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Reynold97/patentmatch_exp/commit/0ecad1f4b2941fdb608a52294d959d11cf64105e', commit_message='Upload dataset', commit_description='', oid='0ecad1f4b2941fdb608a52294d959d11cf64105e', pr_url=None, pr_revision=None, pr_num=None)

Now we have the final data ready

# Model Training

The EDA notebook concluded that we would need a context window of approximately 4-5k tokens. So, I decided to use Llama 3 8B, with its 8k context window, as the base model.

I could follow 2 approaches.

##### 1. Classification with Label as part of text (CausalLM models)

Approach: Train the model to generate text that naturally appends the label at the end.

Input: "DESCRIPTION:... CLAIM:... ======"

Output: "DESCRIPTION:... CLAIM:... ====== Prior Art"

Use Case: This method is useful for applications requiring continuous text output that includes embedded analysis.


##### 2. Sequence Classification Head (SequenceClassification models)

Approach: Add a sequence classification head (linear layer) on top of the transformer. 

For that one can use LlamaForSequenceClassification, it uses the last token in order to do the classification, as other causal models (e.g. GPT-2) do.

Since it does classification on the last token, it requires to know the position of the last token. If a pad_token_id is defined in the configuration, it finds the last token that is not a padding token in each row. If no pad_token_id is defined, it simply takes the last value in each row of the batch. Since it cannot guess the padding tokens when inputs_embeds are passed instead of input_ids, it does the same (take the last value in each row of the batch).

Input: Prompt> "DESCRIPTION:... CLAIM:...."

Output: Direct classification (0-1).

I will use the 2nd approach since it best suits the problem and would be easier to calculate metrics.

I will use QLoRA to train just an adapter for the model. LoRA is a low-rank decomposition method that reduces the number of trainable parameters, which speeds up fine-tuning of large models and uses less memory; QLoRA applies LoRA on top of a quantized base model to further reduce computation and memory requirements.

#### Quantization Config (for QLORA)

In [10]:
# bitsandbytes settings used to load the base model in 4 bits for QLoRA
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the base weights 4-bit quantized
    bnb_4bit_quant_type='nf4',              # NF4 quantization data type
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype (chosen with an A100 in mind)
    bnb_4bit_use_double_quant=True,         # nested quantization of the quantized weights
)

#### Load Model

In [11]:
# Community mirror of Llama 3 8B, used because the original gated repo is not accessible
model_name = "NousResearch/Meta-Llama-3-8B"

# Loading options: quantized weights, a 2-class classification head,
# automatic device placement and the KV cache switched off
model_load_kwargs = dict(
    quantization_config=quantization_config,
    num_labels=2,
    device_map="auto",
    use_cache=False,
)

model = AutoModelForSequenceClassification.from_pretrained(model_name, **model_load_kwargs)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Display the module structure of the loaded (quantized) model
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


#### Load Tokenizer

In [13]:
# Tokenizer for the same checkpoint as the model.
# TODO: add_prefix_space=True was copied from another repo; it still needs to be
# checked whether the model was trained with a leading space at the start of a sentence.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Pad on the left (author's preference) and reuse the EOS token as the padding token
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Verify tokenizer settings
print(f"Pad token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
print(f"Padding side: {tokenizer.padding_side}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Pad token: <|end_of_text|>, ID: 128001
Padding side: left


In [14]:
# Register the tokenizer's pad token id in the model config, so the sequence
# classification head can find the last non-padding token of each row
model.config.pad_token_id = tokenizer.pad_token_id

#### Measuring the base model performance 

Let's see 1 entry tokenized

In [15]:
# Second prompt of the test set, used as a tokenization example
sentence = test_df.prompt.iloc[1]

# Tokenize the sentence with a specified max_length to force static padding.
# The result is deliberately not named `input`, which would shadow the builtin
# input() for the rest of the kernel session.
encoded_example = tokenizer(sentence, return_tensors="pt", padding='max_length', truncation=True, max_length=8192)

print(f"Sentence: {sentence}")
print(f"Tokenized sentence (input_ids): {encoded_example['input_ids']}")
print(f"Tokenized sentence (attention_mask): {encoded_example['attention_mask']}")

# Decode the tokenized input_ids to verify padding
decoded_sentence = tokenizer.decode(encoded_example['input_ids'][0], skip_special_tokens=False)
print(f"Decoded tokenized sentence: {decoded_sentence}")

Sentence: You are a skillful patent examiner with over 20 years of experience. You are reviewing patent prior art. Examine if the following chunk of a patent description constitutes prior art for the given claim.
DESCRIPTION: Drive control of the laser scanning portion 1 shakecorrecting portion 20 according to the first embodiment of the present invention is now described with reference to Figs.2 and 4 to 16.
CLAIM: A device according to claim 2 or 3 wherein the spiral shape has a first end 60 61 anchored to the fixed structure and a second end 64 65 anchored to the tiltable structure 52.
Tokenized sentence (input_ids): tensor([[128001, 128001, 128001,  ...,    220,   4103,     13]])
Tokenized sentence (attention_mask): tensor([[0, 0, 0,  ..., 1, 1, 1]])
Decoded tokenized sentence: <|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_tex

Now let's see the base model performance

In [16]:
# Convert the test prompts to a list
sentences = test_df.prompt.tolist()

# Define the batch size
batch_size = 16

# Initialize an empty list to store the model outputs (one logits tensor per batch)
all_outputs = []

# Process the sentences in batches
for i in range(0, len(sentences), batch_size):
    # Get the batch of sentences
    batch_sentences = sentences[i:i + batch_size]

    # Tokenize the batch. Dynamic padding (pad to the longest prompt in the batch) for efficiency
    inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True,  max_length=8192)

    # Move tensors to the device the model itself reports, instead of assuming
    # "cuda": the model was placed with device_map="auto"
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Perform inference without tracking gradients and store the logits
    with torch.no_grad():
        outputs = model(**inputs)
        all_outputs.append(outputs['logits'])

In [17]:
# Join the per-batch logits along the first dimension (torch.cat's default dim is 0)
final_outputs = torch.cat(all_outputs)
final_outputs

tensor([[-1.6533, -0.8901],
        [-3.5527, -0.4719],
        [-2.4883, -2.4102],
        ...,
        [-4.0352, -1.1797],
        [-4.2148, -0.0417],
        [-1.6826, -0.4602]], device='cuda:0', dtype=torch.float16)

In [18]:
# Predicted class per row: index of the largest logit along axis 1
final_outputs.argmax(axis=1)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [19]:
# Predicted class ids of the untrained head, moved to the CPU and stored next to the labels
base_pred_ids = final_outputs.argmax(dim=1)
test_df['base_predictions'] = base_pred_ids.cpu().numpy()
test_df['base_predictions']

0      1
1      1
2      1
3      1
4      1
      ..
778    1
779    1
780    1
781    1
782    1
Name: base_predictions, Length: 783, dtype: int64

In [20]:
def get_performance_metrics(df_test, pred_col='base_predictions', label_col='label'):
    """Print classification metrics for predictions stored in a DataFrame.

    Prints the confusion matrix, the classification report, the balanced
    accuracy and the plain accuracy.

    Args:
        df_test: DataFrame holding both the true labels and the predictions.
        pred_col: Name of the column with predicted class ids. Defaults to
            'base_predictions', so existing calls keep scoring the base model;
            pass another column name to score a different set of predictions.
        label_col: Name of the column with the true class ids.

    Returns:
        None. The metrics are only printed.
    """
    y_test = df_test[label_col]
    y_pred = df_test[pred_col]

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [21]:
# Baseline: score the predictions of the not-yet-fine-tuned model on the test set
get_performance_metrics(test_df)

Confusion Matrix:
[[ 15 398]
 [ 12 358]]

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.04      0.07       413
           1       0.47      0.97      0.64       370

    accuracy                           0.48       783
   macro avg       0.51      0.50      0.35       783
weighted avg       0.52      0.48      0.34       783

Balanced Accuracy Score: 0.5019435900791833
Accuracy Score: 0.4763729246487867


Now we have the baseline.

#### Data preparation and Data Collator

In [22]:
# Tokenization step applied to the datasets via `Dataset.map`
def preprocess_function(examples):
    """Tokenize the 'prompt' field of a batch of examples.

    Prompts are truncated to at most 8192 tokens; no padding is requested here.

    Args:
        examples: Mapping with a 'prompt' entry holding the text(s) to tokenize.

    Returns:
        Whatever the tokenizer returns for those prompts.
    """
    prompts = examples['prompt']
    tokenized = tokenizer(prompts, truncation=True, max_length=8192)
    return tokenized

In [23]:
# The raw claim/description columns are already embedded in 'prompt',
# so drop them from both splits before tokenization
unused_columns = ['text', 'text_b']
train_dataset = train_dataset.remove_columns(unused_columns)
val_dataset = val_dataset.remove_columns(unused_columns)


In [24]:
# Inspect the first training record after the unused columns were removed
train_dataset[0]

{'label': 1,
 'prompt': 'You are a skillful patent examiner with over 20 years of experience. You are reviewing patent prior art. Examine if the following chunk of a patent description constitutes prior art for the given claim.\nDESCRIPTION: Over the lifetime of a rotor mast the rotor mast bearing may become damaged by for example surface wear fretting fabrication error accidents involving the rotor mast occurring during transport assembly disassembly or repair of the rotor mast any plurality thereof or any combination thereof.Because mechanical failure of a rotor mast is highly likely to result in catastrophic damage to the helicopter and loss of life of the passengers therein rotor mast designs often set forth exacting dimension and tolerance requirements.Thus when damage or wear compromises the dimensional precision of a rotor mast protocol often dictates repair of the rotor mast or replacement if repair is not feasible.Referring to FIG.2B a longitudinal crosssectional view of the b

In [25]:
# Tokenize both splits in batches with preprocess_function
train_dataset, val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/2646 [00:00<?, ? examples/s]

Map:   0%|          | 0/295 [00:00<?, ? examples/s]

In [26]:
# Inspect the first training record again, now that the tokenization map has run
train_dataset[0]

{'label': 1,
 'prompt': 'You are a skillful patent examiner with over 20 years of experience. You are reviewing patent prior art. Examine if the following chunk of a patent description constitutes prior art for the given claim.\nDESCRIPTION: Over the lifetime of a rotor mast the rotor mast bearing may become damaged by for example surface wear fretting fabrication error accidents involving the rotor mast occurring during transport assembly disassembly or repair of the rotor mast any plurality thereof or any combination thereof.Because mechanical failure of a rotor mast is highly likely to result in catastrophic damage to the helicopter and loss of life of the passengers therein rotor mast designs often set forth exacting dimension and tolerance requirements.Thus when damage or wear compromises the dimensional precision of a rotor mast protocol often dictates repair of the rotor mast or replacement if repair is not feasible.Referring to FIG.2B a longitudinal crosssectional view of the b

In [27]:
# Collator that pads the examples of a batch using the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### Eval metric

In [28]:
# Accuracy metric from the `evaluate` library
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a (predictions, labels) pair.

    Args:
        eval_pred: Pair unpacked as (predictions, labels); the predictions are
            per-class scores and are reduced with argmax over axis 1.

    Returns:
        The result of `accuracy.compute` for the predicted class ids.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

#### Lora Config

In [29]:
# LoRA adapter hyper-parameters.
# target_modules is left unset, so PEFT picks its own default for this architecture.
# That default does NOT cover every linear layer: in the printed PEFT model, q_proj
# is wrapped by a LoRA layer while k_proj stays a plain Linear4bit.
# To adapt more layers, list them explicitly, e.g.
#   target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
lora_config = LoraConfig(
    task_type='SEQ_CLS',  # sequence classification task
    r=16,                 # the dimension of the low-rank matrices
    lora_alpha=32,        # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05,    # dropout probability of the LoRA layers
    bias='none',
)
     

In [30]:
# Prepare the quantized model for training with the PEFT helper.
# Note: this rebinds `model`, so re-running the cell applies the helper again.
model = prepare_model_for_kbit_training(model)

Now we prepare the peft model with the LoRA adapter. We can see the added layers.

In [31]:
# Wrap the prepared base model with the LoRA adapter described by lora_config,
# then display the resulting module structure
peft_model = get_peft_model(model=model, peft_config=lora_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=40

Let's see how many parameters we will train 

In [32]:
# Print the trainable vs. total parameter counts of the adapter-wrapped model
peft_model.print_trainable_parameters()


trainable params: 6,823,936 || all params: 7,511,756,800 || trainable%: 0.0908


#### MLFlow config

In [45]:
# If the named experiment does not exist, or existed in MLflow but was deleted, create a new one.
experiment_name = "Reynold_PatentMatchBaseline"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None or experiment.lifecycle_stage == 'deleted':
    mlflow.create_experiment(experiment_name)

# Creating the experiment is not enough: make it the ACTIVE experiment, so that
# runs started later (including the one opened by the Trainer's MLflow
# integration) are attached to it instead of the server's default experiment.
experiment = mlflow.set_experiment(experiment_name)

#### Training parameters

In [46]:
# Timestamped run name, so repeated runs get distinct MLflow runs and output folders.
# Seconds use "%S"; lowercase "%s" is a non-portable platform extension, not "seconds".
run_name=f"LlaMa3-QLoRA-PatentMatch-v0.1-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
output_dir="../artifacts/" + run_name

# Evaluation/checkpoint interval in optimizer steps.
# The training split has 2646 rows and each optimizer step consumes 5 x 5 = 25 of
# them per device, i.e. roughly 106 steps per epoch and about 318 steps for the
# 3 epochs on a single device. The previous interval of 500 steps would therefore
# never trigger an evaluation or a checkpoint, leaving load_best_model_at_end
# with nothing to choose from.
training_args = TrainingArguments(
    report_to="mlflow", # mlflow logging
    run_name=run_name, # MLflow run
    output_dir=output_dir, # output directory
    push_to_hub=True, # push to the Hugging Face account (a real bool, not the string "True")
    hub_strategy = "all_checkpoints", # push all checkpoints
    hub_always_push = True,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=5,
    gradient_accumulation_steps=5,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit", # from the QLoRA paper
    bf16=True,
    #fp16=True,  # Enable mixed precision training
    learning_rate=2e-5,
    lr_scheduler_type="constant",
    logging_steps=50,
    logging_dir="../logs",
    save_strategy="steps",  # Checkpoint on a step schedule
    save_steps=50,          # Save a checkpoint every 50 optimizer steps
    eval_strategy="steps",  # Evaluate on a step schedule
    eval_steps=50,          # Evaluate every 50 optimizer steps (same schedule as saving)
    do_eval=True,
    load_best_model_at_end=True,
    #ddp_find_unused_parameters=False,  # Set for distributed training
)

In [47]:
# Initialize the Trainer.
# compute_metrics is passed explicitly so that validation accuracy is reported at
# every evaluation; it was defined earlier in the notebook but never handed to the Trainer.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [44]:
# FINALLY: start fine-tuning with the Trainer built in the previous cell
trainer.train()

MlflowException: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: 'Permission denied'

In [49]:
# Connectivity/permission smoke test for the MLflow server, independent of training.
# (os and mlflow are already imported in the first cell of the notebook.)
# Dedicated variable names are used so that the experiment configured for the
# training run is not overwritten by this check.
test_experiment_name = "Reynold_test"
test_experiment = mlflow.get_experiment_by_name(test_experiment_name)
if test_experiment is None or test_experiment.lifecycle_stage == 'deleted':
    mlflow.create_experiment(test_experiment_name)
    test_experiment = mlflow.get_experiment_by_name(test_experiment_name)

# Start the run inside the test experiment explicitly; without experiment_id the
# run would be created in whatever experiment happens to be active.
with mlflow.start_run(experiment_id=test_experiment.experiment_id):
    mlflow.log_param("param1", 5)
    mlflow.log_metric("metric1", 0.89)
    # Add your training code here
    # mlflow.log_artifact("path/to/artifact")

print("Run completed and logged to the MLflow server.")

MlflowException: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 403 != 200. Response body: 'Permission denied'