In [45]:
import pandas as pd
import os

In [46]:
# Route Hugging Face Hub traffic through the hf-mirror.com mirror.
# NOTE(review): presumably this must run before `transformers` / `datasets`
# are imported for the setting to be picked up -- confirm.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [47]:
# Load the full statement table from a CSV located in the working directory.
all_statements = pd.read_csv('all_statements.csv')

In [48]:
# Reproducible 85/15 split: sample the training rows with a fixed seed ...
train_set = all_statements.sample(random_state=42, frac=0.85)
# ... and use every row whose index label was not sampled as the validation set.
val_set = all_statements.drop(index=train_set.index)

In [49]:
# Display the training split as the cell output.
train_set

Unnamed: 0,statement,labels
65962,American International Group has headquarters ...,1
124723,Artemis brought and relieved disease in men.,2
162193,The pH scale measures the sweetness or bittern...,0
91286,Leonardo Bonucci is a three-time member of the...,0
55979,Ad-Rock's spouse is an American who was born i...,1
...,...,...
87432,Wild Ones is by an American Singer.,1
86941,Purple has a genre.,1
64096,"Human uses for gazelle include pets, research,...",0
3984,Legion is the main character and title charact...,1


In [50]:
# Display the validation split as the cell output.
val_set

Unnamed: 0,statement,labels
5,The parrot has the atomic number of mammal.,0
15,Natalie Wood worked with George Seaton.,1
16,Ireland was a country Mother Teresa lived in.,1
18,"Yellow Flicker Beat has been, at the American ...",1
26,James Garner did not star in television series.,0
...,...,...
169027,Dark Phoenix is an alias that Jean Grey is kno...,1
169036,Las Vegas is famous for its nightlife.,1
169044,"In 2009, Scarlett Johansson released an album.",1
169053,Harlem is where Sean Combs was born.,1


In [51]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

In [52]:
# Wrap both pandas splits as Hugging Face Datasets, keeping only the text and
# label columns.
# preserve_index=False: neither frame has a default index any more (one is a
# shuffled sample, the other is what remains after dropping rows), and
# from_pandas would otherwise store that index as an extra
# `__index_level_0__` column in the dataset.
train_dataset = Dataset.from_pandas(train_set[['statement', 'labels']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_set[['statement', 'labels']], preserve_index=False)

In [60]:
# Checkpoint to fine-tune.
model_name = "google/gemma-3-4b-it"

# Optional tokenizer override; None means "use the tokenizer shipped with the model".
tokenizer_name = None
#tokenizer_name = 'bert-base-uncased' # in case the tokenizer of the original model does not work / is not applicable for some reason

# Resolve the effective tokenizer: any falsy override falls back to the model name.
tokenizer_name = tokenizer_name or model_name
print(f"Using model {model_name} and tokenizer {tokenizer_name}")

Using model google/gemma-3-4b-it and tokenizer google/gemma-3-4b-it


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the tokenizer identified by `tokenizer_name` (resolved earlier to the
# model's own name unless an override was set).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def tokenize_function(examples, max_length=128):
  """Tokenize a batch of statements into fixed-length model inputs.

  Args:
    examples: Batch mapping handed over by `Dataset.map(..., batched=True)`;
      only its "statement" entry is read.
    max_length: Number of tokens every sequence is padded / truncated to.
      128 is a starting point for single-sentence statements -- TODO confirm
      against the actual token-length distribution of the data.

  Returns:
    The tokenizer's output for the batch.
  """
  # An explicit `max_length` is needed: without it, `padding="max_length"` and
  # `truncation=True` both fall back to the tokenizer's own maximum length.
  # Depending on the tokenizer that is either a very large window (every short
  # statement gets padded to it) or no usable limit at all, in which case
  # padding and truncation are skipped with only a warning and the examples
  # end up with unequal lengths.
  return tokenizer(
    examples["statement"],
    padding="max_length",
    truncation=True,
    max_length=max_length,
  )

# Tokenize each split in batches, then drop the raw text column: once the
# token ids exist the model no longer needs the original string.
tokenized_train_dataset = (
    train_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["statement"])
)
tokenized_val_dataset = (
    val_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["statement"])
)

# Switch both datasets to the "torch" output format (use the TensorFlow
# format instead if you train with TF).
tokenized_train_dataset.set_format("torch")
tokenized_val_dataset.set_format("torch")

In [55]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores, or a tuple whose first element holds them).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # A model that returns more than the logits yields a tuple here; the class
    # scores are its first element. argmax over the raw tuple would fail.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = np.argmax(scores, axis=-1)  # index of the highest-scoring class
    # 'weighted' averages the per-class scores by class support (multiclass).
    # zero_division=0 yields the same values as the default but without a
    # warning whenever a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Load the checkpoint with a sequence-classification setup for 3 labels.
# Optional keyword arguments that are currently disabled (add them inside the
# call when needed):
#   load_in_4bit=True            quantization; can still be useful for faster
#                                inference or lower VRAM on consumer GPUs
#   torch_dtype=torch.bfloat16   use if supported
#   trust_remote_code=True       only if required by the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",  # let the library handle device placement
    num_labels=3,
)

In [None]:
save_path = 'fine_tuned_weights' # output directory: training checkpoints plus the final model and tokenizer are written here

num_train_epochs = 100 # number of training epochs; deliberately large for now

In [None]:
# Define Training Arguments ---
training_args = TrainingArguments(
    output_dir=save_path,            # Directory to save the model and results
    num_train_epochs=num_train_epochs,  # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    learning_rate=2e-5,              # Learning rate for the optimizer
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log metrics every X steps
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # rejected by current transformers releases.
    eval_strategy="epoch",           # Evaluate model at the end of each epoch
    save_strategy="epoch",           # Save model checkpoint at the end of each epoch
    # One checkpoint per epoch over `num_train_epochs` epochs would otherwise
    # pile up on disk. With load_best_model_at_end the best checkpoint is kept
    # in addition to the most recent one.
    save_total_limit=2,
    load_best_model_at_end=True,     # Load the best model found during training at the end
    metric_for_best_model="accuracy",# Use accuracy to determine the best model
)

# Initialize the Trainer ---
# Ties together the configuration, the model, the metric callback and the two
# tokenized splits.
trainer = Trainer(
    args=training_args,                     # hyper-parameters and output settings
    model=model,                            # sequence-classification model to fine-tune
    compute_metrics=compute_metrics,        # accuracy / F1 / precision / recall at evaluation time
    eval_dataset=tokenized_val_dataset,     # split used for evaluation
    train_dataset=tokenized_train_dataset,  # split used for training
)

In [None]:
# Train the Model ---
# Runs the full training loop with the arguments the Trainer was built with.
trainer.train()

# Evaluate the Model ---
# Runs one evaluation pass after training and prints the resulting metrics.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the Model ---
# Model and tokenizer go to the same directory.
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)