In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import torch
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import random
from sklearn.utils import shuffle


# load dataset

In [6]:
# Read the FULL csv into `df` and report its row count.
df = pd.read_csv(filepath_or_buffer='../FINAL_GEMINI_CHATGPT_FULL_fixed_20250602_230757.csv')
print(df.shape[0])

# Alternative setup -- validate on the ADVERS csv instead of a random split.
# Uncomment these lines together with the "use your own split" override
# that follows the split call further down.
# df = pd.read_csv('../FINAL_GEMINI_CHATGPT_DEFAULT_fixed_20250602_230832.csv')
# advers_df = pd.read_csv('../FINAL_GEMINI_CHATGPT_ADVERS_fixed_20250602_230658.csv')
# print(len(advers_df))

50000


#### --- 1. Data Preparation ---

In [7]:
def split_dataframe_by_document_ids(df, test_size=0.2, random_state=None):
    """
    Split dataframe into train and validation sets ensuring documents stay together.

    All rows sharing a ``document_id`` land on the same side of the split, so
    no document contributes rows to both returned frames.

    Args:
        df: DataFrame with a ``document_id`` column.
        test_size: Fraction of the unique documents assigned to validation.
            The resulting document count is truncated with ``int()``.
        random_state: Optional integer seed for the document sampling. When
            ``None`` (default) NumPy's global random state is used, so a prior
            ``np.random.seed(...)`` call still controls the split.

    Returns:
        Tuple ``(train_df, val_df)``; both are copies of the selected rows.
    """
    # Use first-appearance order instead of a set: set iteration order is not
    # stable for string ids across interpreter runs, which would make even a
    # seeded draw pick different documents.
    unique_ids = np.asarray(df['document_id'].unique())
    num_test_docs = int(len(unique_ids) * test_size)

    # Randomly select test documents (without replacement)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    test_ids = rng.choice(unique_ids, size=num_test_docs, replace=False)

    # Split dataframe with a single membership mask
    in_val = df['document_id'].isin(test_ids)
    val_df = df[in_val].copy()
    train_df = df[~in_val].copy()

    print(f"Unique documents: {len(unique_ids)}")
    print(f"Test documents: {len(test_ids)}")
    print(f"Test set size: {len(val_df)}")
    print(f"Training set size: {len(train_df)}")

    return train_df, val_df

def split_dataframe_by_model(df, test_size=0.2, model_name='chatgpt', random_state=None):
    """
    Document-level split that trains on one source and validates on the others.

    Documents are first divided into train/validation groups by ``document_id``.
    The training frame then keeps only rows whose ``source`` equals
    ``model_name``; the validation frame keeps only rows whose ``source``
    differs from ``model_name``.

    Args:
        df: DataFrame with ``document_id`` and ``source`` columns.
        test_size: Fraction of the unique documents assigned to validation.
            The resulting document count is truncated with ``int()``.
        model_name: Value of ``source`` kept for training and excluded from
            validation.
        random_state: Optional integer seed for the document sampling. When
            ``None`` (default) NumPy's global random state is used.

    Returns:
        Tuple ``(train_df, val_df)``; both are copies of the selected rows.
    """
    # First-appearance order keeps the candidate list stable across runs
    # (a set of string ids iterates in a hash-dependent order).
    unique_ids = np.asarray(df['document_id'].unique())
    num_test_docs = int(len(unique_ids) * test_size)

    # Randomly select test documents (without replacement)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    test_ids = rng.choice(unique_ids, size=num_test_docs, replace=False)

    # Combine the document mask with the source filter for each side
    in_val = df['document_id'].isin(test_ids)
    from_model = df['source'] == model_name
    val_df = df[in_val & ~from_model].copy()
    train_df = df[~in_val & from_model].copy()

    print(f"Unique documents: {len(unique_ids)}")
    print(f"Test documents: {len(test_ids)}")
    print(f"Test set size: {len(val_df)}")
    print(f"Training set size: {len(train_df)}")
    return train_df, val_df

def split_dataframe_by_prompt(df, test_size=0.2, task='rephrase', random_state=None):
    """
    Document-level split that holds out one task for validation.

    Documents are first divided into train/validation groups by ``document_id``.
    The validation frame then keeps only rows whose ``task`` column equals
    ``task``; the training frame keeps only rows with any other ``task`` value.

    Args:
        df: DataFrame with ``document_id`` and ``task`` columns.
        test_size: Fraction of the unique documents assigned to validation.
            The resulting document count is truncated with ``int()``.
        task: Value of the ``task`` column kept for validation and excluded
            from training.
        random_state: Optional integer seed for the document sampling. When
            ``None`` (default) NumPy's global random state is used.

    Returns:
        Tuple ``(train_df, val_df)``; both are copies of the selected rows.
    """
    # First-appearance order keeps the candidate list stable across runs
    # (a set of string ids iterates in a hash-dependent order).
    unique_ids = np.asarray(df['document_id'].unique())
    num_test_docs = int(len(unique_ids) * test_size)

    # Randomly select test documents (without replacement)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    test_ids = rng.choice(unique_ids, size=num_test_docs, replace=False)

    # Combine the document mask with the task filter for each side
    in_val = df['document_id'].isin(test_ids)
    is_task = df['task'] == task
    val_df = df[in_val & is_task].copy()
    train_df = df[~in_val & ~is_task].copy()

    print(f"Unique documents: {len(unique_ids)}")
    print(f"Test documents: {len(test_ids)}")
    print(f"Test set size: {len(val_df)}")
    print(f"Training set size: {len(train_df)}")
    return train_df, val_df

# Split the data by document ids / model / prompt.
# The split helpers draw validation documents with np.random, so seed NumPy's
# global generator first; otherwise every kernel restart produces a different
# train/validation membership and the reported metrics cannot be reproduced.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
train_df, val_df = split_dataframe_by_document_ids(df)
# Or use your own split
# train_df = df
# val_df = advers_df

Unique documents: 3257
Test documents: 651
Test set size: 10039
Training set size: 39961


In [8]:
# Build parallel text/label lists for each split: every `chapter` text is
# labelled 0 and every `generated` text is labelled 1, chapters first.
train_texts = [*train_df['chapter'].tolist(), *train_df['generated'].tolist()]
train_labels = [label for label in (0, 1) for _ in range(len(train_df))]

val_texts = [*val_df['chapter'].tolist(), *val_df['generated'].tolist()]
val_labels = [label for label in (0, 1) for _ in range(len(val_df))]

In [9]:
def truncate_texts(texts, max_length=4*128):
    """Return a new list in which each text is cut to at most `max_length` characters."""
    shortened = []
    for text in texts:
        # Slicing never raises on short strings; they are kept whole.
        shortened.append(text[:max_length])
    return shortened

# Cut every training and validation text to the default character limit.
train_texts, val_texts = truncate_texts(train_texts), truncate_texts(val_texts)

#### --- 2. Load Tokenizer and Model ---

In [10]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = "google-bert/bert-base-multilingual-cased"

# Two output labels: 0 and 1, matching the label lists built above.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### --- 3. Tokenize Data ---

In [11]:
# Tokenize the texts.
# padding=True pads only up to the longest sequence within each call, instead
# of padding every example to the model maximum as padding="max_length" does.
# With inputs cut to a few hundred characters this avoids storing and running
# the model over long runs of pad tokens; the real tokens and attention masks
# are unchanged. truncation=True still caps sequences at the model maximum.
train_encodings = tokenizer(train_texts, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, padding=True, truncation=True)


In [12]:
class TextDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example sequences
        # labels: per-example labels, indexed the same way as the encodings
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The sample count is read from the 'input_ids' field.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Build one example: every encoding field plus its label, as tensors.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [13]:
# Wrap each split's encodings and labels for the Trainer.
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TextDataset(encodings=val_encodings, labels=val_labels)

#### --- 5. Set up Trainer ---

In [14]:
training_args = TrainingArguments(
    output_dir='./results_multilingual_bert',          # Output directory for checkpoints and logs
    num_train_epochs=1,              # Reduce epochs for a quicker example run
    per_device_train_batch_size=8,   # Adjust based on GPU memory
    per_device_eval_batch_size=16,  # Adjust based on GPU memory
    warmup_steps=100,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=50,               # Log metrics every 50 steps
    eval_strategy="steps",           # Evaluate during training
    eval_steps=500,                  # Evaluate every 500 steps
    save_strategy="steps",           # Save checkpoint strategy
    save_total_limit=2,             # Limit the total amount of checkpoints
    # Keep save_steps equal to eval_steps: load_best_model_at_end can only
    # restore a step that was actually checkpointed, so saving every 1000 steps
    # while evaluating every 500 would make half of the evaluated steps
    # (including a potential best one) impossible to reload.
    save_steps=500,                  # Save checkpoint every 500 steps
    load_best_model_at_end=True,     # Load the best model found during training
    metric_for_best_model="accuracy", # Use accuracy to determine the best model
    greater_is_better=True,          # Higher accuracy is better
    fp16=torch.cuda.is_available(),  # Use mixed precision if CUDA is available
    report_to="none"                   # Disable reporting to wandb/tensorboard for this example
)

# Define evaluation metrics (accuracy, F1, false positive rate, false negative rate)

def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the argmax over the
            last axis of ``predictions``.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``false_positive_rate`` and
        ``false_negative_rate``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Masks over the true labels, reused for numerator and denominator.
    is_negative = y_true == 0
    is_positive = y_true == 1

    # Share of true 0s predicted as 1, and share of true 1s predicted as 0.
    fp_rate = np.sum((y_pred == 1) & is_negative) / np.sum(is_negative)
    fn_rate = np.sum((y_pred == 0) & is_positive) / np.sum(is_positive)

    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['f1'] = f1_score(y_true, y_pred)
    metrics['false_positive_rate'] = fp_rate
    metrics['false_negative_rate'] = fn_rate
    return metrics

In [None]:
# Wire the model, arguments, datasets and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# --- 6. Train the Model ---
print("Starting training...")
trainer.train()
print("Training finished.")

# --- 7. Evaluate the Model ---
# Runs one more evaluation pass on the validation dataset and prints the result.
print("Evaluating model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1,False Positive Rate,False Negative Rate
500,0.3634,0.370366,0.866048,0.874363,0.200138,0.067766
1000,0.3119,0.351191,0.892324,0.893315,0.116961,0.09839
1500,0.2582,0.353431,0.883829,0.891643,0.188284,0.044058
2000,0.2994,0.261897,0.921762,0.92021,0.058777,0.097698
2500,0.2784,0.262783,0.926702,0.925659,0.059271,0.087326
3000,0.3495,0.271461,0.911192,0.904797,0.021634,0.155981
3500,0.2591,0.234867,0.922355,0.923302,0.089993,0.065297
4000,0.2485,0.219786,0.933765,0.932549,0.048207,0.084264
4500,0.2496,0.221509,0.936531,0.934234,0.028549,0.09839
5000,0.2443,0.269349,0.93174,0.931652,0.066976,0.069545


#### --- Optional: Save the fine-tuned model and tokenizer (check the `save_path` name before running; an existing directory is overwritten) ---

In [15]:
# Target directory for the fine-tuned artifacts; existing contents are overwritten.
save_path = "./fine_tuned_multilingual_bert"

print(f"Saving/Updating model and tokenizer to {save_path}...")
# Model first, then tokenizer, both into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)
print(f"Model and tokenizer saved to {save_path}")


Saving/Updating model and tokenizer to ./fine_tuned_romanian_bert_128_full2...
Model and tokenizer saved to ./fine_tuned_romanian_bert_128_full2
