In [1]:
# Install required libraries (Uncomment and run if needed)
# !pip install transformers datasets torch scikit-learn

import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load the uploaded CSV file (Dream symbols and interpretations)
file_path = 'dreams_interpretations.csv'
dream_data = pd.read_csv(file_path)

# Build one training string per row from the symbol and its interpretation.
# A missing value in either column propagates to NaN in "text", so the dropna()
# below discards incomplete rows.
dream_data["text"] = "Dream: " + dream_data["Dream Symbol"] + " Interpretation: " + dream_data["Interpretation"]

# Keep only the text column and drop missing values
dream_data_prepared = dream_data[["text"]].dropna()

# Split the data into train (80%) and validation (20%) sets; the fixed
# random_state keeps the split identical across runs
train_df, val_df = train_test_split(dream_data_prepared, test_size=0.2, random_state=42)

# preserve_index=False: after dropna() and the shuffled split the pandas index is
# no longer a plain 0..n-1 range, and from_pandas would otherwise store it as an
# extra "__index_level_0__" column next to "text".
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)


In [2]:
# Load the BERT tokenizer from the pretrained checkpoint
tokenizer_checkpoint = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a "text" key holding the text(s) to encode.

    Returns:
        Whatever ``tokenizer(...)`` returns for those texts, with every
        sequence truncated and padded to exactly 128 tokens.
    """
    encode_options = {
        "padding": "max_length",  # pad every sequence up to max_length
        "truncation": True,       # cut sequences longer than max_length
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

# Apply the tokenizer to the train and validation datasets.
# Both names are rebound to their tokenized versions and the raw "text" column is
# removed, so guard on that column: re-running this cell after it has already run
# is then a no-op instead of failing because "text" no longer exists.
if "text" in train_dataset.column_names:
    train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
if "text" in val_dataset.column_names:
    val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Map:   0%|          | 0/721 [00:00<?, ? examples/s]

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

In [3]:
# Load the pre-trained BERT checkpoint with its masked-language-modeling head
model_checkpoint = "bert-large-uncased"
model = BertForMaskedLM.from_pretrained(model_checkpoint)

# Data collator for MLM: batches are assembled with randomly masked tokens
mlm_mask_probability = 0.15  # 15% of tokens will be masked
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=mlm_mask_probability,
)


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archit

In [4]:
# Define training arguments
# NOTE: `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# keyword; load_best_model_at_end needs the save and eval strategies to match.
training_args = TrainingArguments(
    output_dir="./bert_large_finetuned_dreams",  # Directory to save the fine-tuned model
    num_train_epochs=3,                          # Number of epochs
    per_device_train_batch_size=4,               # BERT Large requires a smaller batch size
    per_device_eval_batch_size=4,                # Evaluation batch size
    save_strategy="epoch",                       # Save at the end of each epoch
    eval_strategy="epoch",                       # Evaluate at the end of each epoch
    logging_steps=10,                            # Log every 10 steps
    logging_dir="./logs",                        # Directory for logs
    save_total_limit=2,                          # Keep only the latest 2 checkpoints
    load_best_model_at_end=True,                 # Load best model at the end
    report_to="tensorboard"                      # Log to TensorBoard
)




In [5]:
# Initialize the Trainer
# The tokenizer is passed as `processing_class`; the older `tokenizer=` keyword is
# deprecated in Trainer.__init__ and emits a warning when used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [6]:
# Run fine-tuning; the returned training summary is displayed as the cell result
train_output = trainer.train()
train_output


Epoch,Training Loss,Validation Loss
1,1.0506,1.291346
2,1.0265,1.171167
3,1.0085,1.030822


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


TrainOutput(global_step=543, training_loss=1.1952701831171306, metrics={'train_runtime': 5332.2802, 'train_samples_per_second': 0.406, 'train_steps_per_second': 0.102, 'total_flos': 503992584055296.0, 'train_loss': 1.1952701831171306, 'epoch': 3.0})

In [7]:
# Write the fine-tuned model and the tokenizer files into the same directory
final_model_dir = "./bert_large_finetuned_dreams"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


('./bert_large_finetuned_dreams\\tokenizer_config.json',
 './bert_large_finetuned_dreams\\special_tokens_map.json',
 './bert_large_finetuned_dreams\\vocab.txt',
 './bert_large_finetuned_dreams\\added_tokens.json')