In [2]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load the uploaded CSV file (Dream symbols and interpretations)
file_path = 'dreams_interpretations.csv'
dream_data = pd.read_csv(file_path)

# Extract relevant columns for model training
dream_data["text"] = "Dream: " + dream_data["Dream Symbol"] + " Interpretation: " + dream_data["Interpretation"]

# Keep only the text column
dream_data_prepared = dream_data[["text"]]

# Check for missing values
dream_data_prepared = dream_data_prepared.dropna()

# Split the data into train (80%) and validation (20%) sets
train_df, val_df = train_test_split(dream_data_prepared, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Apply the tokenizer to the train and validation datasets
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Load the pre-trained DistilBERT model for masked language modeling
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

# Define a data collator for MLM (randomly masks words)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # 15% of tokens will be masked
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./distilbert_finetuned_dreams",  # Directory to save the fine-tuned model
    num_train_epochs=3,                          # Number of epochs
    per_device_train_batch_size=8,               # Training batch size
    per_device_eval_batch_size=8,                # Evaluation batch size
    save_strategy="epoch",                       # Save at the end of each epoch
    evaluation_strategy="epoch",                 # Evaluate at the end of each epoch
    logging_steps=10,                            # Log every 10 steps
    logging_dir="./logs",                        # Directory for logs
    save_total_limit=2,                          # Keep only the latest 2 checkpoints
    load_best_model_at_end=True,                 # Load best model at the end
    report_to="tensorboard"                      # Log to TensorBoard
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./distilbert_finetuned_dreams")
tokenizer.save_pretrained("./distilbert_finetuned_dreams")


Map:   0%|          | 0/721 [00:00<?, ? examples/s]

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [3]:
import random
from transformers import pipeline

# Load the fine-tuned DistilBERT model for masked word prediction
mask_filler = pipeline("fill-mask", model="./distilbert_finetuned_dreams", tokenizer="./distilbert_finetuned_dreams")

# 🔹 Retrieve the original text dataset before tokenization
train_texts = train_df["text"].tolist()  # Get original train dreams
val_texts = val_df["text"].tolist()  # Get original validation (test) dreams

# 🔹 Select 2 random dreams from the validation dataset (test set)
sample_test_dreams = random.sample(val_texts, 2)

# 🔹 Select 2 random dreams from the training dataset (train set)
sample_train_dreams = random.sample(train_texts, 2)

# Combine both lists
sample_dreams = sample_test_dreams + sample_train_dreams

# Function to predict missing words in a dream interpretation
def predict_interpretation(dream_text):
    """
    Uses the fine-tuned DistilBERT model to predict missing words in a dream interpretation.
    """
    masked_text = f"Dream: {dream_text} Interpretation: This dream represents [MASK] emotions related to [MASK] and subconscious [MASK]."

    # Generate predictions for each masked token
    predictions = mask_filler(masked_text)

    # Construct interpretation with predicted words
    filled_text = masked_text
    for prediction in predictions:
        filled_text = filled_text.replace("[MASK]", prediction[0]['token_str'], 1)

    return filled_text

# Process and print results for all selected dreams
for i, dream in enumerate(sample_dreams):
    interpretation = predict_interpretation(dream)
    dataset_type = "Test" if i < 2 else "Train"  # First 2 from Test, Last 2 from Train
    print(f"({dataset_type} Set) Dream: {dream}")
    print(f"Predicted Interpretation: {interpretation}")
    print("-" * 50)


Device set to use cpu


(Test Set) Dream: Dream: Yourself Interpretation: To see yourself in your dream is a reflection of how you act and behave in your waking life. Consider what you are doing and how you are feeling in the dream for additional significance.
Predicted Interpretation: Dream: Dream: Yourself Interpretation: To see yourself in your dream is a reflection of how you act and behave in your waking life. Consider what you are doing and how you are feeling in the dream for additional significance. Interpretation: This dream represents your emotions related to life and subconscious desires.
--------------------------------------------------
(Test Set) Dream: Dream: Screen Interpretation: To see a window screen in your dream implies that you are being cautiously optimistic about the good news you receive.
Predicted Interpretation: Dream: Dream: Screen Interpretation: To see a window screen in your dream implies that you are being cautiously optimistic about the good news you receive. Interpretation: T

In [43]:
import random
from transformers import pipeline

# Load the fine-tuned DistilBERT model for masked word prediction
mask_filler = pipeline("fill-mask", model="./distilbert_finetuned_dreams", tokenizer="./distilbert_finetuned_dreams")

# 🔹 Retrieve the original text dataset before tokenization
train_texts = train_df["text"].tolist()  # Get original train dreams
val_texts = val_df["text"].tolist()  # Get original validation (test) dreams

# 🔹 Select 2 random dreams from the validation dataset (test set)
sample_test_dreams = random.sample(val_texts, 2)

# 🔹 Select 2 random dreams from the training dataset (train set)
sample_train_dreams = random.sample(train_texts, 2)

# Combine both lists
sample_dreams = sample_test_dreams + sample_train_dreams

# 🔹 Function to predict missing words in a dream interpretation
def predict_interpretation(dream_text):
    """
    Uses the fine-tuned DistilBERT model to predict missing words in a dream interpretation.
    """
    masked_text = f"Dream: {dream_text}. Freud would say this dream symbolizes [MASK], possibly connected to [MASK] feelings of [MASK]."


    # Generate predictions for masked words (without top_p or temperature)
    predictions = mask_filler(masked_text)

    # Construct interpretation with predicted words
    filled_text = masked_text
    for prediction in predictions:
        filled_text = filled_text.replace("[MASK]", prediction[0]['token_str'], 1)

    return filled_text

# Process and print results for all selected dreams
for i, dream in enumerate(sample_dreams):
    interpretation = predict_interpretation(dream)
    dataset_type = "Test" if i < 2 else "Train"  # First 2 from Test, Last 2 from Train
    print(f"({dataset_type} Set) Dream: {dream}")
    print(f"Predicted Interpretation: {interpretation}")
    print("-" * 50)


Device set to use cpu


(Test Set) Dream: Dream: Dartboard Interpretation: To see a dartboard in your dream indicates that you are feeling hostility from someone. You need to express your anger and feelings more directly. Alternatively, the dartboard may symbolize a goal that you are aiming for. You need to try and take a shot at something new and overcome your fear of failure.
Predicted Interpretation: Dream: Dream: Dartboard Interpretation: To see a dartboard in your dream indicates that you are feeling hostility from someone. You need to express your anger and feelings more directly. Alternatively, the dartboard may symbolize a goal that you are aiming for. You need to try and take a shot at something new and overcome your fear of failure.. Freud would say this dream symbolizes anger, possibly connected to your feelings of anger.
--------------------------------------------------
(Test Set) Dream: Dream: ater
Tots Interpretation: To see or eat tater tots in your dream implies that you are focusing too much

In [12]:
import random
from transformers import pipeline

# Load the fine-tuned DistilBERT model for masked word prediction
mask_filler = pipeline("fill-mask", model="./distilbert_finetuned_dreams", tokenizer="./distilbert_finetuned_dreams")

# 🔹 Retrieve the original text dataset before tokenization
train_texts = train_df["text"].tolist()  # Get original train dreams
val_texts = val_df["text"].tolist()  # Get original validation (test) dreams

# 🔹 Select 2 random dreams from the validation dataset (test set)
sample_test_dreams = random.sample(val_texts, 2)

# 🔹 Select 2 random dreams from the training dataset (train set)
sample_train_dreams = random.sample(train_texts, 2)

samples_own=[
    "I was flying high above the clouds",
    "I lost all my teeth",
    "I was being chased by a lion",
    "I was underwater in a deep ocean",
    "I found a hidden treasure chest"
    "I see baby open pandora box"
]

# Combine both lists
sample_dreams = sample_test_dreams + sample_train_dreams + samples_own

# 🔹 Function to predict missing words in a dream interpretation
def predict_interpretation(dream_text):
    """
    Uses the fine-tuned DistilBERT model to predict missing words in a dream interpretation.
    """
    masked_text = f"Dream: {dream_text}. Freud would say this dream symbolizes [MASK], possibly connected to [MASK] feelings of [MASK]."


    # Generate predictions for masked words (without top_p or temperature)
    predictions = mask_filler(masked_text)

    # Construct interpretation with predicted words
    filled_text = masked_text
    for prediction in predictions:
        filled_text = filled_text.replace("[MASK]", prediction[0]['token_str'], 1)

    return filled_text

# Process and print results for all selected dreams
for i, dream in enumerate(sample_dreams):
    interpretation = predict_interpretation(dream)
    dataset_type = "Test" if i < 2 else "Train"  # First 2 from Test, Last 2 from Train
    print(f"({dataset_type} Set) Dream: {dream}")
    print(f"Predicted Interpretation: {interpretation}")
    print("-" * 50)


Device set to use cpu


(Test Set) Dream: Dream: Babysitter Interpretation: To dream that you are babysitting suggests that you need to care for the child within yourself.
Predicted Interpretation: Dream: Dream: Babysitter Interpretation: To dream that you are babysitting suggests that you need to care for the child within yourself.. Freud would say this dream symbolizes sadness, possibly connected to your feelings of abandonment.
--------------------------------------------------
(Test Set) Dream: Dream: Elf Interpretation: To see an elf in your dream refers to some imbalance and disharmony in your life. The elf often serves as a guide of the soul. Alternatively, it suggests that you need to be more carefree, worry-free, and light-hearted.
Predicted Interpretation: Dream: Dream: Elf Interpretation: To see an elf in your dream refers to some imbalance and disharmony in your life. The elf often serves as a guide of the soul. Alternatively, it suggests that you need to be more carefree, worry-free, and light-he

In [10]:
import random

# Retrieve original dream texts **before tokenization**
validation_dreams = val_df["text"].tolist()  # Get dream texts from validation dataframe

# Select 3 random dreams from the validation set
sample_dreams = [
    "I was flying high above the clouds",
    "I lost all my teeth",
    "I was being chased by a lion",
    "I was underwater in a deep ocean",
    "I found a hidden treasure chest"
]

# Generate interpretations for the selected dreams
for dream in sample_dreams:
    interpretation = predict_interpretation(dream)  # Function defined earlier
    print(f"Dream: {dream}")
    print(f"Predicted Interpretation: {interpretation}")
    print("-" * 50)


TypeError: predict_interpretation() missing 3 required positional arguments: 'tokenizer', 'device', and 'dream_input'

In [None]:
# %% Generate Dream Interpretation for Sample Inputs
def generate_interpretation(model, tokenizer, device, dream_input, max_length=128):
    model.eval()  # Set model to evaluation mode
    # Tokenize the input dream
    input_text = f"Dream: {dream_input}\nInterpretation:"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)

    # Generate the interpretation (output text)
    with torch.no_grad():
        generated_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=256,  # Maximum length of the generated interpretation
            num_beams=4,  # Beam search for more diverse outputs
            no_repeat_ngram_size=2,  # Prevent repetition
            early_stopping=True
        )

    # Decode the generated ids back to text
    interpretation = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return interpretation

# %% Example input dreams
sample_dreams = [
    "I was flying high above the clouds",
    "I lost all my teeth",
    "I was being chased by a lion",
    "I was underwater in a deep ocean",
    "I found a hidden treasure chest"
]

# %% Generate and display interpretations for sample dreams
for dream in sample_dreams:
    interpretation = generate_interpretation(model, tokenizer, device, dream)
    print(f"Dream: {dream}")
    print(f"Interpretation: {interpretation}")
    print("="*50)