In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

print(torch.__version__)  # Check PyTorch version
print(torch.cuda.is_available())  # Should return True if CUDA is working
print(torch.cuda.get_device_name(0))  # Check your GPU

2.6.0+cu118
True
Quadro RTX 3000


In [3]:
# Load and prepare the dataset
df = pd.read_csv('dreams_interpretations.csv')
data = df[['Dream Symbol', 'Interpretation']].dropna()
train_data, val_data = train_test_split(data, test_size=0.2)

data.head()



Unnamed: 0,Dream Symbol,Interpretation
0,Aardvark,To see an aardvark in your dream indicates tha...
1,Abandonment,To dream that you are abandoned suggests that ...
2,Abduction,To dream of being abducted indicates that you ...
3,Aborigine,To see an Aborigine in your dream represents b...
4,Abortion,To dream that you have an abortion suggests th...


In [4]:
class DreamDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        input_text = f"Interpret the dream: {row['Dream Symbol']}"
        target_text = row['Interpretation']

        # Tokenize input and label
        encoding = self.tokenizer(
            input_text, 
            padding="max_length", 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors="pt"
        )

        target_encoding = self.tokenizer(
            target_text, 
            padding="max_length", 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': target_encoding['input_ids'].squeeze(0)
        }

In [5]:
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))


# Split dataset
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Prepare datasets
train_dataset = DreamDataset(train_data, tokenizer)
val_dataset = DreamDataset(val_data, tokenizer)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)


# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



# %%
# Set device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Label smoothing loss (helps with overfitting)
def compute_loss(logits, labels):
    loss_fct = torch.nn.CrossEntropyLoss(label_smoothing=0.1, ignore_index=tokenizer.pad_token_id)
    return loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

In [10]:
from tqdm import tqdm

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0

    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = compute_loss(outputs.logits, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} finished, avg loss: {total_loss / len(train_loader)}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = compute_loss(outputs.logits, labels)
            val_loss += loss.item()

    print(f"Validation Loss: {val_loss / len(val_loader)}")


Epoch 1: 100%|██████████| 181/181 [30:30<00:00, 10.12s/it, loss=3.52]


Epoch 1 finished, avg loss: 4.2320302955353455
Validation Loss: 3.6747285231300024


Epoch 2: 100%|██████████| 181/181 [52:58<00:00, 17.56s/it, loss=3.01]


Epoch 2 finished, avg loss: 3.7265347683627303
Validation Loss: 3.5500590645748638


Epoch 3: 100%|██████████| 181/181 [52:39<00:00, 17.46s/it, loss=2.71]


Epoch 3 finished, avg loss: 3.571800096258933
Validation Loss: 3.4803445235542627


In [11]:
model.save_pretrained("dream_interpreter_t5")
tokenizer.save_pretrained("dream_interpreter_t5")


('dream_interpreter_t5\\tokenizer_config.json',
 'dream_interpreter_t5\\special_tokens_map.json',
 'dream_interpreter_t5\\spiece.model',
 'dream_interpreter_t5\\added_tokens.json')

In [None]:
# %% Evaluation function
def evaluate_model(model, val_loader, device):
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():  # Turn off gradient tracking
        for batch in tqdm(val_loader, desc="Evaluating", ncols=100):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Get predictions (take argmax for sequence generation tasks)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            # Store predictions and true labels
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # You can also compute metrics like BLEU score or others
    return avg_val_loss, all_predictions, all_labels



# %% Evaluate on validation set
avg_val_loss, all_predictions, all_labels = evaluate_model(model, val_loader, device)

# %% Example of comparing predictions and labels
print(f"Example predictions: {all_predictions[:5]}")
print(f"Example true labels: {all_labels[:5]}")

# Optionally, calculate other metrics like BLEU score, accuracy, etc.

from nltk.translate.bleu_score import corpus_bleu

# Calculate BLEU score (use this if your task involves sequence generation)
def calculate_bleu_score(predictions, labels):
    # Convert predictions and labels to a list of lists (for BLEU scoring)
    ref = [[label.split()] for label in labels]  # Convert true labels to list of words
    pred = [prediction.split() for prediction in predictions]  # Convert predictions to list of words

    return corpus_bleu(ref, pred)

# Example usage of BLEU score calculation
bleu_score = calculate_bleu_score(all_predictions, all_labels)
print(f"BLEU score: {bleu_score:.4f}")


In [12]:
# %% Generate Dream Interpretation for Sample Inputs
def generate_interpretation(model, tokenizer, device, dream_input, max_length=128):
    model.eval()  # Set model to evaluation mode
    # Tokenize the input dream
    input_text = f"Dream: {dream_input}\nInterpretation:"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)

    # Generate the interpretation (output text)
    with torch.no_grad():
        generated_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=256,  # Maximum length of the generated interpretation
            num_beams=4,  # Beam search for more diverse outputs
            no_repeat_ngram_size=2,  # Prevent repetition
            early_stopping=True
        )

    # Decode the generated ids back to text
    interpretation = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return interpretation

# %% Example input dreams
sample_dreams = [
    "I was flying high above the clouds",
    "I lost all my teeth",
    "I was being chased by a lion",
    "I was underwater in a deep ocean",
    "I found a hidden treasure chest"
]

# %% Generate and display interpretations for sample dreams
for dream in sample_dreams:
    interpretation = generate_interpretation(model, tokenizer, device, dream)
    print(f"Dream: {dream}")
    print(f"Interpretation: {interpretation}")
    print("="*50)


Dream: I was flying high above the clouds
Interpretation: To dream that you were flying high above the clouds symbolizes your desire to be in control of your life.
Dream: I lost all my teeth
Interpretation: To dream that you have lost all your teeth represents a loss of self-esteem and self esteem.
Dream: I was being chased by a lion
Interpretation: To see or be chased by a lion in your dream represents your desire to be in control of your life.
Dream: I was underwater in a deep ocean
Interpretation: To dream that I was underwater in a deep ocean indicates that you need to be more aware of your surroundings.
Dream: I found a hidden treasure chest
Interpretation: To dream that you have found a hidden treasure chest suggests that there is something you need to find.
