In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5TokenizerFast, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
# Load and prepare the dataset
df = pd.read_csv('dreams.csv')
data = df[['Dream Symbol', 'Interpretation']].dropna()
train_data, val_data = train_test_split(data, test_size=0.2)

In [5]:
# Tokenizer and model setup
model_name = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)



In [6]:
# Tokenization
def tokenize_data(examples):
    inputs = ["translate dream to interpretation: " + ex for ex in examples['Dream Symbol']]
    outputs = [ex for ex in examples['Interpretation']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)

    labels = tokenizer(outputs, max_length=128, truncation=True, padding=True).input_ids
    model_inputs["Interpretation"] = labels
    return model_inputs

In [9]:
def tokenize_data(examples):
    inputs = ["translate dream to interpretation: " + ex for ex in examples['Dream Symbol']]
    outputs = [ex for ex in examples['Interpretation']]
    
    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(outputs, max_length=128, truncation=True, padding=True).input_ids
    
    # Replace padding token id with -100 to ignore it in loss calculation
    labels = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels]
    
    model_inputs["labels"] = labels
    return model_inputs

In [11]:
# Convert to Dataset object
from datasets import Dataset

dataset_train = Dataset.from_pandas(train_data)
dataset_val = Dataset.from_pandas(val_data)

tokenized_train = dataset_train.map(tokenize_data, batched=True)
tokenized_val = dataset_val.map(tokenize_data, batched=True)

Map:   0%|          | 0/721 [00:00<?, ? examples/s]



Map:   0%|          | 0/181 [00:00<?, ? examples/s]

In [12]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="loss"
)



In [13]:
import torch
import transformers
import accelerate

print("Torch version:", torch.__version__)
print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)

Torch version: 2.5.1
Transformers version: 4.46.1
Accelerate version: 1.3.0


In [14]:
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [10]:
# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("./dream_model")
tokenizer.save_pretrained("./dream_model")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,3.2545,3.053847
2,3.0436,2.773596
3,2.8697,2.69433
4,2.7137,2.645114
5,2.7229,2.610678
6,2.7031,2.589275
7,2.6514,2.578536
8,2.6595,2.569103
9,2.5892,2.562772
10,2.577,2.560693


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./dream_model\\tokenizer_config.json',
 './dream_model\\special_tokens_map.json',
 './dream_model\\tokenizer.json')

In [17]:
# Inference example
def generate_interpretation(dream_text):
    input_ids = tokenizer("interperate dream: " + dream_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids, max_length=100, temperature=0.7, top_k=50, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [19]:
# Example usage
dream = "I dreamed that i am a big dog"
interpretation = generate_interpretation(dream)
print("Dream interpretation:", interpretation)



Dream interpretation: dream: I dreamed that i am a big dog


In [13]:
def generate_interpretation(dream_text):
    # Making the prompt even more specific and guiding the model to provide a comprehensive Freudian analysis
    prompt = (f"Provide a detailed Freudian interpretation of the following dream: '{dream_text}'. "
              "Explain the psychological meaning behind the dream, considering Freudian concepts like "
              "wish fulfillment, repressed desires, or unconscious thoughts. Describe the symbolism of the elements "
              "in the dream and how they relate to the dreamer's psyche.")
    # Generate the interpretation
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(input_ids, max_length=250, temperature=0.8, top_k=50, num_return_sequences=1)
    
    # Decoding the output and cleaning it up
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Clean up the output to focus on just the interpretation
    interpretation = decoded_output.split(":")[-1].strip()  # Ensuring we get the actual interpretation
    
    return interpretation

# Example usage
dream = "I dreamed that i am a big dog"
interpretation = generate_interpretation(dream)
print("Dream interpretation:", interpretation)




Dream interpretation: 'I dreamed that i am a big dog'. Explain the psychological meaning behind the dream, considering Freudian concepts like wish fulfillment, repressed desires, or unconscious thoughts. Explain the symbolism of the elements in the dream and how they relate to the dreamer's psyche.


In [48]:
import pandas as pd

# Load the Excel file containing dreams and interpretations
file_path = 'dreams_and_interpretations_Freud.xlsx'  # Replace with your file path if needed
data = pd.read_excel(file_path)

# Extract the necessary columns
dreams = data['Dream']
original_interpretations = data['Interpretation']

# Apply the custom function to each dream
generated_interpretations = dreams.apply(generate_interpretation)

# Combine the original and generated interpretations
output_df = pd.DataFrame({
    'Original_Interpretation': original_interpretations,
    'Generated_Interpretation': generated_interpretations
})

# Save the output to a new CSV file
output_file_path = 'generated_interpretations.csv'
output_df.to_csv(output_file_path, index=False)

print(f"Generated CSV saved to: {output_file_path}")




Generated CSV saved to: generated_interpretations.csv


In [51]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from bert_score import score as bert_score



ModuleNotFoundError: No module named 'rouge'

In [None]:

# Define the function to calculate perplexity using GPT-2
def calculate_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()
    return 2 ** loss



In [49]:
# Load data
data = pd.read_excel('generated_interpretations.xls')

# Extract columns
original_interpretations = data['Original_Interpretation']
generated_interpretations = data['Generated_Interpretation']

# Initialize models and metrics
rouge = Rouge()
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Compute metrics for each row
metrics_data = []
for original, generated in zip(original_interpretations, generated_interpretations):
    bleu = sentence_bleu([original.split()], generated.split())
    rouge_scores = rouge.get_scores(generated, original)[0]
    perplexity = calculate_perplexity(generated)
    bert_p, bert_r, bert_f = bert_score([generated], [original], lang='en')

    metrics_data.append({
        'Original': original,
        'Generated': generated,
        'BLEU': bleu,
        'ROUGE_L': rouge_scores['rouge-l']['f'],
        'Perplexity': perplexity,
        'BERTScore': bert_f.mean().item()
    })

# Save results to a CSV file
metrics_df = pd.DataFrame(metrics_data)
metrics_df.to_csv('dreams_interpretation_metrics.csv', index=False)

print("Metrics saved to 'dreams_interpretation_metrics.csv'")


KeyboardInterrupt

