In [1]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset,DatasetDict
from sklearn.model_selection import train_test_split



print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")
print("Number of GPUs:", torch.cuda.device_count())


  from .autonotebook import tqdm as notebook_tqdm


CUDA Available: True
CUDA Version: 11.8
GPU Name: NVIDIA GeForce MX450
Number of GPUs: 1


In [2]:

def load_data(file_path):
    df = pd.read_csv(file_path)
    df["text"] = "Dream: " + df["Dream Symbol"] + "\nInterpretation: " + df["Interpretation"]
    
    # Print the first 5 rows
    print(df.head())  # This will display the first 5 rows of the DataFrame
    
    return Dataset.from_pandas(df[["text"]])

# Load dataset
dataset_path = r"C:\\Users\\97254\\Desktop\\niv\\Github projects\\NLP-Final-Project---Dreams-Interpreter\\DREAMS DATA\\dreams_interpretations.csv"
df = load_data(dataset_path)


# Split into training and validation sets (80% train, 20% validation)
train_texts, val_texts = train_test_split(df["text"], test_size=0.2, random_state=42)

# Convert back to DataFrame format
train_df = pd.DataFrame(train_texts, columns=["text"])
val_df = pd.DataFrame(val_texts, columns=["text"])

# Convert to Hugging Face Dataset format
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df)
})


  Dream Symbol                                     Interpretation  \
0     Aardvark  To see an aardvark in your dream indicates tha...   
1  Abandonment  To dream that you are abandoned suggests that ...   
2    Abduction  To dream of being abducted indicates that you ...   
3    Aborigine  To see an Aborigine in your dream represents b...   
4     Abortion  To dream that you have an abortion suggests th...   

                                                text  
0  Dream: Aardvark\nInterpretation: To see an aar...  
1  Dream: Abandonment\nInterpretation: To dream t...  
2  Dream: Abduction\nInterpretation: To dream of ...  
3  Dream: Aborigine\nInterpretation: To see an Ab...  
4  Dream: Abortion\nInterpretation: To dream that...  


In [3]:

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token, use EOS instead

model = GPT2LMHeadModel.from_pretrained("gpt2")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./dream_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)   
trainer.train()


# Save model
model_save_checkpoints = "./dream_model_gpt2_split"
model.save_pretrained(model_save_checkpoints)
tokenizer.save_pretrained(model_save_checkpoints)

print(f"Training complete! Model saved to {model_save_checkpoints}")


Map: 100%|██████████| 721/721 [00:00<00:00, 754.52 examples/s]
Map: 100%|██████████| 181/181 [00:00<00:00, 480.99 examples/s]
  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the trained model and tokenizer
model_path = model_save_checkpoints
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set model to evaluation mode
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [2]:
def generate_interpretation(dream_symbol, max_length=50):
    # Format input as it was trained
    input_text = f"Dream: {dream_symbol}\nInterpretation:"
    
    # Tokenize input
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate interpretation using the model
    output = model.generate(
        input_ids, 
        max_length=max_length, 
        num_return_sequences=1,  # Generate one interpretation
        temperature=0.7,  # Controls randomness (lower = more deterministic)
        top_k=50,  # Limits to top 50 tokens to reduce randomness
        top_p=0.95,  # Nucleus sampling (higher = more random)
        do_sample=True  # Enable sampling for diverse outputs
    )

    # Decode and return the generated text
    interpretation = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return interpretation


In [3]:
dream_examples = [
    "Flying",
    "Snake",
    "Lost in a city",
    "Being chased",
    "Seeing a black cat"
]

for dream in dream_examples:
    interpretation = generate_interpretation(dream)
    print(f"Dream: {dream}\n{interpretation}\n{'-'*50}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Dream: Flying
Dream: Flying
Interpretation: To dream that you are flying suggests that you are experiencing some sort of turbulence or stress in your life. The dream may also be a metaphor for something that is about to happen to you or someone you care about
--------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Dream: Snake
Dream: Snake
Interpretation: To see a snake in your dream represents a nervous breakdown or problem.  Alternatively, the snake symbolizes innocence, purity, and purity. The dream may also be a pun on the "moody snake
--------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Dream: Lost in a city
Dream: Lost in a city
Interpretation: To see or dream that you are lost in a city indicates that you are not taking responsibility for your actions or how you are going about your daily life. You are trying to escape from reality.
--------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Dream: Being chased
Dream: Being chased
Interpretation: To dream that you are chased indicates that you are afraid to go through with something. You are afraid to let go of the situation.  It may also mean that you are afraid to let go of things
--------------------------------------------------
Dream: Seeing a black cat
Dream: Seeing a black cat
Interpretation: To see a black cat in your dream symbolizes the blackness of the human being and the darkness in your soul. It may also signify some sort of dark inner turmoil. Alternatively, the dream
--------------------------------------------------
