In [2]:
import pandas as pd
import torch
from transformers import BertForMaskedLM, BertTokenizer


ModuleNotFoundError: No module named 'metrics_calc'

In [3]:
# Select the compute device: CUDA when a GPU is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Directory of the fine-tuned checkpoint; the model and the tokenizer are both loaded from it.
model_path = "./bert_large_finetuned_dreams"
model = BertForMaskedLM.from_pretrained(model_path)
model = model.to(device)  # place the weights on the selected device
tokenizer = BertTokenizer.from_pretrained(model_path)


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Using device: cpu


In [4]:
# Read validation dataset (Excel file)
input_excel = r"C:\קבצים אחרונים לגיבוי\NLP-Final-Project---Dreams-Interpreter\DREAMS DATA\dreams_and_interpretations_Freud.xlsx"  # Replace with your actual Excel file name
df = pd.read_excel(input_excel)

# Fail fast when the 'Dream' column is missing. The input is an Excel workbook,
# so the message names the right file type and lists the columns actually found
# to make a misnamed header easy to spot.
if "Dream" not in df.columns:
    raise ValueError(
        f"Excel file must contain a 'Dream' column; found columns: {list(df.columns)}"
    )


In [8]:
from metrics_calculation import metrics_calc
import torch

# Function to generate interpretations using BERT masked language model
def generate_interpretation(dream):
    """Fill the interpretation slot of a dream prompt with the model's top token.

    Builds the prompt ``"Dream: {dream} Interpretation: <mask>."``, runs the
    module-level ``model`` on it and substitutes the highest-scoring vocabulary
    entry for the mask that belongs to the template.

    Args:
        dream: Value interpolated into the prompt with an f-string, so any
            object is stringified as-is.

    Returns:
        str: The prompt with the template's mask replaced by the predicted
        token, or ``"No masked token found."`` when the encoded prompt holds
        no mask id.

    Notes:
        * The token comes straight from ``convert_ids_to_tokens``, i.e. it is a
          raw vocabulary entry.
        * NOTE(review): no truncation is applied when encoding; prompts longer
          than the model's maximum sequence length are presumably rejected by
          the model - confirm and decide on a truncation policy.
    """
    mask_token = tokenizer.mask_token
    input_text = f"Dream: {dream} Interpretation: {mask_token}."

    # Tokenize and move input tensors to the same device as the model.
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Locate mask positions before running the model so that a prompt without
    # any mask does not pay for a forward pass.
    mask_positions = (input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    if mask_positions.numel() == 0:
        return "No masked token found."

    with torch.no_grad():
        logits = model(input_ids).logits

    # The template puts its mask after the dream text, so it is always the LAST
    # mask in the sequence. A dream that happens to contain the mask token must
    # neither steal the prediction nor be rewritten.
    target_position = mask_positions[-1]
    predicted_token_id = torch.argmax(logits[0, target_position, :], dim=-1).item()
    predicted_token = tokenizer.convert_ids_to_tokens(predicted_token_id)

    # Replace only the final occurrence of the mask (the template's slot).
    head, _, tail = input_text.rpartition(mask_token)
    return f"{head}{predicted_token}{tail}"

# Output locations: the generated interpretations and the metrics computed from them.
output_csv = r"./bert_large_generated_interpretations.csv"
output_metrics_csv = r"./bert_large_metrics_results.csv"

# Run the generator once per dream and keep the result in a new column.
df["Generated_Interpretation"] = df["Dream"].apply(generate_interpretation)

# Persist the augmented table without the index column.
df.to_csv(path_or_buf=output_csv, index=False)

# Hand the saved file to the project's metrics helper together with the metrics
# destination (presumably it reads the first path and writes the second - see metrics_calc).
metrics_calc.evaluate_text_metrics(tokenizer, model, output_csv, output_metrics_csv)

print(f"Evaluation metrics saved to {output_metrics_csv}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Metrics saved to './bert_large_metrics_results.csv'
Evaluation metrics saved to ./bert_large_metrics_results.csv


In [10]:
import metrics_calc
import os

# Destination folder for this run's evaluation metrics; created on demand.
output_dir = r"C:\קבצים אחרונים לגיבוי\NLP-Final-Project---Dreams-Interpreter\Barak model\Bert-Model\output_bert"
os.makedirs(output_dir, exist_ok=True)

# Metrics file inside that folder.
output_metrics_csv = os.path.join(output_dir, "bert_metrics_results.csv")

# Evaluate the interpretations CSV referenced by `output_csv` and store the metrics in the folder above.
metrics_calc.evaluate_text_metrics(tokenizer, model, output_csv, output_metrics_csv)

print(f"Evaluation metrics saved to: {output_metrics_csv}")



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Metrics saved to 'C:\קבצים אחרונים לגיבוי\NLP-Final-Project---Dreams-Interpreter\Barak model\Bert-Model\output_bert\bert_metrics_results.csv'
Evaluation metrics saved to: C:\קבצים אחרונים לגיבוי\NLP-Final-Project---Dreams-Interpreter\Barak model\Bert-Model\output_bert\bert_metrics_results.csv


In [10]:
# Function to perform masked word prediction
def predict_masked_text(text):
    """Replace every mask token in ``text`` with the model's top prediction.

    The text is tokenized with the module-level ``tokenizer``, scored with the
    module-level ``model``, and each mask is filled, left to right, with the
    highest-scoring vocabulary entry for its own position.

    Args:
        text: Input string containing one or more occurrences of
            ``tokenizer.mask_token``.

    Returns:
        str: ``text`` with the masks filled in, or
        ``"No masked token found in input."`` when the encoded text holds no
        mask id.
    """
    mask_token = tokenizer.mask_token

    # The inputs must live on the same device as the model weights; otherwise
    # the forward pass fails with a device mismatch when the model is on a GPU.
    tokens = tokenizer(text, return_tensors="pt").to(model.device)
    mask_positions = torch.where(tokens.input_ids == tokenizer.mask_token_id)[1]

    if len(mask_positions) == 0:
        return "No masked token found in input."

    with torch.no_grad():
        logits = model(**tokens).logits

    # One prediction per mask position, in sequence order.
    predicted_ids = torch.argmax(logits[0, mask_positions, :], dim=-1).tolist()

    # Fill the masks one at a time so each slot receives its own prediction
    # instead of every slot getting the first one.
    filled_text = text
    for token_id in predicted_ids:
        predicted_token = tokenizer.convert_ids_to_tokens(token_id)
        filled_text = filled_text.replace(mask_token, predicted_token, 1)

    return filled_text

# Smoke-test the masked-word predictor on one hand-written prompt.
example_text = "Dream: water Interpretation: [MASK] calm emotions."
predicted_sentence = predict_masked_text(text=example_text)
print(f"Predicted Sentence: {predicted_sentence}")


Predicted Sentence: Dream: water Interpretation: to calm emotions.
