In [4]:
import torch
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import precision_score

# 1. Perplexity

def calculate_perplexity(model, tokenizer, text):
    """Calculates the perplexity of a given text using a language model."""
    encodings = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(encodings['input_ids'], labels=encodings['input_ids'])
    loss = outputs.loss
    perplexity = torch.exp(loss)
    return perplexity.item()


# 2. BLEU Score

def calculate_bleu(reference_text, generated_text):
    """Calculates the BLEU score between a reference and generated text."""
    reference_tokens = reference_text.split()  # Simple tokenization
    generated_tokens = generated_text.split()
    # Smoothing function to handle cases with few or no matching n-grams
    smoothing = SmoothingFunction().method4  
    score = sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothing)
    return score


# 3. Context Precision (Simplified Example)

def calculate_context_precision(context, generated_text, relevant_keywords):
    """Calculates context precision based on keyword overlap."""
    context_keywords = set(context.split())  # Simplified keyword extraction
    generated_keywords = set(generated_text.split()) # Simplified keyword extraction

    relevant_keywords_set = set(relevant_keywords)

    # Intersection of generated keywords with relevant keywords within the context
    relevant_generated_keywords = generated_keywords.intersection(relevant_keywords_set).intersection(context_keywords)

    if len(generated_keywords) == 0:  # Avoid division by zero
        return 0.0
    
    precision = len(relevant_generated_keywords) / len(generated_keywords)
    return precision



# Example Usage and Improvement Loop

# Load pre-trained model and tokenizer
model_name = "gpt2"  # Or any other suitable model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Example Data (Replace with your actual data)
context = "The cat sat on the mat."
reference_text = "The cat sat on the mat and purred."
relevant_keywords = ["cat", "mat", "sat"]  # Keywords relevant to the context


# Initial Evaluation
generated_text = "The cat sat on the mat."  # Example generated text
perplexity = calculate_perplexity(model, tokenizer, context + " " + generated_text)
bleu_score = calculate_bleu(reference_text, generated_text)
context_precision = calculate_context_precision(context, generated_text, relevant_keywords)

print(f"Initial Perplexity: {perplexity}")
print(f"Initial BLEU Score: {bleu_score}")
print(f"Initial Context Precision: {context_precision}")



# Improvement Loop (Simplified - In real scenarios, you'd fine-tune)
for i in range(2): # Example iterations
    # 1.  (Simplified) Text Generation Change:  Introduce slight variation
    if i == 0:
        generated_text = "The cat sat on a mat."  # Slight change
    elif i == 1:
      generated_text = "A cat sat on the mat." # Another change

    # 2. Re-evaluate
    perplexity = calculate_perplexity(model, tokenizer, context + " " + generated_text)
    bleu_score = calculate_bleu(reference_text, generated_text)
    context_precision = calculate_context_precision(context, generated_text, relevant_keywords)

    print(f"\nIteration {i+1}:")
    print(f"Generated Text: {generated_text}")
    print(f"Perplexity: {perplexity}")
    print(f"BLEU Score: {bleu_score}")
    print(f"Context Precision: {context_precision}")


    # 3. (In a real scenario) Fine-tuning:  You would use the metrics to guide fine-tuning 
    #    to improve the model's performance.  This is a simplified example.
    #    You might adjust model parameters, training data, etc. based on the metrics.

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Initial Perplexity: 14.812183380126953
Initial BLEU Score: 0.5444460596606694
Initial Context Precision: 0.3333333333333333

Iteration 1:
Generated Text: The cat sat on a mat.
Perplexity: 25.568115234375
BLEU Score: 0.36409302398068727
Context Precision: 0.3333333333333333

Iteration 2:
Generated Text: A cat sat on the mat.
Perplexity: 22.17603302001953
BLEU Score: 0.36409302398068727
Context Precision: 0.3333333333333333
