In [12]:
# Few-shot prompt template for rating a story script.
# It is rendered with str.format(keywords=..., story=...), so literal braces
# in the JSON examples are doubled ("{{" / "}}").
# The worked example answers in the same JSON shape that the instructions ask
# for, so the model sees a consistent output format.
FEW_SHOT_PROMPT = """
You are an expert story reviewer. Rate the following short fictional story on a scale of 1 to 5 based on the criteria below, and provide JSON-formatted feedback.

1. Coherence – Clear beginning, middle, and end.
2. Creativity – Original or interesting concept.
3. Language Fluency – Well-written and grammatically correct.

Do NOT compare this to novels or professional literature. Treat it as a short story by a beginner writer or AI.

Example Response:
{{
  "rating": 4,
  "justification": "Good keyword usage but could improve character development",
  "missing_keywords": ["Grandma"]
}}

Example:
keywords: ["Mathville", "puzzles", "riddles", "equations", "nature"]
script: [Narrator](neutral): In the small town of Mathville, lived two best friends, Algy and Sammy. They loved solving puzzles and riddles together. One sunny day, they found a mysterious note on our favorite park bench! It read, \u201cSolve -1147 = -11*a - 1213 for \u2018a\u2019. What is the answer?
[Sammy](surprised): Who wrote this? And why should we care?
[Algy](calm): No worries, Sammy. I'll take it on. Let's get to work! First, we need to get \u2018a\u2019 alone. To do that, let\u2019s add 1213 to both sides of the equation.
[Sammy](curious): And then what?
[Algy](excited): Next, we simplify both sides to make our work easier! And voila! Both sides become 66.
[Sammy](confused): But wait, won\u2019t negative divided by negative give us positive? Why would we want to use minus here?
[Algy](smiling): Great question, Sammy! Yes, usually, negative divided by negative gives us positive. But when you divide a number by itself \u2013 no matter its sign \u2013 the result will always be 1 or, in this case, -1 since we have a negative number.
[Narrator](calm): After dividing, they discovered that 'a' was equal to -6.
[Sammy](amazed): Whoa! The numbers around us are changing!
[Algy](surprised): What's going on? Trees are turning into formulas, flowers into equations... even birds are chirping out integers!
[Narrator](neutral): As they were about to celebrate their success, this bewildering spectacle made everyone realize how deeply intertwined math was with nature.\n\n[Sammy](reflective): Sometimes, unveiling answers leads us to surprising realizations. Though things didn't go back to normal immediately, we learned an essential lesson: Every discovery brings new wonders worth exploring."
Your JSON analysis:
{{
  "rating": 5,
  "justification": "The story is simple but creative, flows logically, and is grammatically correct.",
  "missing_keywords": []
}}

Now rate this story:
keywords: {keywords}
script: {story}

Your JSON analysis:
"""

In [None]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM

# --- STORY SETUP ---
# Keywords for the sample story; main() substitutes this list into the
# {keywords} placeholder of FEW_SHOT_PROMPT.
keywords = ["Friendly Ghost", "Scared Girl", "Graveyard", "Grandma", "Friends"]
# Sample story to be rated, written as one "[Speaker](emotion): line" entry per
# line. main() strips the surrounding blank lines before putting it in the prompt.
story = """
[Narrator](neutral): Once upon a time, there was a friendly ghost...
[Friendly Ghost](calm): I love playing around the graveyard.
[Narrator](neutral): One day, he met a scared little girl...
[Friendly Ghost](surprised): Ooh, you're so scared!
[Scared Little Girl](neutral): I want to go home.
[Friendly Ghost](friendly): I'll take you with me.
[Narrator](neutral): Together, they explored the graveyard...
[Friendly Ghost](neutral): Let's hide and play some more.
[Narrator](neutral): After a while, the ghost took the girl home...
[Friendly Ghost](calm): I'll stay by her side, just in case.
[Narrator](neutral): From then on, the ghost would visit the graveyard with his friend...
[Friendly Ghost](happy): We'll play together and have fun.
"""

def safe_generate(model, tokenizer, prompt, max_input_len=1024, max_output_len=150):
    """Generate text for *prompt* and return only the model's answer.

    Args:
        model: A loaded transformers generation model (causal or seq2seq).
        tokenizer: The tokenizer matching *model*.
        prompt: The full prompt text.
        max_input_len: Maximum number of prompt tokens kept after truncation.
        max_output_len: Maximum number of newly generated tokens.

    Returns:
        The decoded generation with special tokens removed. For decoder-only
        models the echoed prompt tokens are dropped, so callers parsing the
        result do not pick up JSON examples that were part of the prompt.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_len, padding=True)

    # Keep ids and mask the same length; the slice is a defensive hard cap.
    input_ids = inputs["input_ids"][:, :max_input_len]
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask[:, :max_input_len]

    # NOTE(review): for models with a fixed context window, max_input_len +
    # max_output_len may exceed it — confirm for the models used.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,  # lets generate() tell padding from content
        max_new_tokens=max_output_len,
        pad_token_id=tokenizer.eos_token_id,
    )

    generated = outputs[0]
    config = getattr(model, "config", None)
    if not getattr(config, "is_encoder_decoder", False):
        # Decoder-only models return prompt + continuation; keep the continuation.
        generated = generated[input_ids.shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)

def extract_json_from_output(text):
    """Return the first valid JSON object embedded in *text*.

    Every '{' in the text is tried in order as the start of a JSON object, so
    nested objects are parsed in full and brace groups that are not valid JSON
    are skipped instead of aborting the search.

    Args:
        text: Raw model output that may contain a JSON object among other text.

    Returns:
        The decoded dict, or a neutral fallback result (rating 3) when *text*
        is not a string or contains no parseable JSON object.
    """
    fallback = {"rating": 3, "justification": "⚠️ Could not parse output properly."}
    if not isinstance(text, str):
        return fallback

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the object and ignores trailing text.
            parsed, _ = decoder.raw_decode(text, start)
        except ValueError:
            start = text.find('{', start + 1)
        else:
            return parsed
    return fallback

def evaluate_model(model_name, prompt, keywords, story_text=None):
    """Load *model_name*, rate the story described by *prompt*, and return the result.

    Args:
        model_name: Hugging Face model identifier. Names containing "gpt2" are
            loaded as causal LMs, everything else as seq2seq models.
        prompt: The fully rendered evaluation prompt.
        keywords: Keywords checked against the story text.
        story_text: Story text used for the keyword check. Defaults to the
            module-level ``story`` when omitted, as before.

    Returns:
        The parsed evaluation dict, with "missing_keywords" set from
        ``check_missing_keywords``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_class = AutoModelForCausalLM if "gpt2" in model_name else AutoModelForSeq2SeqLM
    model = model_class.from_pretrained(model_name)

    # Only fill in a pad token when the tokenizer has none, so models that ship
    # their own pad token keep it (same rule as evaluate_with_model).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    output = safe_generate(model, tokenizer, prompt)
    parsed = extract_json_from_output(output)

    # NOTE(review): check_missing_keywords is not defined in the visible part of
    # this file — confirm it is provided elsewhere before calling this function.
    text_to_check = story if story_text is None else story_text
    parsed["missing_keywords"] = check_missing_keywords(keywords, text_to_check)
    return parsed



# --- UTILITIES ---
def print_colored(title, content):
    """Print *content* between a blue ANSI-colored title banner and a blue rule.

    Args:
        title: Text shown inside the "=== ... ===" banner.
        content: Object printed as-is between banner and rule.
    """
    blue, reset = '\033[94m', '\033[0m'
    banner = f"\n{blue}=== {title} ==={reset}"
    rule = f"{blue}==================={reset}\n"
    for chunk in (banner, content, rule):
        print(chunk)

def parse_json_response(text):
    """Return the first valid JSON object found in *text*.

    Each '{' is tried in order as the start of an object. This keeps working
    when the text contains several brace groups (for example an echoed prompt
    followed by the answer), where slicing from the first '{' to the last '}'
    would produce invalid JSON.

    Args:
        text: Raw model output.

    Returns:
        The decoded dict. When nothing parses, the reason is printed and a
        neutral fallback result (rating 3) is returned.
    """
    fallback = {"rating": 3, "justification": "Response format error"}
    if not isinstance(text, str):
        print(f"JSON Parse Error: expected str, got {type(text).__name__}")
        return fallback

    decoder = json.JSONDecoder()
    first_error = None
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode parses one object and ignores whatever follows it.
            parsed, _ = decoder.raw_decode(text, start)
        except ValueError as e:
            if first_error is None:
                first_error = e  # report the earliest failure for diagnostics
            start = text.find('{', start + 1)
        else:
            return parsed

    reason = first_error if first_error is not None else "No JSON found in response"
    print(f"JSON Parse Error: {str(reason)}")
    return fallback

# --- MODEL EVALUATORS ---
def evaluate_with_model(prompt, model_name, model_type, is_seq2seq=False):
    """Load a model, generate an evaluation for *prompt*, and parse it as JSON.

    Args:
        prompt: The fully rendered evaluation prompt.
        model_name: Hugging Face model identifier to load.
        model_type: Short label used in printed diagnostics.
        is_seq2seq: Load the model as seq2seq (True) or causal LM (False).

    Returns:
        The dict parsed from the model's answer, or a neutral fallback result
        (rating 3) when loading, generation, or decoding raises.
    """
    try:
        # Model loading
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Causal tokenizers such as GPT-2's ship without a pad token.
        if not is_seq2seq and tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model_class = AutoModelForSeq2SeqLM if is_seq2seq else AutoModelForCausalLM
        model = model_class.from_pretrained(model_name)

        # Tokenization (seq2seq models get a short task prefix)
        model_input = "Evaluate: " + prompt if is_seq2seq else prompt
        inputs = tokenizer(model_input, return_tensors="pt",
                           max_length=1024, truncation=True, padding=True)

        # Generation
        # NOTE(review): a prompt close to 1024 tokens plus 200 new tokens may
        # exceed a fixed context window — confirm for the models used.
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id  # Use configured pad token
        )

        # Decoding: the raw print still shows everything the model returned.
        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print_colored(f"{model_type.upper()} RAW OUTPUT", decoded_output)

        if is_seq2seq:
            answer = decoded_output
        else:
            # Causal models echo the prompt before the continuation. Parse only
            # the continuation so braces in the prompt cannot corrupt or
            # replace the model's own JSON answer.
            prompt_len = inputs["input_ids"].shape[-1]
            answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        return parse_json_response(answer)

    except Exception as e:
        # Best-effort boundary: one failing model must not abort the whole run.
        print(f"{model_type} Error: {str(e)}")
        return {"rating": 3, "justification": "Model evaluation failed"}

# --- MAIN EXECUTION ---
def main():
    """Rate the sample story with GPT-2, BART and T5 and print the average rating."""
    # Prepare prompt
    prompt = FEW_SHOT_PROMPT.format(keywords=keywords, story=story.strip())
    print_colored("EVALUATION PROMPT", prompt)

    # Run evaluations
    results = {
        "GPT-2": evaluate_with_model(prompt, "gpt2", "gpt-2"),
        "BART": evaluate_with_model(prompt, "facebook/bart-large", "bart", is_seq2seq=True),
        "T5": evaluate_with_model(prompt, "t5-large", "t5", is_seq2seq=True)
    }

    # Collect numeric ratings only: the dicts come from model-generated JSON,
    # so "rating" may be missing or of the wrong type.
    ratings = []
    for name, result in results.items():
        rating = result.get("rating") if isinstance(result, dict) else None
        if isinstance(rating, (int, float)) and not isinstance(rating, bool):
            ratings.append(rating)
        else:
            print(f"{name}: ignoring missing or non-numeric rating {rating!r}")

    # Display results
    print_colored("EVALUATION RESULTS", json.dumps(results, indent=2))
    if ratings:
        average = sum(ratings) / len(ratings)
        print_colored("FINAL SCORE", f"Average Rating: {average:.2f}/5.00")
    else:
        # Avoid dividing by zero when no model produced a usable rating.
        print_colored("FINAL SCORE", "Average Rating: n/a (no numeric ratings)")

# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


🎯 Individual Ratings:
🤖 GPT-2: {'rating': 4, 'justification': 'Effective use of most keywords, but character development could be stronger', 'missing_keywords': ['Grandma']}
🧠 BART: {'rating': 4, 'justification': 'Effective use of most keywords, but character development could be stronger', 'missing_keywords': ['Grandma']}
🔧 T5: {'rating': 3, 'justification': '⚠️ Could not parse output properly.'}

✅ Average Rating: 3.67
