In [21]:
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import numpy as np
import random
from sklearn.preprocessing import normalize
random.seed(42)
import spacy
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import dspy
from nltk.stem import PorterStemmer
nltk.download('punkt')
from datasets import Dataset
nltk.download('stopwords')
from transformers import pipeline
import ast
import json


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
def create_category_string(category, dictionary):
    string = f"{category}: "
    
    if category in dictionary:
        category_data = dictionary[category]
        if isinstance(category_data, list):
            for item in category_data:
                for k, v in item.items():
                    if k == 'name':  
                        continue
                    if isinstance(v, list):
                        v = ', '.join(v)
                    string += f"{k}: {v} "
        elif isinstance(category_data, dict):
            for k, v in category_data.items():
                if k == 'name': 
                    continue
                if isinstance(v, list):
                    v = ', '.join(v)
                string += f"{k}: {v} "
        else:
            string += f"{category}: {category_data} "
    
    return string



def transform_dict_to_string(data):
    full_string = ""

    for category in data:
        full_string += create_category_string(category, data)
        full_string += " "
    
    return full_string.strip()

In [23]:
from transformers import pipeline
from collections import defaultdict
import re
from tqdm import tqdm

class CharacterAnonymizer:
    def __init__(self, batch_size=16):
        self.ner_pipeline = pipeline(
            "ner",
            model="Jean-Baptiste/roberta-large-ner-english",
            aggregation_strategy="simple",
            device=0 
        )
        self.batch_size = batch_size

    def normalize_name(self, name):
        """Normalize name: lowercase, remove punctuation."""
        name = name.lower()
        name = re.sub(r"[^a-z ]", "", name)
        return name.strip()

    def extract_names_batch(self, texts):
        """Run NER in batch and extract PERSON names per document."""
        ner_results_batch = self.ner_pipeline(texts)
        all_names = []
        for ner_results in ner_results_batch:
            names = [ent['word'] for ent in ner_results if ent['entity_group'] == 'PER']
            normalized = list(set(self.normalize_name(name) for name in names))
            all_names.append(normalized)
        return all_names

    def cluster_names(self, names):
        """Cluster similar names by shared first tokens."""
        clusters = defaultdict(list)
        used = set()

        for name in names:
            if not name.strip():
                continue
            if name in used:
                continue
            parts = name.split()
            if not parts:
                continue
            key = parts[0]
            for other in names:
                if key in other and other not in used:
                    clusters[key].append(other)
                    used.add(other)
        return clusters

    def generate_name_map(self, clusters):
        """Create CharacterN mapping for each name variant."""
        name_map = {}
        for i, (key, variants) in enumerate(clusters.items(), start=1):
            tag = f"Character{i}"
            for name in variants:
                name_map[name] = tag
        return name_map

    def replace_names(self, text, name_map):
        """Replace all name variants in the original text with CharacterN."""
        for original in sorted(name_map.keys(), key=len, reverse=True):
            pattern = re.compile(rf'\b{re.escape(original)}\b', re.IGNORECASE)
            text = pattern.sub(name_map[original], text)
        return text

    def anonymize_batch(self, texts):
        """Anonymize texts in batches with a progress bar."""
        anonymized_texts = []

        for i in tqdm(range(0, len(texts), self.batch_size), desc="Anonymizing"):
            batch = texts[i:i + self.batch_size]
            all_names = self.extract_names_batch(batch)

            for text, names in zip(batch, all_names):
                clusters = self.cluster_names(names)
                name_map = self.generate_name_map(clusters)
                anonymized = self.replace_names(text, name_map)
                anonymized_texts.append(anonymized)

        return anonymized_texts


In [24]:
def preprocess_text(text):
    """Not used for transformers"""
    
    text = text.lower()
    
    text = re.sub(r'[^\w\s]', '', text)
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    
    tokens = [token for token in tokens if token not in stop_words]
    
    ps = PorterStemmer()
    
    tokens = [ps.stem(token) for token in tokens]
    
    return tokens

In [25]:
tell_me_again_df = pd.read_csv("data/tell_me_again.csv")
tell_me_again_df['label'] = tell_me_again_df.groupby('title').ngroup()

In [26]:
summary_lengths = tell_me_again_df['unpacked_summary'].str.len()

# Calculate the 75th and 90th percentiles
lower_bound = summary_lengths.quantile(0.7)
upper_bound = summary_lengths.quantile(0.95)

# Filter the DataFrame
tell_me_again_df = tell_me_again_df[
    (summary_lengths >= lower_bound) & (summary_lengths <= upper_bound)
]


In [27]:
tell_me_again_df['property_count'] = tell_me_again_df['label'].map(tell_me_again_df['label'].value_counts())
tell_me_again_df = tell_me_again_df[tell_me_again_df.property_count >= 3]
tell_me_again_df.shape

(5962, 8)

In [28]:
summary_indices = random.sample(list(set(tell_me_again_df.label)),1100)

In [29]:
tell_me_again_df = tell_me_again_df[tell_me_again_df.label.isin(summary_indices)]

In [11]:
tell_me_again_df = tell_me_again_df.sort_values(by="title").reset_index()
ner_pipeline = pipeline("ner", grouped_entities=True)

dataset = Dataset.from_pandas(tell_me_again_df.sort_values(by="title")[['unpacked_summary']])

#ner_results = ner_pipeline(dataset['unpacked_summary'], batch_size=16)

#tell_me_again_df['unpacked_summary'] = anonymize_with_ner(dataset['unpacked_summary'], ner_results)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.

KeyboardInterrupt



# 2. Zero-shot Ollama + DSPy

In [None]:
#anonymizer = CharacterAnonymizer(batch_size=32)
#texts = anonymizer.anonymize_batch(dataset['unpacked_summary'])


In [15]:
from ollama import chat
from pydantic import BaseModel, Field
from typing import Literal, Optional, List
from pydantic import conlist

class Character(BaseModel):
    name: str # removed in post-processing, helps the model establish seperate identities
    role: Optional[Literal["protagonist", "antagonist","love interest","confidant", "deuteragonist", "tertiary character", "foil"]]
    backstory: Optional[str] = Field(description="A brief account of the character's past experiences, upbringing, or defining events that shape their current identity and actions.")
    weaknesses: Optional[list[str]] = Field(description="Flaws, limitations, or vulnerabilities—emotional, physical, or psychological—that challenge the character or hinder their goals.")
    strengths: Optional[list[str]] = Field(description="Key abilities, virtues, or resources—such as intelligence, bravery, charm, or magical powers—that help the character overcome obstacles.")
    motivations: Optional[list[str]] = Field(description="Core desires or driving forces behind the character's decisions and actions, such as revenge, love, freedom, or self-discovery.")

class Setting(BaseModel):
    time_periods: Optional[list[str]] = Field(description="Historical or futuristic eras in which the story takes place (e.g., Victorian England, post-apocalyptic future, 1990s).")
    locations: Optional[list[str]] = Field(description="Specific geographical or fictional places where the story unfolds, such as cities, countries, regions, or imaginary worlds.")
    cultural_context: Optional[list[str]] = Field(description="The societal, historical, or ideological backdrop influencing the story—such as traditions, belief systems, political climates, or social norms.")

class Plot(BaseModel):
    conflict: Optional[Literal["person vs. person", "person vs. self", "person vs. nature", "person vs.society", "person vs. fate", "person vs. machine", "person vs. the unknown"]]
    basic_plot: Optional[Literal["overcoming the Monster", "rags to Riches", "the Quest", "voyage and Return", "comedy", "tragedy", "rebirth"]]                                                                                                                                                                                                                                                                                
    story_exposition_summary: Optional[str] = Field(description="A summary of the story's beginning, introducing the main characters, setting, and the initial situation or status quo.")
    story_rising_action_summary: Optional[str] = Field(description="A summary of the events that build tension and develop the conflict, leading up to the story's climax.")
    story_climax_summary: Optional[str] = Field(description="A summary of the turning point or most intense moment in the story, where the main conflict reaches its peak.")
    story_falling_action_summary: Optional[str] = Field(description="A summary of the events following the climax, showing the consequences of the turning point and moving toward resolution.")
    story_resolution_summary: Optional[str] = Field(description="A summary of how the story concludes, resolving major conflicts and revealing the final outcomes for the characters.")
    
class Theme(BaseModel):
    themes: Optional[list[str]] = Field(description="Central ideas or underlying topics explored throughout the story—such as identity, power, love, or justice.")
    morals: Optional[list[str]] = Field(description="Lessons or ethical takeaways the story imparts to the audience, often reflecting the consequences of characters' choices or actions.")
    
class Other(BaseModel):
    main_genres: Optional[list[Literal["action", "comedy", "drama", "erotic fiction", "horror", "mystery", "period piece", "romance", "thriller"]]]
    genre_keywords: Optional[list[str]] = Field(description="Descriptive terms that indicate sub-genres, tones, or stylistic elements of the story—such as dystopian, noir, slapstick, gothic, or psychological.")
    
class Elements(BaseModel):
    characters: conlist(Character, min_length=0, max_length=5)
    setting: Setting
    plot: Plot
    theme: Theme
    other: Other
    



In [16]:
prompt_additions = {

    "basic_plot": """- overcoming the Monster: The protagonist sets out to defeat an antagonistic force (often evil) which threatens the protagonist and/or protagonist's homeland
- rags to Riches: The poor protagonist acquires things such as power, wealth, and a mate, before losing it all and gaining it back upon growing as a person.
- the Quest: The protagonist and some companions set out to acquire an important object or to get to a location, facing many obstacles and temptations along the way.
- voyage and Return: The protagonist goes to a strange land and, after overcoming the threats it poses to him or her, returns with nothing but experience.
- comedy: Light and humorous character with a happy or cheerful ending; a dramatic work in which the central motif is the triumph over adverse circumstance, resulting in a successful or happy conclusion. Booker makes sure to stress that comedy is more than humor. It refers to a pattern where the conflict becomes more and more confusing, but is at last made plain in a single clarifying event. Most romances fall into this category.
- tragedy: The protagonist is a hero with one major character flaw or great mistake which is ultimately their undoing. Their unfortunate end evokes pity at their folly and the fall of a fundamentally 'good' character.
- rebirth: During the course of the story, an important event forces the main character to change their ways, often making them a better person.""",

    "role": """- protagonist: The main character of the story is the protagonist. They should be carefully crafted with a logical backstory, personal motivation, and a character arc over the course of the story. Often the story will be told from their point of view.
- antagonist: The villain of the story is the antagonist.
- confidant: This type of character is the best friend or sidekick of the protagonist. Often the protagonist's goal flows through the confidant—although not every story needs one.
- deuteragonist: These characters often overlap with confidants. A deuteragonist is close to the main character, but the story’s main plot does not directly correspond with their own character arc.
- tertiary character: Tertiary characters populate the world of the story but do not necessarily link to the main storyline. These minor characters serve any number of functions and may have varying degrees of personal dynamism.
- foil: A foil character primarily exists to bring the protagonist’s qualities into sharper relief. This is because the foil is effectively the opposite of the protagonist.""",

    "conflict": """- person vs. person: The problem is another character
- person vs. self: The problem lies inside the protagonist
- person vs. nature: The problem comes from non-sapient sources
- person vs. fate: The problem is an undesirable destiny, which may also involve divine will.
- person vs. machine:  As in machinery. Most commonly told from the perspective of a worker being replaced by a machine
- person vs. the unknown: Has also been proposed as the type that codifies Horror, where the enemy is the incomprehensible, otherworldly or extraterrestrial."""
    
}

In [17]:
from pydantic import BaseModel, Field
from typing import get_args, get_origin

def extract_field_descriptions(model, parent_key=''):
    field_descriptions = {}
    for name, field in model.model_fields.items():
        full_key = f"{parent_key}.{name}" if parent_key else name

        # Unwrap annotations for Optional, List, etc.
        annotation = field.annotation
        origin = get_origin(annotation)

        if origin in (list, tuple, dict):
            inner_args = get_args(annotation)
            if inner_args:
                annotation = inner_args[0]  # Drill down one level

        # Check if this is a nested Pydantic model
        if isinstance(annotation, type) and issubclass(annotation, BaseModel):
            field_descriptions.update(extract_field_descriptions(annotation, full_key))
        else:
            description = field.description
            if description:
                field_descriptions[full_key] = description

    return field_descriptions

    
field_descriptions = extract_field_descriptions(Elements)

In [18]:
field_desc_str = "\n".join(f"- `{k}`: {v}" for k, v in field_descriptions.items())

In [19]:
prompt_addition_str = ""
for k,v in prompt_additions.items():
    prompt_addition_str += f"{k} information: {v}\n"

In [84]:
tell_me_again_df = pd.read_excel('data/tell_me_again_df_with_elements_v5.xlsx', engine='openpyxl')

In [87]:
json.loads(tell_me_again_df.extracted_elements.iloc[4])

{'characters': [{'name': 'Detective Jang Hae-jun',
   'role': 'protagonist',
   'backstory': 'Insomniac detective working in Pusan, strained marriage with Jeong-an. Obsessive about his work.',
   'weaknesses': ['Insomnia',
    'Emotional vulnerability',
    'Obsessive tendencies',
    'Easily manipulated'],
   'strengths': ['Intelligence', 'Deductive reasoning', 'Perseverance'],
   'motivations': ['Solving cases', 'Finding truth', 'Seeking connection']},
  {'name': 'Seo-rae',
   'role': 'antagonist',
   'backstory': 'Chinese immigrant with a troubled past, killed her mother at her request, inherited land from Korean grandfather. Uses manipulation to achieve her goals.',
   'weaknesses': ['Deceptive', 'Ruthless', 'Emotionally unstable'],
   'strengths': ['Manipulation', 'Resourcefulness', 'Calculative'],
   'motivations': ['Reclaiming inheritance',
    'Control',
    'Seeking a specific partner']},
  {'name': 'Jeong-an',
   'role': 'deuteragonist',
   'backstory': "Hae-jun's wife, works

In [83]:
json.loads(tell_me_again_df.extracted_elements.iloc[1])

{'characters': [{'name': 'Nolan/Thomas',
   'role': 'protagonist',
   'backstory': "Nolan suffered amnesia after a car accident where his wife died. He is later revealed to be Thomas, whose brain waves were implanted into Nolan's body after Thomas's death.",
   'weaknesses': ['Submissive nature',
    'Violent outbursts',
    'Memory loss',
    'Easily manipulated'],
   'strengths': ['Love for his daughter',
    'Former skills as a photojournalist'],
   'motivations': ['Regaining his memory',
    'Protecting his daughter',
    'Making amends for past actions']},
  {'name': 'Gary',
   'role': 'confidant',
   'backstory': "Nolan's best friend and an orthopedic doctor who supports him throughout his ordeal.",
   'weaknesses': ["Concern for his friend's well-being"],
   'strengths': ['Loyalty', 'Medical expertise'],
   'motivations': ['Helping Nolan regain his life', 'Protecting Ava']},
  {'name': 'Dr. Brooks',
   'role': 'antagonist',
   'backstory': "A doctor who saved Thomas's brain wave

In [18]:
dataset = Dataset.from_pandas(tell_me_again_df[['unpacked_summary']])

In [25]:
lst = []
texts = tell_me_again_df['extracted_elements'].tolist()
for i in range(len(texts)):
    try:
        transform_dict_to_string(json.loads(texts[i]))
    except:
        lst.append(i)

In [26]:
len(lst)

0

In [24]:
from tqdm import tqdm
from ollama import chat

n = 1
output_excel_path = 'data/tell_me_again_df_with_elements_v4.xlsx'


if 'extracted_elements' not in tell_me_again_df.columns:
    tell_me_again_df['extracted_elements'] = None


for i in tqdm(lst, desc="Processing remaining texts", unit="text"):
    text = tell_me_again_df.loc[i, 'unpacked_summary']
    try:
        response = chat(
            messages=[
                {
        "role": "system",
        "content": (
            "You are a narrative analysis system. Your task is to extract major narrative elements from the following story text, "
            "returning the output strictly as a JSON object that conforms to the given schema\n\n"
            "**Field Descriptions:**\n"
            f"{field_desc_str}\n\n"
            "**Rules:**\n"
            "- Output must be valid JSON, compatible with the provided schema.\n"
            "- If any field is not derivable, return `None` or `[]` as appropriate.\n"
            "- Do not invent or speculate beyond the text and given context.\n"
            "- Do not include explanations, markdown, or any text outside the JSON.\n"
            "- Limit total response length to under 3000 characters.\n"
            "- Answer in a concise manner.\n"
            "- Stay structured. Do not exceed requested fields or cardinalities.\n"
            "- Use the following information to help inform extraction: " + prompt_addition_str + "\n"
            
)
                },
                {"role": "user", "content": text}
            ],
            model='gemma3:1b-it-qat',
            format=Elements.model_json_schema(),
            options={'temperature': 0}
        )

        tell_me_again_df.at[i, 'extracted_elements'] = response.message.content

        if ((i + 1) % n == 0) or (i + 1 == len(tell_me_again_df)):
            tell_me_again_df.to_excel(output_excel_path, index=False, engine='openpyxl')

    except Exception as e:
        print(f"Error processing index {i}: {e}")
        continue


Processing remaining texts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.04s/text]


In [None]:
from tqdm import tqdm
import pandas as pd
import json
from ollama import chat
from together import Together

n = 1
output_excel_path = 'data/tell_me_again_df_with_elements_v4.xlsx'
client = Together(api_key = "c1ef86a6ea17871ab9c5996b869b6420f862f504a18150761f12bcdb5478b9cd")
#tell_me_again_df = pd.read_excel(output_excel_path, engine='openpyxl')

if 'extracted_elements' not in tell_me_again_df.columns:
    tell_me_again_df['extracted_elements'] = None

texts_to_process = tell_me_again_df[tell_me_again_df['extracted_elements'].isna()]

for i in tqdm(texts_to_process.index, desc="Processing remaining texts", unit="text"):
    text = tell_me_again_df.loc[i, 'unpacked_summary']
    try:
        response =  client.chat.completions.create(
            messages=[
                {
        "role": "system",
        "content": (
            "You are a narrative analysis system. Your task is to extract major narrative elements from the following story text, "
            "returning the output strictly as a JSON object that conforms to the given schema\n\n"
            "**Field Descriptions:**\n"
            f"{field_desc_str}\n\n"
            "**Rules:**\n"
            "- Output must be valid JSON, compatible with the provided schema.\n"
            "- If any field is not derivable, return `None` or `[]` as appropriate.\n"
            "- Do not invent or speculate beyond the text and given context.\n"
            "- Do not include explanations, markdown, or any text outside the JSON.\n"
            "- Limit total response length to under 3000 characters.\n"
            "- Answer in a concise manner.\n"
            "- Stay structured. Do not exceed requested fields or cardinalities.\n"
            "- Use the following information to help inform extraction: " + prompt_addition_str + "\n"
            
)
                },
                {"role": "user", "content": text}
            ],
            model='deepseek-ai/DeepSeek-V3',
            #format=Elements.model_json_schema(),
            #options={'temperature': 0}
        )

        tell_me_again_df.at[i, 'extracted_elements'] = response.message.content

        if ((i + 1) % n == 0) or (i + 1 == len(tell_me_again_df)):
            tell_me_again_df.to_excel(output_excel_path, index=False, engine='openpyxl')

    except Exception as e:
        print(f"Error processing index {i}: {e}")
        continue


In [None]:
tell_me_again_df

In [None]:
str1 = transform_dict_to_string(json.loads(response1.message.content))
str2 = transform_dict_to_string(json.loads(response2.message.content))

emb1 = embedder(str1)
emb2 = embedder(str2)

In [None]:
cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))

In [22]:
texts[23]

"name1, 80-year-old widow of a colonel, lives in a large villa in Auxerre together with the elderly maid name2, who is constantly harassed by the despotic lady. When name2 dies in a fall from a ladder, the elderly name3 is forced to move to Paris to live with her nephew Jean-Pierre. name3, believed by all to be gentle and kind, soon reveals her evil and domineering character and begins to mistreat the grandchildren for no reason, as she did with name2. Jean-Pierre's wife, the nice beautician name5, tries in every way to please her aunt and make her feel comfortable, but the woman repays her with all kinds of spite. Jean-Pierre's sister, the naive name6, is also constantly mocked by her aunt, who thinks she's an idiot.\nDuring the summer, the name7ards, even to momentarily escape the unbearable presence of the aunt, leave for three weeks by going to a holiday village in Greece and leave the woman in the care of a young caregiver, name8. The girl initially tries to please the old woman b