In [5]:
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import numpy as np
import random
from sklearn.preprocessing import normalize
random.seed(42)
import spacy
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import dspy
from nltk.stem import PorterStemmer
nltk.download('punkt')
from datasets import Dataset
nltk.download('stopwords')
from transformers import pipeline
import ast
import json


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def create_category_string(category, dictionary):
    string = f"{category}: "
    
    if category in dictionary:
        category_data = dictionary[category]
        if isinstance(category_data, list):
            for item in category_data:
                for k, v in item.items():
                    if k == 'name':  
                        continue
                    if isinstance(v, list):
                        v = ', '.join(v)
                    string += f"{k}: {v} "
        elif isinstance(category_data, dict):
            for k, v in category_data.items():
                if k == 'name': 
                    continue
                if isinstance(v, list):
                    v = ', '.join(v)
                string += f"{k}: {v} "
        else:
            string += f"{category}: {category_data} "
    
    return string



def transform_dict_to_string(data):
    full_string = ""

    for category in data:
        full_string += create_category_string(category, data)
        full_string += " "
    
    return full_string.strip()




In [3]:
from transformers import pipeline
from collections import defaultdict
import re

class CharacterAnonymizer:
    def __init__(self):
        self.ner_pipeline = pipeline("ner", model="Jean-Baptiste/roberta-large-ner-english", aggregation_strategy="simple")
        self.name_map = {}
        self.clusters = {}

    def normalize_name(self, name):
        """Normalize name: lowercase, remove punctuation."""
        name = name.lower()
        name = re.sub(r"[^a-z ]", "", name) 
        return name.strip()

    def extract_names(self, text):
        """Run NER and extract PERSON names."""
        ner_results = self.ner_pipeline(text)
        names = [ent['word'] for ent in ner_results if ent['entity_group'] == 'PER']
        return list(set(self.normalize_name(name) for name in names))
        
    def cluster_names(self, names):
        """Cluster similar names by shared first tokens."""
        clusters = defaultdict(list)
        used = set()
    
        for name in names:
            if not name.strip():  
                continue
            if name in used:
                continue
            parts = name.split()
            if not parts:
                continue
            key = parts[0]
            for other in names:
                if key in other and other not in used:
                    clusters[key].append(other)
                    used.add(other)
        return clusters

    def generate_name_map(self, clusters):
        """Create CharacterN mapping for each name variant."""
        name_map = {}
        for i, (key, variants) in enumerate(clusters.items(), start=1):
            tag = f"Character{i}"
            for name in variants:
                name_map[name] = tag
        return name_map

    def replace_names(self, text, name_map):
        """Replace all name variants in the original text with CharacterN."""
        for original in sorted(name_map.keys(), key=len, reverse=True):
            pattern = re.compile(rf'\b{re.escape(original)}\b', re.IGNORECASE)
            text = pattern.sub(name_map[original], text)
        return text

    def anonymize(self, text):
        names = self.extract_names(text)
        self.clusters = self.cluster_names(names)
        self.name_map = self.generate_name_map(self.clusters)
        return self.replace_names(text, self.name_map)

In [4]:
def preprocess_text(text):
    """Not used for transformers"""
    
    text = text.lower()
    
    text = re.sub(r'[^\w\s]', '', text)
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    
    tokens = [token for token in tokens if token not in stop_words]
    
    ps = PorterStemmer()
    
    tokens = [ps.stem(token) for token in tokens]
    
    return tokens

In [5]:
tell_me_again_df = pd.read_csv("data/tell_me_again.csv")
tell_me_again_df['label'] = tell_me_again_df.groupby('title').ngroup()


In [6]:
tell_me_again_df = tell_me_again_df[tell_me_again_df['unpacked_summary'].str.len() >= tell_me_again_df['unpacked_summary'].str.len().describe()["75%"]]

In [7]:
tell_me_again_df['property_count'] = tell_me_again_df['label'].map(tell_me_again_df['label'].value_counts())

In [8]:
tell_me_again_df = tell_me_again_df[tell_me_again_df.property_count >= 3]

In [9]:
tell_me_again_df.shape

(8019, 8)

In [10]:
summary_indices = random.sample(list(set(tell_me_again_df.label)),1000)
tell_me_again_df = tell_me_again_df[tell_me_again_df.label.isin(summary_indices)]

In [11]:
ner_pipeline = pipeline("ner", model = "Jean-Baptiste/roberta-large-ner-english",grouped_entities=True)

dataset = Dataset.from_pandas(tell_me_again_df[['unpacked_summary']])

Device set to use cuda:0


In [10]:
tell_me_again_df.unpacked_summary.iloc[1231]

'Leia, a 21-year-old woman, is found by the police after seventeen years of captivity in a cellar. His kidnapper, Benjamin, is immediately thrown into prison. Leia finds her parents, her home, but she has no memory of them or where she lived. Despite the attention of her relatives and the psychological help given to her, Leia is unable to regain her marks. She secretly cuts out all the press clippings about Ben, who has been her only companion for so many years. Paradoxically, she still has feelings for him and fails to bond with her parents.'

In [11]:
labels = tell_me_again_df.label.tolist()
texts = tell_me_again_df.unpacked_summary.tolist()

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
device

device(type='cuda')

In [14]:
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)
embedder = dspy.Embedder(model.encode)

In [15]:
embeddings = embedder(texts)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi

def build_index(embeddings, normalize_vectors=True):
    if normalize_vectors:
        return normalize(embeddings, axis=1)
    return embeddings

def get_dense_similarity(query_vector, candidate_vectors):
    return np.dot(candidate_vectors, query_vector.T).flatten()

def get_bm25_scores(corpus, query, bm25=None):
    if bm25 is None:
        bm25 = BM25Okapi([doc for doc in corpus])
    return bm25.get_scores(query)

def get_relevance_scores(i, labels, data, index, k=None, method='dense', bm25=None):
    if k is None:
        k = len(labels)

    query_label = labels[i]
    matching_labels = np.where(np.array(labels) == query_label)[0]

    if method == 'dense':
        query_vector = index[i].reshape(1, -1)
        similarities = get_dense_similarity(query_vector, index)
    elif method == 'bm25':
        query_text = data[i]
        similarities = get_bm25_scores(data, query_text, bm25)
    else:
        raise ValueError("Unsupported method. Use 'dense' or 'bm25'.")

    top_k_indices = np.argsort(similarities)[::-1]
    top_k_indices = top_k_indices[top_k_indices != i][:k]

    relevance_scores = [1 if x in matching_labels else 0 for x in top_k_indices]
    ideal_relevance_scores = sorted(relevance_scores, reverse=True)

    return ideal_relevance_scores, relevance_scores

def calculate_mean_ndcg_score(labels, data, embeddings=None, method='dense'):
    if method == 'dense':
        embeddings = np.array(embeddings).astype('float32')
        index = build_index(embeddings)
        get_index = lambda: index
        bm25 = None
    elif method == 'bm25':
        index = data
        get_index = lambda: index
        bm25 = BM25Okapi([doc for doc in data])
    else:
        raise ValueError("Method must be 'dense' or 'bm25'.")

    ideal_relevance_scores = []
    relevance_scores = []

    for i in [labels.index(x) for x in list(set(labels))]:
        irs, rs = get_relevance_scores(i, labels, data, get_index(), method=method, bm25=bm25)
        ideal_relevance_scores.append(irs)
        relevance_scores.append(rs)

    return ndcg_score(np.array(ideal_relevance_scores), np.array(relevance_scores))


In [16]:
%%time
ndcg_dense = calculate_mean_ndcg_score(labels, data=texts, embeddings=embeddings, method='dense')
ndcg_dense

CPU times: total: 8.75 s
Wall time: 8.62 s


np.float64(0.5915919671919618)

In [17]:
%%time
ndcg_bm25 = calculate_mean_ndcg_score(labels, data=[preprocess_text(text) for text in texts], method='bm25')
ndcg_bm25


KeyboardInterrupt



# 2. Zero-shot Ollama + DSPy

In [12]:
anonymizer = CharacterAnonymizer()
texts = [anonymizer.anonymize(x) for x in dataset['unpacked_summary'][:50]]

Device set to use cuda:0


In [14]:
from ollama import chat
from pydantic import BaseModel, Field
from typing import Literal, Optional, List

class Character(BaseModel):
    name: str # removed in post-processing, helps the model establish seperate identities
    role: Optional[Literal["protagonist", "antagonist","love interest","confidant", "deuteragonist", "tertiary character", "foil"]]
    backstory: Optional[str]
    weaknesses: Optional[list[str]] 
    strengths: Optional[list[str]] 
    motivations: Optional[list[str]]

class Setting(BaseModel):
    time_periods: Optional[list[str]]
    locations: Optional[list[str]]
    cultural_context: Optional[list[str]]

class Plot(BaseModel):
    conflict: Optional[Literal["person vs. person", "person vs. self", "person vs. nature", "person vs.society", "person vs. fate", "person vs. machine", "person vs. the unknown"]]
    basic_plot: Optional[Literal["overcoming the Monster", "rags to Riches", "the Quest", "voyage and Return", "comedy", "tragedy", "rebirth"]]                                                                                                                                                                                                                                                                                
    story_exposition_summary: Optional[str]
    story_rising_action_summary: Optional[str]
    story_climax_summary: Optional[str]
    story_falling_action_summary: Optional[str]
    story_resolution_summary: Optional[str]
    
class Theme(BaseModel):
    themes: Optional[list[str]]
    morals: Optional[list[str]]
    
class Other(BaseModel):
    main_genres: Optional[list[Literal["action", "comedy", "drama", "erotic fiction", "horror", "mystery", "period piece", "romance", "thriller"]]]
    genre_keywords: Optional[list[str]]
    
class Elements(BaseModel):
    setting: Setting
    plot: Plot
    theme: Theme
    other: Other
    characters: list[Character]



In [15]:
prompt_additions = {
    "basic_plot": """- overcoming the Monster: The protagonist sets out to defeat an antagonistic force (often evil) which threatens the protagonist and/or protagonist's homeland
- rags to Riches: The poor protagonist acquires things such as power, wealth, and a mate, before losing it all and gaining it back upon growing as a person.
- the Quest: The protagonist and some companions set out to acquire an important object or to get to a location, facing many obstacles and temptations along the way.
- voyage and Return: The protagonist goes to a strange land and, after overcoming the threats it poses to him or her, returns with nothing but experience.
- comedy: Light and humorous character with a happy or cheerful ending; a dramatic work in which the central motif is the triumph over adverse circumstance, resulting in a successful or happy conclusion. Booker makes sure to stress that comedy is more than humor. It refers to a pattern where the conflict becomes more and more confusing, but is at last made plain in a single clarifying event. Most romances fall into this category.
- tragedy: The protagonist is a hero with one major character flaw or great mistake which is ultimately their undoing. Their unfortunate end evokes pity at their folly and the fall of a fundamentally 'good' character.
- rebirth: During the course of the story, an important event forces the main character to change their ways, often making them a better person.""",

    "role": """- protagonist: The main character of the story is the protagonist. They should be carefully crafted with a logical backstory, personal motivation, and a character arc over the course of the story. Often the story will be told from their point of view.
- antagonist: The villain of the story is the antagonist.
- confidant: This type of character is the best friend or sidekick of the protagonist. Often the protagonist's goal flows through the confidant—although not every story needs one.
- deuteragonist: These characters often overlap with confidants. A deuteragonist is close to the main character, but the story’s main plot does not directly correspond with their own character arc.
- tertiary character: Tertiary characters populate the world of the story but do not necessarily link to the main storyline. These minor characters serve any number of functions and may have varying degrees of personal dynamism.
- foil: A foil character primarily exists to bring the protagonist’s qualities into sharper relief. This is because the foil is effectively the opposite of the protagonist.""",

    "conflict": """- person vs. person: The problem is another character
- person vs. self: The problem lies inside the protagonist
- person vs. nature: The problem comes from non-sapient sources
- person vs. fate: The problem is an undesirable destiny, which may also involve divine will.
- person vs. machine:  As in machinery. Most commonly told from the perspective of a worker being replaced by a machine
- person vs. the unknown: Has also been proposed as the type that codifies Horror, where the enemy is the incomprehensible, otherworldly or extraterrestrial."""
}

In [16]:
prompt_addition_str = ""
for k,v in prompt_additions.items():
    prompt_addition_str += f"{k} information: {v}\n"

In [None]:
%%time
response = chat(
    messages=[
    {"role": "system", "content": f"""Return as a JSON object. Individual elements should return: None or [], if they are not derivable from the text. You can make use of information from"""},
    {"role": "user", "content": texts[1]}
    ],
    model='gemma3:1b-it-qat',
    format=Elements.model_json_schema(),
    options = {'temperature':0}
)
#transform_dict_to_string(json.loads(response.message.content))
elements = Elements.model_validate_json(response.message.content)
print(elements)

In [None]:
from tqdm import tqdm 

extracted_elements = []
for text in tqdm(texts[:10], desc="Processing texts", unit="text"):
    response = chat(
        messages=[
            {"role": "system", "content": f"Return as a JSON object. Individual elements should return: None or [], if they are not derivable from the text. You can make use of information from: {prompt_addition_str}"},
            {"role": "user", "content": text}
        ],
        model='gemma3:1b-it-qat',
        format=Elements.model_json_schema(),
        options = {'temperature': 0}
    )


    extracted_elements.append(transform_dict_to_string(json.loads(response.message.content)))

In [16]:
str1 = transform_dict_to_string(json.loads(response1.message.content))
str2 = transform_dict_to_string(json.loads(response2.message.content))

emb1 = embedder(str1)
emb2 = embedder(str2)

NameError: name 'response1' is not defined

In [None]:
cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))

In [22]:
texts[23]

"name1, 80-year-old widow of a colonel, lives in a large villa in Auxerre together with the elderly maid name2, who is constantly harassed by the despotic lady. When name2 dies in a fall from a ladder, the elderly name3 is forced to move to Paris to live with her nephew Jean-Pierre. name3, believed by all to be gentle and kind, soon reveals her evil and domineering character and begins to mistreat the grandchildren for no reason, as she did with name2. Jean-Pierre's wife, the nice beautician name5, tries in every way to please her aunt and make her feel comfortable, but the woman repays her with all kinds of spite. Jean-Pierre's sister, the naive name6, is also constantly mocked by her aunt, who thinks she's an idiot.\nDuring the summer, the name7ards, even to momentarily escape the unbearable presence of the aunt, leave for three weeks by going to a holiday village in Greece and leave the woman in the care of a young caregiver, name8. The girl initially tries to please the old woman b