In [1]:
import spacy
import torch
import dspy
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import numpy as np
import random
import faiss
from sklearn.preprocessing import normalize
random.seed(42)
import spacy
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def preprocess_text(text):
    text = text.lower()
    
    text = re.sub(r'[^\w\s]', '', text)
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    
    tokens = [token for token in tokens if token not in stop_words]
    
    ps = PorterStemmer()
    
    tokens = [ps.stem(token) for token in tokens]
    
    return tokens

In [3]:
nlp = spacy.load('en_core_web_sm')

def replace_names_with_placeholder(text):
    doc = nlp(text)
    processed_text = []

    for token in doc:
        if token.ent_type_ == "PERSON": 
            processed_text.append("name")  
        elif token.ent_type_ == "NORP":
            processed_text.append("group") 
        elif token.ent_type_ == "ORG":
            processed_text.append("organisation") 
        else:
            processed_text.append(token.text)

    return ' '.join(processed_text)


In [4]:
tell_me_again_df = pd.read_csv("data/tell_me_again.csv")
tell_me_again_df['label'] = tell_me_again_df.groupby('title').ngroup()
tell_me_again_df['property_count'] = tell_me_again_df['label'].map(tell_me_again_df['label'].value_counts())

In [5]:
# only select summaries if there are 2 or more summaries of the same movie
tell_me_again_df = tell_me_again_df[tell_me_again_df.property_count >= 2]

In [6]:
summary_indices = random.sample(list(set(tell_me_again_df.label)),1000)

In [7]:
tell_me_again_df = tell_me_again_df[tell_me_again_df.label.isin(summary_indices)]
#tell_me_again_df['processed_summary'] = tell_me_again_df['unpacked_summary'].apply(replace_names_with_placeholder)
tell_me_again_df

Unnamed: 0.1,Unnamed: 0,wikidata_id,title,language,unpacked_summary,unpacked_summary_sents,label,property_count
48,48,Q100265988,Black Box,de,"In a car accident, young mother Rachel is kill...","['In a car accident, young mother Rachel is ki...",2800,5
49,49,Q100265988,Black Box,it,Nolan is a man who suffers from severe amnesia...,['Nolan is a man who suffers from severe amnes...,2800,5
61,61,Q100349419,Home,de,Marvin Hacks is riding his skateboard on the h...,['Marvin Hacks is riding his skateboard on the...,9026,9
62,62,Q100349419,Home,fr,Marvin Hacks (Jake McLaughlin) returns to his ...,['Marvin Hacks (Jake McLaughlin) returns to hi...,9026,9
77,77,Q1004440,Kill the Irishman,de,Danny Greene grows up as an Irish orphan in Cl...,['Danny Greene grows up as an Irish orphan in ...,10665,4
...,...,...,...,...,...,...,...,...
83126,83126,Q992813,"Ride, Vaquero!",de,"After the Civil War, American farmers settled ...","['After the Civil War, American farmers settle...",16105,3
83127,83127,Q992813,"Ride, Vaquero!",fr,"In Texas, just after the Civil War, a bandit n...","['In Texas, just after the Civil War, a bandit...",16105,3
83128,83128,Q992813,"Ride, Vaquero!",it,Mexican bandit José Esqueda and his gang conti...,['Mexican bandit José Esqueda and his gang con...,16105,3
83228,83229,Q998048,Trimurti,de,Satyadevi is an honest policewoman and mother ...,['Satyadevi is an honest policewoman and mothe...,25450,2


In [8]:
tell_me_again_df.unpacked_summary.iloc[1]

'Nolan is a man who suffers from severe amnesia due to a terrible car accident in which his wife lost her life. Despite severe memory problems, he continues to live with his baby daughter Ava, aided only by his best friend Gary, an orthopedic doctor. The man finds himself completely managed by his child and loses his job as a photojournalist because his shots are no longer up to his old works. Nolan also develops some characteristics that he never had in the past: he forgets to pick up his daughter from school and has small bouts of violence in which he loses control. He also begins to resent some of the rituals of his family routine that are dear to his daughter. In an attempt to regain his memory, Nolan, after several unsuccessful attempts with traditional medicine, begins experimental therapy with Dr. Brooks, who has developed a special hypnosis strategy that allows him to relive his most important memories.\nNolan\'s visions, however, are different from what he expected: the man se

In [9]:
labels = tell_me_again_df.label.tolist()
texts = tell_me_again_df.unpacked_summary.tolist()

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
embedder = dspy.Embedder(model.encode)

In [12]:
embeddings = embedder(texts)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi

def build_index(embeddings, normalize_vectors=True):
    if normalize_vectors:
        return normalize(embeddings, axis=1)
    return embeddings

def get_dense_similarity(query_vector, candidate_vectors):
    return np.dot(candidate_vectors, query_vector.T).flatten()

def get_bm25_scores(corpus, query, bm25=None):
    if bm25 is None:
        bm25 = BM25Okapi([doc for doc in corpus])
    return bm25.get_scores(query)

def get_relevance_scores(i, labels, data, index, k=None, method='dense', bm25=None):
    if k is None:
        k = len(labels)

    query_label = labels[i]
    matching_labels = np.where(np.array(labels) == query_label)[0]

    if method == 'dense':
        query_vector = index[i].reshape(1, -1)
        similarities = get_dense_similarity(query_vector, index)
    elif method == 'bm25':
        query_text = data[i]
        similarities = get_bm25_scores(data, query_text, bm25)
    else:
        raise ValueError("Unsupported method. Use 'dense' or 'bm25'.")

    top_k_indices = np.argsort(similarities)[::-1]
    top_k_indices = top_k_indices[top_k_indices != i][:k]

    relevance_scores = [1 if x in matching_labels else 0 for x in top_k_indices]
    ideal_relevance_scores = sorted(relevance_scores, reverse=True)

    return ideal_relevance_scores, relevance_scores

def calculate_mean_ndcg_score(labels, data, embeddings=None, method='dense'):
    if method == 'dense':
        embeddings = np.array(embeddings).astype('float32')
        index = build_index(embeddings)
        get_index = lambda: index
        bm25 = None
    elif method == 'bm25':
        index = data
        get_index = lambda: index
        bm25 = BM25Okapi([doc for doc in data])
    else:
        raise ValueError("Method must be 'dense' or 'bm25'.")

    ideal_relevance_scores = []
    relevance_scores = []

    for i in [labels.index(x) for x in list(set(labels))]:
        irs, rs = get_relevance_scores(i, labels, data, get_index(), method=method, bm25=bm25)
        ideal_relevance_scores.append(irs)
        relevance_scores.append(rs)

    return ndcg_score(np.array(ideal_relevance_scores), np.array(relevance_scores))


In [14]:
ndcg_dense = calculate_mean_ndcg_score(labels, data=texts, embeddings=embeddings, method='dense')
ndcg_dense

0.8595025489669603

In [15]:
%%time
ndcg_bm25 = calculate_mean_ndcg_score(labels, data=[preprocess_text(text) for text in texts], method='bm25')
ndcg_bm25

CPU times: total: 2min 10s
Wall time: 2min 10s


0.8567717232009066

# 2. Zero-shot Ollama + DSPy

In [23]:
text = tell_me_again_df.unpacked_summary.iloc[23]

In [20]:
lm = dspy.LM('ollama_chat/gemma3:4b', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [21]:
from ollama import chat
from pydantic import BaseModel, Field
from typing import Literal, Optional, List

class Character(BaseModel):
    name: str
    role: Optional[str]
    age: Optional[int]
    backstory: Optional[str]
    beliefs: Optional[list[str]]
    weaknesses: Optional[list[str]]
    strengths: Optional[list[str]]
    motivations: Optional[list[str]]
    character_arc: Optional[str] = Field(None, description="Describes how the character changes over the story")
    relationship: Optional[str]
    #actantial_role: Literal['Subject', 'Object', 'Helper', 'Opponent', 'Sender', 'Receiver'] = None

class Setting(BaseModel):
    time_periods: list[str]
    locations: list[str] 
    cultural_context: list[str]

class Plot(BaseModel):

    #timeline_of_events: list[str] = Field(description = (
    #    "a sequenced list of very brief event descriptions"
    #))SS
    
    conflict: Optional[Literal['person vs. person', 'person vs. self', 'person vs. nature', 'person vs.society']]

    story_exposition: Optional[str]
    story_rising_action: Optional[str]
    story_climax: Optional[str]
    story_falling_action: Optional[str]
    story_resolution: Optional[str]
    
class Theme(BaseModel):
    main_themes: Optional[list[str]]
    secondary_themes: Optional[list[str]]
    morals: Optional[list[str]]
    
class Other(BaseModel):
    main_genres: Optional[list[str]]
    sub_genres: Optional[list[str]]
    content_warnings: Optional[list[str]]
    #writing_style: Optional[str]
    #point_of_view: Optional[Literal['First-person narrator', 'Third-person limited narrator', 'Third-person omniscient narrator']]
    
class Elements(BaseModel):
    characters: list[Character]
    setting: Setting
    plot: Plot
    theme: Theme
    other: Other
    


In [24]:
%%time
response = chat(
    messages=[
    {"role": "system", "content": f"Return as a JSON object"},
    {"role": "user", "content": text}
    ],
    model='gemma3:4b',
    format=Elements.model_json_schema(),
    options = {'temperature':0}
)

elements = Elements.model_validate_json(response.message.content)
print(elements)

CPU times: total: 31.2 ms
Wall time: 14.4 s


In [25]:
text

"Angelina Billard, 80-year-old widow of a colonel, lives in a large villa in Auxerre together with the elderly maid Odile, who is constantly harassed by the despotic lady. When Odile dies in a fall from a ladder, the elderly Angelina is forced to move to Paris to live with her nephew Jean-Pierre. Angelina, believed by all to be gentle and kind, soon reveals her evil and domineering character and begins to mistreat the grandchildren for no reason, as she did with Odile. Jean-Pierre's wife, the nice beautician Catherine, tries in every way to please her aunt and make her feel comfortable, but the woman repays her with all kinds of spite. Jean-Pierre's sister, the naive Jeanne, is also constantly mocked by her aunt, who thinks she's an idiot.\nDuring the summer, the Billards, even to momentarily escape the unbearable presence of the aunt, leave for three weeks by going to a holiday village in Greece and leave the woman in the care of a young caregiver, Sandrine. The girl initially tries t