In [62]:
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import numpy as np
import random
from sklearn.preprocessing import normalize
random.seed(42)
import spacy
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import dspy
from nltk.stem import PorterStemmer
nltk.download('punkt')
from datasets import Dataset
nltk.download('stopwords')
from transformers import pipeline
import ast
import json


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def anonymize_with_ner(texts, ner_results):
    new_texts = []
    for text, ents in zip(texts, ner_results):
        name_map = {}
        name_id = 1
        for ent in ents:
            if ent['entity_group'] == 'PER':
                name = ent['word']
                if name not in name_map:
                    name_map[name] = f"name{name_id}"
                    name_id += 1
                text = text.replace(name, name_map[name])
        new_texts.append(text)
    return new_texts

In [3]:
def preprocess_text(text):
    text = text.lower()
    
    text = re.sub(r'[^\w\s]', '', text)
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    
    tokens = [token for token in tokens if token not in stop_words]
    
    ps = PorterStemmer()
    
    tokens = [ps.stem(token) for token in tokens]
    
    return tokens

In [4]:
tell_me_again_df = pd.read_csv("data/tell_me_again.csv")
tell_me_again_df['label'] = tell_me_again_df.groupby('title').ngroup()
tell_me_again_df['property_count'] = tell_me_again_df['label'].map(tell_me_again_df['label'].value_counts())

In [5]:
# only select summaries if there are 2 or more summaries of the same movie
tell_me_again_df = tell_me_again_df[tell_me_again_df.property_count >= 2]

In [6]:
summary_indices = random.sample(list(set(tell_me_again_df.label)),1000)

In [7]:
tell_me_again_df = tell_me_again_df[tell_me_again_df.label.isin(summary_indices)]

In [8]:
ner_pipeline = pipeline("ner", grouped_entities=True)

dataset = Dataset.from_pandas(tell_me_again_df[['unpacked_summary']])

#ner_results = ner_pipeline(dataset['unpacked_summary'], batch_size=16)

#tell_me_again_df['unpacked_summary'] = anonymize_with_ner(dataset['unpacked_summary'], ner_results)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [9]:
tell_me_again_df.unpacked_summary.iloc[1231]

'Leia, a 21-year-old woman, is found by the police after seventeen years of captivity in a cellar. His kidnapper, Benjamin, is immediately thrown into prison. Leia finds her parents, her home, but she has no memory of them or where she lived. Despite the attention of her relatives and the psychological help given to her, Leia is unable to regain her marks. She secretly cuts out all the press clippings about Ben, who has been her only companion for so many years. Paradoxically, she still has feelings for him and fails to bond with her parents.'

In [10]:
labels = tell_me_again_df.label.tolist()
texts = tell_me_again_df.unpacked_summary.tolist()

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [64]:
device

device(type='cuda')

In [146]:
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)
embedder = dspy.Embedder(model.encode)

In [66]:
embeddings = embedder(texts)

NameError: name 'texts' is not defined

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi

def build_index(embeddings, normalize_vectors=True):
    if normalize_vectors:
        return normalize(embeddings, axis=1)
    return embeddings

def get_dense_similarity(query_vector, candidate_vectors):
    return np.dot(candidate_vectors, query_vector.T).flatten()

def get_bm25_scores(corpus, query, bm25=None):
    if bm25 is None:
        bm25 = BM25Okapi([doc for doc in corpus])
    return bm25.get_scores(query)

def get_relevance_scores(i, labels, data, index, k=None, method='dense', bm25=None):
    if k is None:
        k = len(labels)

    query_label = labels[i]
    matching_labels = np.where(np.array(labels) == query_label)[0]

    if method == 'dense':
        query_vector = index[i].reshape(1, -1)
        similarities = get_dense_similarity(query_vector, index)
    elif method == 'bm25':
        query_text = data[i]
        similarities = get_bm25_scores(data, query_text, bm25)
    else:
        raise ValueError("Unsupported method. Use 'dense' or 'bm25'.")

    top_k_indices = np.argsort(similarities)[::-1]
    top_k_indices = top_k_indices[top_k_indices != i][:k]

    relevance_scores = [1 if x in matching_labels else 0 for x in top_k_indices]
    ideal_relevance_scores = sorted(relevance_scores, reverse=True)

    return ideal_relevance_scores, relevance_scores

def calculate_mean_ndcg_score(labels, data, embeddings=None, method='dense'):
    if method == 'dense':
        embeddings = np.array(embeddings).astype('float32')
        index = build_index(embeddings)
        get_index = lambda: index
        bm25 = None
    elif method == 'bm25':
        index = data
        get_index = lambda: index
        bm25 = BM25Okapi([doc for doc in data])
    else:
        raise ValueError("Method must be 'dense' or 'bm25'.")

    ideal_relevance_scores = []
    relevance_scores = []

    for i in [labels.index(x) for x in list(set(labels))]:
        irs, rs = get_relevance_scores(i, labels, data, get_index(), method=method, bm25=bm25)
        ideal_relevance_scores.append(irs)
        relevance_scores.append(rs)

    return ndcg_score(np.array(ideal_relevance_scores), np.array(relevance_scores))


In [16]:
%%time
ndcg_dense = calculate_mean_ndcg_score(labels, data=texts, embeddings=embeddings, method='dense')
ndcg_dense

CPU times: total: 8.75 s
Wall time: 8.62 s


np.float64(0.5915919671919618)

In [17]:
%%time
ndcg_bm25 = calculate_mean_ndcg_score(labels, data=[preprocess_text(text) for text in texts], method='bm25')
ndcg_bm25


KeyboardInterrupt



# 2. Zero-shot Ollama + DSPy

In [160]:
text = anonymize_with_ner(dataset['unpacked_summary'][:5], ner_pipeline(dataset['unpacked_summary'][:5], batch_size=1))[3]

In [161]:
from ollama import chat
from pydantic import BaseModel, Field
from typing import Literal, Optional, List

class MajorCharacter(BaseModel):
    name: str
    role: Optional[str] = Field(None, description="The role of the character, None if not derivable.")
    age: Optional[int] = Field(None, description="The age of the character, None if not derivable.")
    backstory: Optional[str] = Field(None, description="The backstory of the character, None if not derivable.")
    beliefs: Optional[list[str]]
    weaknesses: Optional[list[str]]
    strengths: Optional[list[str]]
    motivations: Optional[list[str]]
    
    #character_arc: Optional[str] = Field(None, description="Describes how the character changes over the story")
    #relationship: Optional[str]
    #actantial_role: Literal['Subject', 'Object', 'Helper', 'Opponent', 'Sender', 'Receiver'] = None

class Setting(BaseModel):
    time_periods: list[str]
    locations: list[str] 
    cultural_context: list[str]

class Plot(BaseModel):

    #timeline_of_events: list[str] = Field(description = (
    #    "a sequenced list of very brief event descriptions"
    #))SS
    
    conflict: Optional[Literal["person vs. person", "person vs. self", "person vs. nature", "person vs.society", "person vs. fate", "person vs. supernatural", "person vs. technology"]]
    basic_plot: Optional[Literal["Overcoming the Monster", "Rags to Riches", "The Quest", "Voyage and Return", "Comedy", "Tragedy", "Rebirth"]] = Field(None, description="Select one of the following basic plot types:"
    "- Overcoming the Monster: The protagonist sets out to defeat an antagonistic force (often evil) which threatens the protagonist and/or protagonist's homeland"
    "- Rags to Riches: The poor protagonist acquires things such as power, wealth, and a mate, before losing it all and gaining it back upon growing as a person."  
    "- The Quest: The protagonist and some companions set out to acquire an important object or to get to a location, facing many obstacles and temptations along the way."
    "- Voyage and Return: The protagonist goes to a strange land and, after overcoming the threats it poses to him or her, returns with nothing but experience."  
    "- Comedy: Light and humorous character with a happy or cheerful ending; a dramatic work in which the central motif is the triumph over adverse circumstance, resulting in a successful or happy conclusion.Booker makes sure to stress that comedy is more than humor. It refers to a pattern where the conflict becomes more and more confusing, but is at last made plain in a single clarifying event. Most romances fall into this category."                                                                                                                                                    
    "- Tragedy: The protagonist is a hero with one major character flaw or great mistake which is ultimately their undoing. Their unfortunate end evokes pity at their folly and the fall of a fundamentally 'good' character."
    "- Rebirth: During the course of the story, an important event forces the main character to change their ways, often making them a better person. "                                                                                                                                                   
                                                                                                                                                       )
    story_exposition: Optional[str]
    story_rising_action: Optional[str]
    story_climax: Optional[str]
    story_falling_action: Optional[str]
    story_resolution: Optional[str]
    
class Theme(BaseModel):
    main_themes: Optional[list[str]]
    secondary_themes: Optional[list[str]]
    morals: Optional[list[str]]
    
class Other(BaseModel):
    main_genres: Optional[list[str]]
    sub_genres: Optional[list[str]]
    content_warnings: Optional[list[str]]
    #writing_style: Optional[str]
    #point_of_view: Optional[Literal['First-person narrator', 'Third-person limited narrator', 'Third-person omniscient narrator']]
    
class Elements(BaseModel):
    major_characters: list[MajorCharacter]
    setting: Setting
    plot: Plot
    theme: Theme
    other: Other
    


In [162]:
#36.5

In [163]:
%%time
response1 = chat(
    messages=[
    {"role": "system", "content": f"Return as a JSON object. If features are not derivable, use None for that feature."},
    {"role": "user", "content": text}
    ],
    model='gemma3:1b-it-qat',
    format=Elements.model_json_schema(),
    #stream = True,
    options = {'temperature':0}
)

elements = Elements.model_validate_json(response1.message.content)
print(elements)

CPU times: total: 0 ns
Wall time: 5.65 s


In [130]:
%%time
response2 = chat(
    messages=[
    {"role": "system", "content": f"Return as a JSON object"},
    {"role": "user", "content": text}
    ],
    model='gemma3:4b-it-qat',
    format=Elements.model_json_schema(),
    options = {'temperature':0}
)

elements2 = Elements.model_validate_json(response2.message.content)
print(elements2)

CPU times: total: 46.9 ms
Wall time: 2min 33s


In [131]:
def create_category_string(category, dictionary):
    string = f"{category}: "
    
    if category in dictionary:
        category_data = dictionary[category]
        if isinstance(category_data, list):
            for item in category_data:
                for k, v in item.items():
                    if isinstance(v, list):
                        v = ', '.join(v)
                    string += f"{k}: {v} "
        elif isinstance(category_data, dict):
            for k, v in category_data.items():
                if isinstance(v, list):
                    v = ', '.join(v)
                string += f"{k}: {v} "
        else:
            string += f"{category}: {category_data} "
    
    return string


def transform_dict_to_string(data):
    full_string = ""

    for category in data:
        full_string += create_category_string(category, data)
        full_string += " "
    
    return full_string.strip()




In [164]:
str1 = transform_dict_to_string(json.loads(response1.message.content))
str2 = transform_dict_to_string(json.loads(response2.message.content))

emb1 = embedder(str1)
emb2 = embedder(str2)

In [165]:
cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))

array([[0.6023717]], dtype=float32)

In [149]:
text

"name1 grows up as an Irish orphan in Cleveland, which is ruled by mob boss name2. Although he is bad at school, he knows how to assert himself. Again and again, his friends come to his aid. So he's working with them early in the docks. And after securing the post of union president by not entirely legal means, he marries the beautiful name3. Life seems to be getting better all the time, because not only is the family growing, but his reputation is rising, and business with the Mafia is getting better all the time. Unfortunately, journalist name4 collects enough evidence against name5 to go to name6 and have him arrested. But before name5 goes to prison, he makes a deal with the FBI to report on the business of organized crime at least once a month.\nAnd even though name5's back in business as a loan shark's debt collector, he doesn't have much to report to the FBI. However, he and his wife have to move to an inferior neighborhood, adding to the strain on their marriage. And while the 

In [166]:
text

"name1 (name2) returns to his small California hometown after seventeen years in prison. When he arrives at his mother name3's (name4) home, he discovers a man in the house and beats him up, thinking it was a break-in. But he turns out to be name3's nurse, who has incurable cancer. Meanwhile, young name5 (name6) steals drugs from the hospital where she works to sell to drug addicts. name7 needs her to buy a wheelchair. The meeting goes badly, with name7's return not being welcomed by the name8 family."