<a href="https://colab.research.google.com/github/Nuette/SceneDetection/blob/main/RuleBasedSceneDetection_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spacy coreferee
!python -m spacy download en_core_web_lg
!python -m coreferee install en

Collecting coreferee
  Downloading coreferee-1.4.1-py3-none-any.whl.metadata (2.5 kB)
Collecting spacy
  Downloading spacy-3.5.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting thinc<8.2.0,>=8.1.8 (from spacy)
  Downloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting typer<0.10.0,>=0.3.0 (from spacy)
  Downloading typer-0.9.4-py3-none-any.whl.metadata (14 kB)
Collecting pathy>=0.10.0 (from spacy)
  Downloading pathy-0.11.0-py3-none-any.whl.metadata (16 kB)
Collecting smart-open<7.0.0,>=5.2.1 (from spacy)
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 (from spacy)
  Downloading pydantic-1.10.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (152 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.2/152.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting pathlib-abc==0.1.1 (from pa

In [2]:
import spacy
import coreferee

# Load the spaCy model
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe('coreferee')


<coreferee.manager.CorefereeBroker at 0x7e8a3f79f280>

In [3]:
file_path = 'The_adventures_of_the_Italian_nobleman.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()


Function to detect characters using coreferee

In [4]:
def get_character_mentions(doc):
    character_mentions = {}
    for chain in doc._.coref_chains.chains:
        main_mention = chain[0]
        main_mention_span = doc[main_mention.token_indexes[0]:main_mention.token_indexes[-1] + 1]

        if main_mention_span.root.pos_ == "PROPN":
            mentions = []
            for mention in chain:
                mention_span = doc[mention.token_indexes[0]:mention.token_indexes[-1] + 1]
                mentions.append(mention_span.text)
            character_mentions[main_mention_span.text] = mentions
    return character_mentions

Function to detect place

In [5]:
def get_place_mentions(doc):
    places = set(ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"])
    return places

Function to identify time mentions

In [6]:
def get_time_mentions(doc):
    times = set(ent.text for ent in doc.ents if ent.label_ in ["DATE", "TIME"])
    return times

Function to detect summaries or scenic descriptions

In [7]:
def is_summary_or_description(doc):
    adjectives = [token for token in doc if token.pos_ == "ADJ"]
    action_verbs = [token for token in doc if token.pos_ == "VERB" and token.tag_ in ["VBD", "VB"]]

    # Heuristic: If there are more adjectives than action verbs, it's likely a description
    return len(adjectives) > len(action_verbs) * 2

Replace coreferent mentions only at the token level to avoid replacing parts of words


In [8]:
# Function to replace coreferent mentions using coreferee
def replace_coreferences(doc):
    coref_resolved_text = []
    for token in doc:
        # Check if the token is part of a coreference chain
        if token._.coref_chains:
            # Find the first coreference chain this token belongs to
            for chain in doc._.coref_chains.chains:
                if token.i in chain:
                    # Replace with the main mention of the chain
                    main_entity = doc[chain[0].token_indexes[0]:chain[0].token_indexes[-1] + 1]
                    coref_resolved_text.append(main_entity.text)
                    break
        else:
            coref_resolved_text.append(token.text)
    return " ".join(coref_resolved_text)

# Apply the spaCy pipeline and segment text using a sliding window



In [9]:
window_size = 5

previous_characters = set()
previous_time = set()
previous_place = set()

character_memory = set()
character_memory_window_size = 3  # Number of windows to retain characters. Experiment with this untill a good output is given.
segments = []
current_segment = []
current_type = 'scene'

sentences = [sent.text.strip() for sent in nlp(text).sents]
i = 0
while i < len(sentences):
    window = sentences[i:i + window_size]
    window_text = " ".join(window)
    window_doc = nlp(window_text)

    resolved_window_text = replace_coreferences(window_doc)
    window_doc_resolved = nlp(resolved_window_text)

    # Extract characters, time, and places
    window_characters = {ent.text for ent in window_doc_resolved.ents if ent.label_ == "PERSON"}
    window_times = get_time_mentions(window_doc_resolved)
    window_places = get_place_mentions(window_doc_resolved)

    print(f"Times detected in Window {i // window_size + 1}: {window_times}")
    print(f"Places detected in Window {i // window_size + 1}: {window_places}")

    # Check if the current window contains a non-scene
    summary_detected = is_summary_or_description(window_doc_resolved)

    if summary_detected:
        print(f"Non-Scene (Summary/Description) detected in Window {i // window_size + 1}")

    character_memory.update(window_characters)
    if len(character_memory) > character_memory_window_size:
        character_memory = set(list(character_memory)[-character_memory_window_size:])

    # Check for scene boundary or non-scene detection
    if (not window_characters.issubset(character_memory) or
        window_times != previous_time or
        window_places != previous_place or
        summary_detected):

        if current_segment:
            segments.append({
                'text': " ".join(current_segment),
                'type': current_type
            })
            current_segment = []

        # If the current window is a non-scene
        if summary_detected:
            current_type = 'non-scene'
        else:
            current_type = 'scene'

        # Clear character memory if it's a completely new scene
        character_memory.clear()

    # Add the current window text to the current segment
    current_segment.append(resolved_window_text)

    # Update values
    previous_characters = window_characters
    previous_time = window_times
    previous_place = window_places

    i += window_size

if current_segment:
    segments.append({
        'text': " ".join(current_segment),
        'type': current_type
    })


Times detected in Window 1: {'one particular evening', 'early June'}
Places detected in Window 1: set()
Non-Scene (Summary/Description) detected in Window 1
Times detected in Window 2: set()
Places detected in Window 2: set()
Times detected in Window 3: set()
Places detected in Window 3: set()
Times detected in Window 4: set()
Places detected in Window 4: set()
Times detected in Window 5: set()
Places detected in Window 5: set()
Times detected in Window 6: {'a few weeks ago'}
Places detected in Window 6: set()
Times detected in Window 7: set()
Places detected in Window 7: set()
Times detected in Window 8: set()
Places detected in Window 8: {'St. John ’s'}
Times detected in Window 9: set()
Places detected in Window 9: set()
Times detected in Window 10: {'about half an hour ago'}
Places detected in Window 10: set()
Times detected in Window 11: set()
Places detected in Window 11: set()
Times detected in Window 12: set()
Places detected in Window 12: set()
Times detected in Window 13: set(

# Display the segmented scenes and non-scenes



In [10]:
# Display the segmented scenes and non-scenes
for i, segment in enumerate(segments, 1):
    segment_type = segment['type']
    print(f"\nSegment {i} ({segment_type.capitalize()}):\n{segment['text']}\n")


Segment 1 (Non-scene):
and I had many friends and acquaintances of an informal nature . Amongst these was to be numbered Dr. Hawker , a near neighbour of ours , and a member of the medical profession . It was the genial doctor ’s habit to drop in sometimes of an evening and have a chat with , of whose genius was an ardent admirer . The , frank and unsuspicious to the last degree , admired the talents so far removed from own . On one particular evening in early June , arrived about half - past eight and settled down to a comfortable discussion on the cheery topic of the prevalence of arsenical poisoning in crimes .


Segment 2 (Scene):
It must have been about a quarter of an hour later when the door of our sitting - room flew open , and a distracted precipitated into the room . “ Oh , doctor , you ’re wanted ! Such a terrible . gave me a turn , did indeed . ” I recognized in our new visitor Dr. Hawker ’s housekeeper , Miss Rider . The doctor was a bachelor , and lived in a gloomy old h