In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Checkpoint name shared by the two from_pretrained() calls below, so the
# summarization model and its tokenizer are always loaded from the same source.
model_name = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)



In [None]:
import spacy

# Load the spaCy pipeline used for sentence splitting and append a
# "sentencizer" component to it. segment_text_by_character() reads this
# notebook-global `nlp` and iterates over the `doc.sents` it produces.
# NOTE(review): en_core_web_sm presumably already assigns sentence boundaries
# through its parser -- confirm the extra sentencizer is actually needed.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("sentencizer")


def segment_text_by_character(text, characters, max_chars=100000):
    """Group the sentences of ``text`` by the character names they mention.

    Args:
        text: Raw text to analyse.
        characters: Iterable of character names to look for (case-sensitive).
        max_chars: Only the first ``max_chars`` characters of ``text`` are
            processed. Pass ``None`` to process the whole text.

    Returns:
        A dict mapping every name in ``characters`` to the list of sentence
        strings (``sent.text`` of the notebook-global ``nlp`` pipeline) that
        mention that name as a whole word. A sentence mentioning several names
        appears under each of them; names never mentioned map to ``[]``.
    """
    import re  # local import so the cell does not depend on another cell's imports

    character_segments = {character: [] for character in characters}

    # Bound the amount of text handed to spaCy (presumably to keep run time and
    # memory manageable for a full-length book).
    if max_chars is not None:
        text = text[:max_chars]

    # Whole-word matchers: a plain substring test would also file
    # "Brandon ..." sentences under "Bran". Lookarounds are used instead of
    # \b so names that start or end with punctuation still match.
    name_patterns = {
        character: re.compile(r"(?<!\w)" + re.escape(character) + r"(?!\w)")
        for character in character_segments
    }

    doc = nlp(text)
    for sent in doc.sents:
        sentence_text = sent.text
        for character, pattern in name_patterns.items():
            if pattern.search(sentence_text):
                character_segments[character].append(sentence_text)
    return character_segments


# Read the whole book up front. The context manager closes the handle as soon
# as the text is in memory, and the explicit encoding keeps the curly quotes in
# the book from depending on the platform's default codec.
with open("./../books/asoif/Book 1 - A Game of Thrones.txt", "r", encoding="utf-8") as file:
    text = file.read()

characters = ["Tyrion", "Jon", "Arya", "Sansa", "Bran", "Cersei", "Jaime", "Daenerys", "Robert", "Ned"]

# Example usage
character_segments = segment_text_by_character(text, characters)
print(character_segments)

{'Tyrion': ['The tall boy beside him could only be the crown prince, and that stunted little man behind them was surely the Imp, Tyrion Lannister.\n\n', 'Tyrion Lannister, the youngest of Lord Tywin’s brood and by far the ugliest.', 'All that the gods had given to Cersei and Jaime, they had denied Tyrion.'], 'Jon': ['Robb and Jon sat tall and still on their horses, with Bran between them on his pony, trying to seem older than seven, trying to pretend that he’d seen all this before.', 'Bran’s bastard brother Jon Snow moved closer.', '“Ass,” Jon muttered, low enough so Greyjoy did not hear.', '“You did well,” Jon told him solemnly.', 'Jon was fourteen, an old hand at justice.\n\n', '“No,” Jon Snow said quietly.', 'Jon’s eyes were a grey so dark they seemed almost black, but there was little they did not see.', 'Jon was slender where Robb was muscular, dark where Robb was fair, graceful and quick where his half brother was strong and fast.\n\n', 'Race you to the bridge?”\n\n“Done,” Jon sa

In [None]:
from tqdm import tqdm


def summarize_text(text, model, tokenizer, max_length=512, min_length=100):
    """Summarize one piece of text with a single beam-search generation pass.

    Args:
        text: Text to summarize. The tokenizer is called with
            ``max_length=1024`` and ``truncation=True``.
        model: Object exposing ``generate(input_ids, **options)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

    # Generation settings gathered in one place so they are easy to scan.
    generation_options = {
        "max_length": max_length,
        "min_length": min_length,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(encoded["input_ids"], **generation_options)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)


def _split_into_chunks(text, max_chunk_length):
    """Cut ``text`` into pieces of at most ``max_chunk_length`` characters, ending each piece on its last period when it has one."""
    chunks = []
    start = 0
    while start < len(text):
        window = text[start : start + max_chunk_length]
        last_period = window.rfind(".")
        # Prefer to end on a sentence boundary; otherwise take the whole window.
        end = start + (last_period + 1 if last_period != -1 else len(window))
        chunk = text[start:end]
        # Whitespace-only pieces (typically the tail after the final period)
        # carry nothing to summarize; feeding them to the model would force it
        # to invent at least ``min_length`` tokens of text.
        if chunk.strip():
            chunks.append(chunk)
        start = end
    return chunks


def recursive_summarization(text, model, tokenizer, max_chunk_length=1024, summary_length=512, min_summary_length=100):
    """Summarize text of any length by repeatedly summarizing chunk summaries.

    While the text tokenizes to more than ``max_chunk_length`` tokens it is
    split into chunks, every chunk is summarized with ``summarize_text``, and
    the chunk summaries are joined with spaces to form the next round's text.
    Once the text fits, it is summarized one final time and returned.

    Note that ``max_chunk_length`` is used both as the token limit for the
    "fits in one pass" check and as the *character* size of the chunks.

    Args:
        text: Text to summarize.
        model: Model forwarded to ``summarize_text``.
        tokenizer: Tokenizer used for the length check and forwarded to
            ``summarize_text``.
        max_chunk_length: Token limit / chunk size in characters (see above).
        summary_length: ``max_length`` for every ``summarize_text`` call.
        min_summary_length: ``min_length`` for every ``summarize_text`` call.

    Returns:
        The final summary string.

    Raises:
        ValueError: If ``max_chunk_length`` is smaller than 1 (chunking could
            never make progress).
    """
    if max_chunk_length < 1:
        raise ValueError("max_chunk_length must be at least 1")

    # Iterating (instead of recursing) keeps the call stack flat no matter how
    # many rounds a long text needs.
    while True:
        # No truncation here on purpose: the real token count is what decides
        # whether the text fits in a single pass.
        token_count = tokenizer(text, return_tensors="pt").input_ids.shape[1]
        if token_count <= max_chunk_length:
            return summarize_text(text, model, tokenizer, max_length=summary_length, min_length=min_summary_length)

        # Otherwise, split the text into chunks and summarize each chunk.
        chunks = _split_into_chunks(text, max_chunk_length)
        summaries = [
            summarize_text(chunk, model, tokenizer, max_length=summary_length, min_length=min_summary_length)
            for chunk in tqdm(chunks)
        ]
        combined_summary = " ".join(summaries)

        # Summaries are bounded in tokens while chunks are cut in characters,
        # so a round is not guaranteed to shrink the text. If it did not, stop
        # looping and let summarize_text (which truncates its input) produce
        # the final answer rather than cycling forever.
        if len(combined_summary) >= len(text):
            return summarize_text(
                combined_summary, model, tokenizer, max_length=summary_length, min_length=min_summary_length
            )
        text = combined_summary


# Example usage
# Join every sentence collected under the "Jon" key into one string and
# summarize it with the notebook-global `model` and `tokenizer`.
long_text = " ".join(character_segments["Jon"])
final_summary = recursive_summarization(long_text, model, tokenizer)
print(final_summary)

Remaining text length: 4717
Remaining text length: 3771
Remaining text length: 2928
Remaining text length: 1908
Remaining text length: 989


100%|██████████| 5/5 [00:42<00:00,  8.48s/it]


Robb and Jon sat tall and still on their horses, with Bran between them on his pony, trying to pretend that he’d seen all this before. Jon was slender where Robb was muscular. Robb was fair, graceful and quick where his half brother was strong and fast. ‘Robb says the man died bravely, but Jon says he was afraid’ “What do you think?” Bran gave the pup a quick nervous stroke, then turned as Jon said, “Here you go.”


In [None]:
import nltk
import spacy
from nltk import ne_chunk, pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize

# Ensure you have downloaded the necessary NLTK data files
# (resources for the sent_tokenize / word_tokenize / pos_tag / ne_chunk calls
# made by the functions in this cell).
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("maxent_ne_chunker")
nltk.download("words")

# Load spaCy model
# This rebinds the notebook-global `nlp` to a freshly loaded pipeline; no
# sentencizer is added to this instance.
nlp = spacy.load("en_core_web_sm")

# Add neuralcoref to spaCy's pipeline
# neuralcoref.add_to_pipe(nlp)


def extract_entities(sent):
    """Return the PERSON-labelled chunks found in one sentence.

    The sentence is word-tokenized, POS-tagged and passed through ``ne_chunk``
    (with ``binary=False``). Every top-level node that has a ``label()`` equal
    to ``"PERSON"`` contributes one string: the first item of each of its
    leaves, joined with single spaces.

    Args:
        sent: Sentence text.

    Returns:
        List of entity strings, in the order the chunker produced them.
    """
    tagged_tokens = pos_tag(word_tokenize(sent))
    tree = ne_chunk(tagged_tokens, binary=False)
    # Nodes without a label() are skipped by the hasattr check.
    return [
        " ".join(leaf[0] for leaf in node)
        for node in tree
        if hasattr(node, "label") and node.label() == "PERSON"
    ]


def extract_relations(doc):
    """Collect (subject, verb, object) text triples from a parsed document.

    For every token whose dependency label is ``"nsubj"`` and whose head has
    part of speech ``"VERB"``, the head's children are searched for ``"dobj"``
    dependents. If at least one exists, the triple (subject text, verb text,
    text of the first such child) is recorded.

    Args:
        doc: Parsed document exposing ``sents``; each sentence yields tokens
            with ``dep_``, ``text`` and ``head`` (which has ``pos_``, ``text``
            and ``children``).

    Returns:
        List of 3-tuples of strings, in document order.
    """
    triples = []
    for sentence in doc.sents:
        for tok in sentence:
            if tok.dep_ != "nsubj":
                continue
            governor = tok.head
            if governor.pos_ != "VERB":
                continue
            # Only the first direct object of the verb is used.
            direct_object = next((kid for kid in governor.children if kid.dep_ == "dobj"), None)
            if direct_object is not None:
                triples.append((tok.text, governor.text, direct_object.text))
    return triples


def extract_facts(text):
    """Extract person names and subject-verb-object relations from ``text``.

    Names come from running ``extract_entities`` on every sentence returned by
    ``sent_tokenize``; relations come from running ``extract_relations`` on the
    document produced by the notebook-global ``nlp`` pipeline for the whole
    text.

    Args:
        text: Raw text to analyse.

    Returns:
        A dict with two keys:
        ``"characters"`` -- the distinct entity strings, sorted so the result
        is identical on every run;
        ``"relations"`` -- whatever ``extract_relations`` returned.
    """
    person_names = set()
    for sentence in sent_tokenize(text):
        person_names.update(extract_entities(sentence))

    # Sort instead of list(set(...)): the iteration order of a set of strings
    # changes between interpreter sessions (string hashing is randomized), so
    # an unsorted list would not be reproducible across kernel restarts.
    character_entities = sorted(person_names)

    doc = nlp(text)
    relations = extract_relations(doc)

    return {"characters": character_entities, "relations": relations}


# Read at most the first 1,000,000 characters of the book. The context manager
# closes the handle once the text is in memory, and the explicit encoding keeps
# the book's non-ASCII punctuation independent of the platform default codec.
with open("./../books/asoif/Book 1 - A Game of Thrones.txt", "r", encoding="utf-8") as file:
    text = file.read()[:1000000]

facts = extract_facts(text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/janvasiljevic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/janvasiljevic/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/janvasiljevic/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/janvasiljevic/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer, pipeline

nltk.download("punkt")

# Read at most the first 1,000,000 characters of the book; the context manager
# closes the handle and the explicit encoding avoids platform-dependent decoding.
with open("./../books/asoif/Book 1 - A Game of Thrones.txt", "r", encoding="utf-8") as file:
    text = file.read()[:1000000]

sentences = sent_tokenize(text)

# Use a DistilBERT checkpoint that was fine-tuned for extractive QA. The plain
# "distilbert-base-uncased" checkpoint has no trained QA head (its qa_outputs
# weights are randomly initialized on load), so its answers and scores are noise.
QA_CHECKPOINT = "distilbert-base-uncased-distilled-squad"

# Distinct names so the summarization cell's `model` / `tokenizer` globals are
# not silently replaced by the QA model when this cell runs.
qa_tokenizer = DistilBertTokenizer.from_pretrained(QA_CHECKPOINT)
qa_model = DistilBertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

# Create a QA pipeline
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)


def answer_question(question, context):
    """Ask the notebook-global ``qa_pipeline`` one question about one context.

    Args:
        question: Question text.
        context: Text the answer should be extracted from.

    Returns:
        Whatever ``qa_pipeline`` returns for the question/context pair.
    """
    payload = {"question": question, "context": context}
    return qa_pipeline(payload)


# Fit a TF-IDF vectorizer (English stop words removed) on the book's sentences.
# retrieve_relevant_sentences() scores a question against X and uses the
# resulting positions to index `sentences`, so X and `sentences` must stay
# aligned: re-run this cell whenever `sentences` changes.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(sentences)


def retrieve_relevant_sentences(question, sentences, top_n=15):
    """Return the sentences whose TF-IDF vectors are most similar to a question.

    The question is vectorized with the notebook-global ``vectorizer`` and
    compared by cosine similarity against the notebook-global matrix ``X``.
    The positions of the highest scores are then used to index ``sentences``,
    so ``sentences`` is expected to be the list ``X`` was built from.

    Args:
        question: Question text.
        sentences: Sequence indexed by the score positions.
        top_n: Size of the tail taken from the ascending score ranking.

    Returns:
        List of selected sentences, highest similarity first.
    """
    question_vector = vectorizer.transform([question])
    similarity = cosine_similarity(question_vector, X).flatten()

    # argsort ranks ascending, so the best matches sit at the end; take that
    # tail and reverse it to get best-first order.
    ascending = np.argsort(similarity)
    best_first = ascending[-top_n:][::-1]

    ranked_sentences = []
    for position in best_first:
        ranked_sentences.append(sentences[position])
    return ranked_sentences


# Questions to answer from the book text.
questions = ["Where does Jon live?", "Who is Jon's father?", "What is Jon's real name?"]

# For each question: retrieve candidate sentences by TF-IDF similarity, run the
# QA model on every candidate separately, and report the single candidate
# answer with the highest score.
for question in questions:
    relevant_sentences = retrieve_relevant_sentences(question, sentences)
    # (answer text, score, source sentence) for every candidate sentence
    answers = []
    for sentence in relevant_sentences:
        answer = answer_question(question, sentence)
        answers.append((answer["answer"], answer["score"], sentence))
    # Select the best answer based on the score
    best_answer = max(answers, key=lambda x: x[1])
    print(f"Question: {question}")
    print(f"Answer: {best_answer[0]}")
    print(f"Context: {best_answer[2]}")
    print(f"Score: {best_answer[1]}")
    print("\n")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/janvasiljevic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: Where does Jon live?
Answer: live.
Context: Let her live.
Score: 0.0536450631916523


Question: Who is Jon's father?
Answer: Father was alone.
Context: Father was alone.
Score: 0.06143534928560257


Question: What is Jon's real name?
Answer: It felt real enough.
Context: It felt real enough.
Score: 0.040221136063337326


