In [1]:
import random
import spacy
import gensim
from gensim.models.phrases import Phrases, Phraser

from gensim import models, corpora
from gensim import similarities

from collections import Counter


In [2]:
nlp = spacy.load("en_core_web_md")

In [3]:
balzac_text = open("documents/HonoreDeBalzac.txt").read()

balzac_doc = nlp(balzac_text)

# Pre-processing function for SpaCy docs
The preprocess_and_filter function is designed to enhance the pre-processing of SpaCy documents by combining lemmatization and filtering based on linguistic characteristics. It removes stop words, punctuation, and non-alphabetic tokens while maintaining only those classified as nouns, verbs, or adjectives. This function is particularly useful for refining textual data in a way that retains essential linguistic information, ensuring that only meaningful words contribute to subsequent analyses. By employing this function on SpaCy documents, users can achieve cleaner and more focused text representations, conducive to various natural language processing tasks.

In [4]:
def preprocess_and_filter(doc):
    # Perform pre-processing and filtering tasks
    processed_text = " ".join(token.lemma_ if token.lemma_ != '-PRON-' else token.text for token in doc if
                              not token.is_stop and
                              not token.is_space and
                              token.pos_ in ['NOUN', 'VERB', 'ADJ', 'PROPN'])
    return processed_text


processed_balzac = preprocess_and_filter(balzac_doc)

In [5]:
print("Processed Balzac Text:", processed_balzac)



# Create a word dictionary
* A corpora.Dictionary is created to represent the vocabulary of the processed documents.

In [6]:
tokenized_text = processed_balzac.split()

word_frequencies = Counter(tokenized_text)

# Create a list of documents (each document is a list of words)
documents = [[word] for word in tokenized_text]

# Create a dictionary from the documents
dictionary = corpora.Dictionary(documents)

for word, idx in dictionary.token2id.items():
    freq = word_frequencies[word]
    print(f"Word: {word}, Index: {idx}, Frequency: {freq} occurrences")


Word: inner, Index: 0, Frequency: 1 occurrences
Word: self, Index: 1, Frequency: 3 occurrences
Word: phenomenon, Index: 2, Frequency: 1 occurrences
Word: vision, Index: 3, Frequency: 2 occurrences
Word: locomotion, Index: 4, Frequency: 1 occurrences
Word: know, Index: 5, Frequency: 13 occurrences
Word: time, Index: 6, Frequency: 17 occurrences
Word: abolish, Index: 7, Frequency: 1 occurrences
Word: Space, Index: 8, Frequency: 1 occurrences
Word: mode, Index: 9, Frequency: 1 occurrences
Word: Time, Index: 10, Frequency: 1 occurrences
Word: intellectual, Index: 11, Frequency: 1 occurrences
Word: physical, Index: 12, Frequency: 1 occurrences
Word: --HISTORY, Index: 13, Frequency: 1 occurrences
Word: LOUIS, Index: 14, Frequency: 1 occurrences
Word: LAMBERT, Index: 15, Frequency: 1 occurrences
Word: November, Index: 16, Frequency: 1 occurrences
Word: evening, Index: 17, Frequency: 9 occurrences
Word: year, Index: 18, Frequency: 3 occurrences
Word: principal, Index: 19, Frequency: 3 occurren

In [7]:
def sentence_weighting(sentence, dictionary):
    # Tokenize the sentence using SpaCy
    tokens = sentence.split()

    # Calculate the total weight of the sentence based on word weights
    sentence_weight = sum(dictionary.token2id.get(token, 0) for token in tokens)

    return sentence, sentence_weight


sentences = processed_balzac.split('.')

# Calculate sentence weights 
sentence_weights = [sentence_weighting(sentence, dictionary) for sentence in sentences if sentence.strip()]

for sentence, weight in sentence_weights:
    print(f"Sentence: {sentence}\nWeight: {weight}\n")



Sentence: inner self phenomenon vision locomotion know time abolish Space mode Time intellectual physical --HISTORY LOUIS LAMBERT November evening year principal citizen Carentan assemble Mme 
Weight: 276

Sentence:  de Dey drawing room Mme 
Weight: 129

Sentence:  de Dey hold reception _ night week unwonted interest attach evening gathering owe certain circumstance pass unnoticed great city small country town excite great curiosity day Mme de Dey home visitor previous evening door shut ground indisposition event ordinary time produce Carentan sensation Paris know night performance theater existence sort incomplete time indiscretion aristocrat matter life death conduct Mme 
Weight: 2590

Sentence:  de Dey likely bring disastrous consequence position Carentan clear reader appreciate expression keen curiosity cunning fanaticism countenance norman citizen importance lady play day Revolution pass crisis difficult moment sympathy reader fill coloring picture Mme de Dey widow Lieutenant Gene

# Sentence Importance

In [8]:
def sentence_weighting(sentence, dictionary):
    # Tokenize the sentence using spaCy
    tokens = nlp(sentence)

    # Calculate the total weight of the sentence based on word weights
    sentence_weight = sum(dictionary.token2id.get(token.lemma_, 0) for token in tokens)

    return sentence, sentence_weight


sentence_weights = [sentence_weighting(sent.text, dictionary) for sent in balzac_doc.sents]

In [9]:
for sentence, weight in sentence_weights:
    print(f"Sentence: {sentence}\nWeight: {weight}\n")


Sentence: [The inner self] ... by a phenomenon of vision or of locomotion has been
known at times to abolish Space in its two modes of Time and Distance--
the one intellectual, the other physical.

Weight: 102

Sentence: --HISTORY OF LOUIS LAMBERT.

Weight: 66

Sentence: On a November evening in the year 1793 the principal citizens of
Carentan were assembled in Mme. de Dey's drawing-room.
Weight: 677

Sentence: Mme. de Dey
held this _reception_ every night of the week, but an unwonted interest
attached to this evening's gathering, owing to certain circumstances
which would have passed altogether unnoticed in a great city, though in a
small country town they excited the greatest curiosity.
Weight: 1032

Sentence: For two days
before Mme. de Dey had not been at home to her visitors, and on the
previous evening her door had been shut, on the ground of indisposition.

Weight: 567

Sentence: Two such events at any ordinary time would have produced in Carentan
the same sensation that Paris k

# Ngrams
* To extract the most used bi-grams and tri-grams from our sentences we will use the gensim.models.phrases.Phrases class. This class takes a list of sentences as input and returns a list of sentences, with common bigrams and trigrams merged into single tokens.

In [10]:
# Tokenize sentences in the original Balzac document
sentences = [list(map(str, sent)) for sent in balzac_doc.sents]

# Train the Phrases model to detect bi-grams and tri-grams
bigram_model = Phrases(sentences, min_count=5, threshold=10)
trigram_model = Phrases(bigram_model[sentences], min_count=5, threshold=10)

# Create a Phraser object for efficient n-gram transformation
bigram_phraser = Phraser(bigram_model)
trigram_phraser = Phraser(trigram_model)

# Apply bi-gram and tri-gram transformation to sentences
bigram_sentences = bigram_phraser[sentences]
trigram_sentences = trigram_phraser[bigram_phraser[sentences]]

print("Most used bi-grams:", bigram_model.export_phrases())
print("Most used tri-grams:", trigram_model.export_phrases())


Most used bi-grams: {'Mme_.': 22.42105263157895, '._de': 21.798245614035086, 'de_Dey': 166.43364197530863, "Dey_'s": 23.193333333333335, 'would_have': 24.15972222222222, 'had_been': 19.274238227146814, ';_but': 14.056565656565656, 'she_had': 18.557965860597438, "mother_'s": 11.596666666666668, 'her_son': 15.17557251908397, "Countess_'s": 11.596666666666668, '"_I': 10.269028041324058, 'I_shall': 32.064516129032256, 'old_merchant': 74.41711229946523, '?_"': 13.992961287078932, '"_cried': 16.241830065359476}
Most used tri-grams: {'would_have': 58.36666666666667, 'had_been': 23.823129251700678, ';_but': 23.529675251959684, 'she_had': 39.70521541950114, "mother_'s": 31.54954954954955, 'her_son': 45.927868852459014, "Countess_'s": 22.27027027027027, '"_I': 25.116803278688526, 'old_merchant': 350.20000000000005, '?_"': 57.40983606557377, '"_cried': 71.76229508196721}


In [11]:
def summarizer_function(text, num_lines=None, percentage=None):
    sentence_weights = [sentence_weighting(sent.text, dictionary) for sent in nlp(text).sents]

    # Sort sentences by weight in descending order
    sorted_sentences = sorted(sentence_weights, key=lambda x: x[1], reverse=True)

    # Select a subset of sentences based on the specified criteria
    if num_lines:
        summary_sentences = sorted_sentences[:num_lines]
    elif percentage:
        summary_size = int(len(sorted_sentences) * (percentage / 100))
        summary_sentences = sorted_sentences[:summary_size]
    else:
        raise ValueError("Specify either 'num_lines' or 'percentage' parameter.")

    # Sort the selected sentences based on their order in the original text
    final_summary = sorted(summary_sentences, key=lambda x: text.find(x[0]))

    # Extract the summary text
    summary_text = "\n".join(sentence for sentence, _ in final_summary)

    return summary_text


summary = summarizer_function(processed_balzac, percentage=20)

In [12]:
summary

'de Dey son kith kin world human earth bind fear hope joy life late Comte Dey race wife sole heiress descendant house worldly ambition family consideration noble craving soul combine heighten Countess sentiment strong woman heart child dearer infinite care succeed rear man estate medical science predict death score time hold presentiment hope know inexpressible joy watch pass peril infancy see constitution strengthen spite decree Faculty thank constant care boy grow develop year age regard accomplished gentleman Court Versailles final happiness crown mother effort son worship deep sympathy kindred soul bind natural sacred tie feel friendship meet man age young Count receive appointment sub - lieutenant regiment dragoon point honor follow emigrant Princes exile Mme Dey face danger cruel position rich noble mother Emigrant desire look son great fortune deny happiness read rigorous law virtue Republic confiscate property Emigrants Carentan congratulate courageous course take keep watch we