In [46]:
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [47]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [48]:
def preprocess_text(text):
    """
    Preprocess a given text by tokenizing, removing stop words, and lemmatizing the words.
    """
    # tokenize the text into sentences
    sentences = sent_tokenize(text)

    # remove stop words and lemmatize the words in each sentence
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        preprocessed_sentence = " ".join(filtered_words)
        preprocessed_sentences.append(preprocessed_sentence)

    return preprocessed_sentences

In [49]:
def compute_similarity(sentence1, sentence2):
    """
    Compute the similarity score between two sentences using TF-IDF.
    """
    tfidf = TfidfVectorizer().fit_transform([sentence1, sentence2])
    similarity_score = (tfidf * tfidf.T).A[0, 1]
    return similarity_score

In [50]:
def find_minimum_cds(graph):
    """
    Find the minimum Connected Dominating Set (CDS) of a graph using a greedy algorithm.
    """
    cds = set() # initialize CDS to empty set
    nodes = set(graph.nodes()) # get all nodes in the graph

    while nodes:
        max_degree_node = max(nodes, key=lambda n: graph.degree(n)) # find node with highest degree
        cds.add(max_degree_node) # add node to CDS
        nodes.discard(max_degree_node) # remove node from remaining nodes
        neighbors = set(graph.neighbors(max_degree_node)) # get all neighbors of the node
        nodes.difference_update(neighbors) # remove neighbors from remaining nodes

    return cds

In [51]:
def summarize_text(text, summary_size, threshold=0.1):
    """
    Summarize a given text using minimum Connected Dominating Set (CDS).
    """
    # preprocess the text
    preprocessed_sentences = preprocess_text(text)

    # create graph from preprocessed sentences
    graph = nx.Graph()
    for i, sentence in enumerate(preprocessed_sentences):
        for j in range(i+1, len(preprocessed_sentences)):
            similarity_score = compute_similarity(sentence, preprocessed_sentences[j]) # compute similarity score between two sentences
            if similarity_score > threshold:
                graph.add_edge(i, j, weight=similarity_score)

    # find minimum CDS of the graph
    cds = find_minimum_cds(graph)

    # sort the CDS nodes based on their occurrence order in the original text
    summary_nodes = sorted(list(cds))

    # create summary by concatenating the selected sentences
    summary = ". ".join([sent_tokenize(text)[i] for i in summary_nodes][:summary_size])

    return summary

In [52]:
text = """
Astronomers have used the James Webb Space Telescope to peer back in time to the early days of the universe — and they spotted something unexpected.
The space observatory revealed six massive galaxies that existed between 500 million and 700 million years after the big bang that created the universe. The discovery is completely upending existing theories about the origins of galaxies, according to a new study published Wednesday in the journal Nature.
“These objects are way more massive​ than anyone expected,” said study coauthor Joel Leja, assistant professor of astronomy and astrophysics at Penn State University, in a statement. “We expected only to find tiny, young, baby galaxies at this point in time, but we’ve discovered galaxies as mature as our own in what was previously understood to be the dawn of the universe.”
The telescope observes the universe in infrared light, which is invisible to the human eye, and is capable of detecting the faint light from ancient stars and galaxies. By peering into the distant universe, the observatory can essentially see back in time up to about 13.5 billion years ago. (Scientists have determined the universe is about 13.7 billion years old.)
"""

summary_size = 3 # number of sentences in the summary
summary = summarize_text(text, summary_size)

print(summary)


Astronomers have used the James Webb Space Telescope to peer back in time to the early days of the universe — and they spotted something unexpected.. The space observatory revealed six massive galaxies that existed between 500 million and 700 million years after the big bang that created the universe.
