In [1]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
text = """
Astronomers have used the James Webb Space Telescope to peer back in time to the early days of the universe — and they spotted something unexpected.
The space observatory revealed six massive galaxies that existed between 500 million and 700 million years after the big bang that created the universe. The discovery is completely upending existing theories about the origins of galaxies, according to a new study published Wednesday in the journal Nature.
“These objects are way more massive​ than anyone expected,” said study coauthor Joel Leja, assistant professor of astronomy and astrophysics at Penn State University, in a statement. “We expected only to find tiny, young, baby galaxies at this point in time, but we’ve discovered galaxies as mature as our own in what was previously understood to be the dawn of the universe.”
The telescope observes the universe in infrared light, which is invisible to the human eye, and is capable of detecting the faint light from ancient stars and galaxies. By peering into the distant universe, the observatory can essentially see back in time up to about 13.5 billion years ago. (Scientists have determined the universe is about 13.7 billion years old.)
"""

In [3]:
# Split paragraph into sentences
sentences = text.split('. ')

# Store each sentence as a separate document in the array
documents = []
for sentence in sentences:
    documents.append(sentence.strip())

In [4]:
documents

['Astronomers have used the James Webb Space Telescope to peer back in time to the early days of the universe — and they spotted something unexpected.\nThe space observatory revealed six massive galaxies that existed between 500 million and 700 million years after the big bang that created the universe',
 'The discovery is completely upending existing theories about the origins of galaxies, according to a new study published Wednesday in the journal Nature.\n“These objects are way more massive\u200b than anyone expected,” said study coauthor Joel Leja, assistant professor of astronomy and astrophysics at Penn State University, in a statement',
 '“We expected only to find tiny, young, baby galaxies at this point in time, but we’ve discovered galaxies as mature as our own in what was previously understood to be the dawn of the universe.”\nThe telescope observes the universe in infrared light, which is invisible to the human eye, and is capable of detecting the faint light from ancient st

In [11]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Create document-term matrix
doc_term_matrix = vectorizer.fit_transform(documents)

# Perform K-means clustering
k = 2
km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, verbose=False)
km.fit(doc_term_matrix)

KMeans(max_iter=100, n_clusters=2, n_init=1, verbose=False)

In [14]:
# Get cluster labels and centroids
labels = km.labels_
centroids = km.cluster_centers_

# Get representative sentences for each cluster
representative_sentences = []
for i in range(k):
    cluster_indices = np.where(labels == i)[0]
    cluster_sentences = [documents[idx] for idx in cluster_indices]
    cluster_vector = vectorizer.transform(cluster_sentences)
    similarity_scores = np.asarray(cluster_vector.dot(centroids[i].T)).flatten()
    threshold = np.percentile(similarity_scores, 80) # filter out non-representative sentences
    representative_idx = np.argmax(similarity_scores * (similarity_scores > threshold))
    representative_sentence = cluster_sentences[representative_idx]
    representative_sentences.append(representative_sentence)

In [20]:
def listToString(s):
    str1 = ""
    for ele in s:
        str1 += ele
    return str1

In [21]:
# Post-processing: remove redundant sentences
final_summary = list(set(representative_sentences))

# Print the resulting summary
print(listToString(final_summary))

The discovery is completely upending existing theories about the origins of galaxies, according to a new study published Wednesday in the journal Nature.
“These objects are way more massive​ than anyone expected,” said study coauthor Joel Leja, assistant professor of astronomy and astrophysics at Penn State University, in a statementBy peering into the distant universe, the observatory can essentially see back in time up to about 13.5 billion years ago
